Re: Global snapshots

Started by Andrey Lepikhovover 5 years ago56 messages

Andrey Lepikhov

a.lepikhov@postgrespro.ru

over 5 years ago

3 attachment(s)

Rebased onto current master (fb544735f1).

--
Andrey Lepikhov
Postgres Professional
https://postgrespro.com
The Russian Postgres Company

Attachments:

0001-GlobalCSNLog-SLRU-v3.patchtext/x-patch; charset=UTF-8; name=0001-GlobalCSNLog-SLRU-v3.patchDownload

From 29183c42a8ae31b830ab5af0dfcfdaadd6229700 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepikhov@postgrespro.ru>
Date: Tue, 12 May 2020 08:29:54 +0500
Subject: [PATCH 1/3] GlobalCSNLog-SLRU-v3

---
 src/backend/access/transam/Makefile         |   1 +
 src/backend/access/transam/global_csn_log.c | 439 ++++++++++++++++++++
 src/backend/access/transam/twophase.c       |   1 +
 src/backend/access/transam/varsup.c         |   2 +
 src/backend/access/transam/xlog.c           |  12 +
 src/backend/storage/ipc/ipci.c              |   3 +
 src/backend/storage/ipc/procarray.c         |   3 +
 src/backend/storage/lmgr/lwlocknames.txt    |   1 +
 src/backend/tcop/postgres.c                 |   1 +
 src/backend/utils/misc/guc.c                |   9 +
 src/backend/utils/probes.d                  |   2 +
 src/bin/initdb/initdb.c                     |   3 +-
 src/include/access/global_csn_log.h         |  30 ++
 src/include/storage/lwlock.h                |   1 +
 src/include/utils/snapshot.h                |   3 +
 15 files changed, 510 insertions(+), 1 deletion(-)
 create mode 100644 src/backend/access/transam/global_csn_log.c
 create mode 100644 src/include/access/global_csn_log.h

diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 595e02de72..60ff8b141e 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -15,6 +15,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	clog.o \
 	commit_ts.o \
+	global_csn_log.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/global_csn_log.c b/src/backend/access/transam/global_csn_log.c
new file mode 100644
index 0000000000..6f7fded350
--- /dev/null
+++ b/src/backend/access/transam/global_csn_log.c
@@ -0,0 +1,439 @@
+/*-----------------------------------------------------------------------------
+ *
+ * global_csn_log.c
+ *		Track global commit sequence numbers of finished transactions
+ *
+ * Implementation of cross-node transaction isolation relies on commit sequence
+ * number (CSN) based visibility rules.  This module provides SLRU to store
+ * CSN for each transaction.  This mapping need to be kept only for xid's
+ * greater then oldestXid, but that can require arbitrary large amounts of
+ * memory in case of long-lived transactions.  Because of same lifetime and
+ * persistancy requirements this module is quite similar to subtrans.c
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/global_csn_log.c
+ *
+ *-----------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/global_csn_log.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/snapmgr.h"
+
+bool track_global_snapshots;
+
+/*
+ * Defines for GlobalCSNLog page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * GlobalCSNLog page numbering also wraps around at
+ * 0xFFFFFFFF/GLOBAL_CSN_LOG_XACTS_PER_PAGE, and GlobalCSNLog segment numbering at
+ * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateGlobalCSNLog (see GlobalCSNLogPagePrecedes).
+ */
+
+/* We store the commit GlobalCSN for each xid */
+#define GCSNLOG_XACTS_PER_PAGE (BLCKSZ / sizeof(GlobalCSN))
+
+#define TransactionIdToPage(xid)	((xid) / (TransactionId) GCSNLOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) GCSNLOG_XACTS_PER_PAGE)
+
+/*
+ * Link to shared-memory data structures for CLOG control
+ */
+static SlruCtlData GlobalCSNLogCtlData;
+#define GlobalCsnlogCtl (&GlobalCSNLogCtlData)
+
+static int	ZeroGlobalCSNLogPage(int pageno);
+static bool GlobalCSNLogPagePrecedes(int page1, int page2);
+static void GlobalCSNLogSetPageStatus(TransactionId xid, int nsubxids,
+									  TransactionId *subxids,
+									  GlobalCSN csn, int pageno);
+static void GlobalCSNLogSetCSNInSlot(TransactionId xid, GlobalCSN csn,
+									  int slotno);
+
+/*
+ * GlobalCSNLogSetCSN
+ *
+ * Record GlobalCSN of transaction and its subtransaction tree.
+ *
+ * xid is a single xid to set status for. This will typically be the top level
+ * transactionid for a top level commit or abort. It can also be a
+ * subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * csn is the commit sequence number of the transaction. It should be
+ * AbortedGlobalCSN for abort cases.
+ */
+void
+GlobalCSNLogSetCSN(TransactionId xid, int nsubxids,
+					 TransactionId *subxids, GlobalCSN csn)
+{
+	int			pageno;
+	int			i = 0;
+	int			offset = 0;
+
+	/* Callers of GlobalCSNLogSetCSN() must check GUC params */
+	Assert(track_global_snapshots);
+
+	Assert(TransactionIdIsValid(xid));
+
+	pageno = TransactionIdToPage(xid);		/* get page of parent */
+	for (;;)
+	{
+		int			num_on_page = 0;
+
+		while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno)
+		{
+			num_on_page++;
+			i++;
+		}
+
+		GlobalCSNLogSetPageStatus(xid,
+							num_on_page, subxids + offset,
+							csn, pageno);
+		if (i >= nsubxids)
+			break;
+
+		offset = i;
+		pageno = TransactionIdToPage(subxids[offset]);
+		xid = InvalidTransactionId;
+	}
+}
+
+/*
+ * Record the final state of transaction entries in the csn log for
+ * all entries on a single page.  Atomic only on this page.
+ *
+ * Otherwise API is same as TransactionIdSetTreeStatus()
+ */
+static void
+GlobalCSNLogSetPageStatus(TransactionId xid, int nsubxids,
+						   TransactionId *subxids,
+						   GlobalCSN csn, int pageno)
+{
+	int			slotno;
+	int			i;
+
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	slotno = SimpleLruReadPage(GlobalCsnlogCtl, pageno, true, xid);
+
+	/* Subtransactions first, if needed ... */
+	for (i = 0; i < nsubxids; i++)
+	{
+		Assert(GlobalCsnlogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+		GlobalCSNLogSetCSNInSlot(subxids[i],	csn, slotno);
+	}
+
+	/* ... then the main transaction */
+	if (TransactionIdIsValid(xid))
+		GlobalCSNLogSetCSNInSlot(xid, csn, slotno);
+
+	GlobalCsnlogCtl->shared->page_dirty[slotno] = true;
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ */
+static void
+GlobalCSNLogSetCSNInSlot(TransactionId xid, GlobalCSN csn, int slotno)
+{
+	int			entryno = TransactionIdToPgIndex(xid);
+	GlobalCSN *ptr;
+
+	Assert(LWLockHeldByMe(GlobalCSNLogControlLock));
+
+	ptr = (GlobalCSN *) (GlobalCsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr));
+
+	*ptr = csn;
+}
+
+/*
+ * Interrogate the state of a transaction in the log.
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; TransactionIdGetGlobalCSN() in global_snapshot.c is the
+ * intended caller.
+ */
+GlobalCSN
+GlobalCSNLogGetCSN(TransactionId xid)
+{
+	int			pageno = TransactionIdToPage(xid);
+	int			entryno = TransactionIdToPgIndex(xid);
+	int			slotno;
+	GlobalCSN *ptr;
+	GlobalCSN	global_csn;
+
+	/* Callers of GlobalCSNLogGetCSN() must check GUC params */
+	Assert(track_global_snapshots);
+
+	/* Can't ask about stuff that might not be around anymore */
+	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
+
+	/* lock is acquired by SimpleLruReadPage_ReadOnly */
+
+	slotno = SimpleLruReadPage_ReadOnly(GlobalCsnlogCtl, pageno, xid);
+	ptr = (GlobalCSN *) (GlobalCsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr));
+	global_csn = *ptr;
+
+	LWLockRelease(GlobalCSNLogControlLock);
+
+	return global_csn;
+}
+
+/*
+ * Number of shared GlobalCSNLog buffers.
+ */
+static Size
+GlobalCSNLogShmemBuffers(void)
+{
+	return Min(32, Max(4, NBuffers / 512));
+}
+
+/*
+ * Reserve shared memory for GlobalCsnlogCtl.
+ */
+Size
+GlobalCSNLogShmemSize(void)
+{
+	if (!track_global_snapshots)
+		return 0;
+
+	return SimpleLruShmemSize(GlobalCSNLogShmemBuffers(), 0);
+}
+
+/*
+ * Initialization of shared memory for GlobalCSNLog.
+ */
+void
+GlobalCSNLogShmemInit(void)
+{
+	if (!track_global_snapshots)
+		return;
+
+	GlobalCsnlogCtl->PagePrecedes = GlobalCSNLogPagePrecedes;
+	SimpleLruInit(GlobalCsnlogCtl, "GlobalCSNLog Ctl", GlobalCSNLogShmemBuffers(), 0,
+				  GlobalCSNLogControlLock, "pg_global_csn", LWTRANCHE_GLOBAL_CSN_LOG_BUFFERS);
+}
+
+/*
+ * This func must be called ONCE on system install.  It creates the initial
+ * GlobalCSNLog segment.  The pg_global_csn directory is assumed to have been
+ * created by initdb, and GlobalCSNLogShmemInit must have been called already.
+ */
+void
+BootStrapGlobalCSNLog(void)
+{
+	int			slotno;
+
+	if (!track_global_snapshots)
+		return;
+
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	/* Create and zero the first page of the commit log */
+	slotno = ZeroGlobalCSNLogPage(0);
+
+	/* Make sure it's written out */
+	SimpleLruWritePage(GlobalCsnlogCtl, slotno);
+	Assert(!GlobalCsnlogCtl->shared->page_dirty[slotno]);
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * Initialize (or reinitialize) a page of GlobalCSNLog to zeroes.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroGlobalCSNLogPage(int pageno)
+{
+	Assert(LWLockHeldByMe(GlobalCSNLogControlLock));
+	return SimpleLruZeroPage(GlobalCsnlogCtl, pageno);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ *
+ * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
+ * if there are none.
+ */
+void
+StartupGlobalCSNLog(TransactionId oldestActiveXID)
+{
+	int			startPage;
+	int			endPage;
+
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * Since we don't expect pg_global_csn to be valid across crashes, we
+	 * initialize the currently-active page(s) to zeroes during startup.
+	 * Whenever we advance into a new page, ExtendGlobalCSNLog will likewise
+	 * zero the new page without regard to whatever was previously on disk.
+	 */
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	startPage = TransactionIdToPage(oldestActiveXID);
+	endPage = TransactionIdToPage(XidFromFullTransactionId(ShmemVariableCache->nextFullXid));
+
+	while (startPage != endPage)
+	{
+		(void) ZeroGlobalCSNLogPage(startPage);
+		startPage++;
+		/* must account for wraparound */
+		if (startPage > TransactionIdToPage(MaxTransactionId))
+			startPage = 0;
+	}
+	(void) ZeroGlobalCSNLogPage(startPage);
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend shutdown
+ */
+void
+ShutdownGlobalCSNLog(void)
+{
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * Flush dirty GlobalCSNLog pages to disk.
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely as a debugging aid.
+	 */
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_START(false);
+	SimpleLruFlush(GlobalCsnlogCtl, false);
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_DONE(false);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointGlobalCSNLog(void)
+{
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * Flush dirty GlobalCSNLog pages to disk.
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely to improve the odds that writing of dirty pages is done by
+	 * the checkpoint process and not by backends.
+	 */
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_START(true);
+	SimpleLruFlush(GlobalCsnlogCtl, true);
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_DONE(true);
+}
+
+/*
+ * Make sure that GlobalCSNLog has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty clog or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendGlobalCSNLog(TransactionId newestXact)
+{
+	int			pageno;
+
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * No work except at first XID of a page.  But beware: just after
+	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
+	 */
+	if (TransactionIdToPgIndex(newestXact) != 0 &&
+		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
+		return;
+
+	pageno = TransactionIdToPage(newestXact);
+
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	/* Zero the page and make an XLOG entry about it */
+	ZeroGlobalCSNLogPage(pageno);
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * Remove all GlobalCSNLog segments before the one holding the passed
+ * transaction ID.
+ *
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest TransactionXmin of any running transaction.
+ */
+void
+TruncateGlobalCSNLog(TransactionId oldestXact)
+{
+	int			cutoffPage;
+
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * The cutoff point is the start of the segment containing oldestXact. We
+	 * pass the *page* containing oldestXact to SimpleLruTruncate. We step
+	 * back one transaction to avoid passing a cutoff page that hasn't been
+	 * created yet in the rare case that oldestXact would be the first item on
+	 * a page and oldestXact == next XID.  In that case, if we didn't subtract
+	 * one, we'd trigger SimpleLruTruncate's wraparound detection.
+	 */
+	TransactionIdRetreat(oldestXact);
+	cutoffPage = TransactionIdToPage(oldestXact);
+
+	SimpleLruTruncate(GlobalCsnlogCtl, cutoffPage);
+}
+
+/*
+ * Decide which of two GlobalCSNLog page numbers is "older" for truncation
+ * purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */
+static bool
+GlobalCSNLogPagePrecedes(int page1, int page2)
+{
+	TransactionId xid1;
+	TransactionId xid2;
+
+	xid1 = ((TransactionId) page1) * GCSNLOG_XACTS_PER_PAGE;
+	xid1 += FirstNormalTransactionId;
+	xid2 = ((TransactionId) page2) * GCSNLOG_XACTS_PER_PAGE;
+	xid2 += FirstNormalTransactionId;
+
+	return TransactionIdPrecedes(xid1, xid2);
+}
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 2f7d4ed59a..0ecc02a3dd 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 2570e7086a..882bc66825 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -15,6 +15,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -173,6 +174,7 @@ GetNewTransactionId(bool isSubXact)
 	 * Extend pg_subtrans and pg_commit_ts too.
 	 */
 	ExtendCLOG(xid);
+	ExtendGlobalCSNLog(xid);
 	ExtendCommitTs(xid);
 	ExtendSUBTRANS(xid);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d3d670928..285d9d442e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -24,6 +24,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/heaptoast.h"
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
@@ -5345,6 +5346,7 @@ BootStrapXLOG(void)
 
 	/* Bootstrap the commit log, too */
 	BootStrapCLOG();
+	BootStrapGlobalCSNLog();
 	BootStrapCommitTs();
 	BootStrapSUBTRANS();
 	BootStrapMultiXact();
@@ -7054,6 +7056,7 @@ StartupXLOG(void)
 			 * maintained during recovery and need not be started yet.
 			 */
 			StartupCLOG();
+			StartupGlobalCSNLog(oldestActiveXID);
 			StartupSUBTRANS(oldestActiveXID);
 
 			/*
@@ -7871,6 +7874,7 @@ StartupXLOG(void)
 	if (standbyState == STANDBY_DISABLED)
 	{
 		StartupCLOG();
+		StartupGlobalCSNLog(oldestActiveXID);
 		StartupSUBTRANS(oldestActiveXID);
 	}
 
@@ -8518,6 +8522,7 @@ ShutdownXLOG(int code, Datum arg)
 		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
 	}
 	ShutdownCLOG();
+	ShutdownGlobalCSNLog();
 	ShutdownCommitTs();
 	ShutdownSUBTRANS();
 	ShutdownMultiXact();
@@ -9090,7 +9095,10 @@ CreateCheckPoint(int flags)
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
 	if (!RecoveryInProgress())
+	{
 		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+		TruncateGlobalCSNLog(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+	}
 
 	/* Real work is done, but log and update stats before releasing lock. */
 	LogCheckpointEnd(false);
@@ -9166,6 +9174,7 @@ static void
 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
 	CheckPointCLOG();
+	CheckPointGlobalCSNLog();
 	CheckPointCommitTs();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
@@ -9450,7 +9459,10 @@ CreateRestartPoint(int flags)
 	 * this because StartupSUBTRANS hasn't been called yet.
 	 */
 	if (EnableHotStandby)
+	{
 		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+		TruncateGlobalCSNLog(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+	}
 
 	/* Real work is done, but log and update before releasing lock. */
 	LogCheckpointEnd(true);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 427b0d59cd..dc2d2959c4 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -16,6 +16,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -125,6 +126,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, ProcGlobalShmemSize());
 		size = add_size(size, XLOGShmemSize());
 		size = add_size(size, CLOGShmemSize());
+		size = add_size(size, GlobalCSNLogShmemSize());
 		size = add_size(size, CommitTsShmemSize());
 		size = add_size(size, SUBTRANSShmemSize());
 		size = add_size(size, TwoPhaseShmemSize());
@@ -213,6 +215,7 @@ CreateSharedMemoryAndSemaphores(void)
 	 */
 	XLOGShmemInit();
 	CLOGShmemInit();
+	GlobalCSNLogShmemInit();
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 363000670b..8ae4906474 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -46,6 +46,7 @@
 #include <signal.h>
 
 #include "access/clog.h"
+#include "access/global_csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -835,6 +836,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
 	{
 		ExtendSUBTRANS(latestObservedXid);
+		ExtendGlobalCSNLog(latestObservedXid);
 		TransactionIdAdvance(latestObservedXid);
 	}
 	TransactionIdRetreat(latestObservedXid);	/* = running->nextXid - 1 */
@@ -3337,6 +3339,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
 		while (TransactionIdPrecedes(next_expected_xid, xid))
 		{
 			TransactionIdAdvance(next_expected_xid);
+			ExtendGlobalCSNLog(next_expected_xid);
 			ExtendSUBTRANS(next_expected_xid);
 		}
 		Assert(next_expected_xid == xid);
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index db47843229..fe18c93b61 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -49,3 +49,4 @@ MultiXactTruncationLock				41
 OldSnapshotTimeMapLock				42
 LogicalRepWorkerLock				43
 CLogTruncationLock					44
+GlobalCSNLogControlLock				45
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 802b1ec22f..d0bb64870c 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -42,6 +42,7 @@
 #include "catalog/pg_type.h"
 #include "commands/async.h"
 #include "commands/prepare.h"
+#include "common/hashfn.h"
 #include "executor/spi.h"
 #include "jit/jit.h"
 #include "libpq/libpq.h"
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 53a6cd2436..0ca331c6f9 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1175,6 +1175,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_global_snapshots", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable global snapshot tracking."),
+			gettext_noop("Used to achieve REPEATEBLE READ isolation level for postgres_fdw transactions.")
+		},
+		&track_global_snapshots,
+		true, /* XXX: set true to simplify tesing. XXX2: Seems that RESOURCES_MEM isn't the best catagory */
+		NULL, NULL, NULL
+	},
 	{
 		{"ssl", PGC_SIGHUP, CONN_AUTH_SSL,
 			gettext_noop("Enables SSL connections."),
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d
index a0b0458108..f900e7f3b4 100644
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -77,6 +77,8 @@ provider postgresql {
 	probe clog__checkpoint__done(bool);
 	probe subtrans__checkpoint__start(bool);
 	probe subtrans__checkpoint__done(bool);
+	probe globalcsnlog__checkpoint__start(bool);
+	probe globalcsnlog__checkpoint__done(bool);
 	probe multixact__checkpoint__start(bool);
 	probe multixact__checkpoint__done(bool);
 	probe twophase__checkpoint__start();
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index a6577486ce..d0afab9d33 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -220,7 +220,8 @@ static const char *const subdirs[] = {
 	"pg_xact",
 	"pg_logical",
 	"pg_logical/snapshots",
-	"pg_logical/mappings"
+	"pg_logical/mappings",
+	"pg_global_csn"
 };
 
 
diff --git a/src/include/access/global_csn_log.h b/src/include/access/global_csn_log.h
new file mode 100644
index 0000000000..417c26c8a3
--- /dev/null
+++ b/src/include/access/global_csn_log.h
@@ -0,0 +1,30 @@
+/*
+ * global_csn_log.h
+ *
+ * Commit-Sequence-Number log.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/global_csn_log.h
+ */
+#ifndef CSNLOG_H
+#define CSNLOG_H
+
+#include "access/xlog.h"
+#include "utils/snapshot.h"
+
+extern void GlobalCSNLogSetCSN(TransactionId xid, int nsubxids,
+							   TransactionId *subxids, GlobalCSN csn);
+extern GlobalCSN GlobalCSNLogGetCSN(TransactionId xid);
+
+extern Size GlobalCSNLogShmemSize(void);
+extern void GlobalCSNLogShmemInit(void);
+extern void BootStrapGlobalCSNLog(void);
+extern void StartupGlobalCSNLog(TransactionId oldestActiveXID);
+extern void ShutdownGlobalCSNLog(void);
+extern void CheckPointGlobalCSNLog(void);
+extern void ExtendGlobalCSNLog(TransactionId newestXact);
+extern void TruncateGlobalCSNLog(TransactionId oldestXact);
+
+#endif   /* CSNLOG_H */
\ No newline at end of file
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8fda8e4f78..c303042663 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -198,6 +198,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_CLOG_BUFFERS = NUM_INDIVIDUAL_LWLOCKS,
 	LWTRANCHE_COMMITTS_BUFFERS,
 	LWTRANCHE_SUBTRANS_BUFFERS,
+	LWTRANCHE_GLOBAL_CSN_LOG_BUFFERS,
 	LWTRANCHE_MXACTOFFSET_BUFFERS,
 	LWTRANCHE_MXACTMEMBER_BUFFERS,
 	LWTRANCHE_ASYNC_BUFFERS,
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 4796edb63a..57d2dfaa67 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -20,6 +20,9 @@
 #include "storage/buf.h"
 
 
+typedef uint64 GlobalCSN;
+extern bool track_global_snapshots;
+
 /*
  * The different snapshot types.  We use SnapshotData structures to represent
  * both "regular" (MVCC) snapshots and "special" snapshots that have non-MVCC
-- 
2.17.1

0002-Global-snapshots-v3.patchtext/x-patch; charset=UTF-8; name=0002-Global-snapshots-v3.patchDownload

From 25a5288764e9e70a0a61a4a1b32111ce8b29c966 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepikhov@postgrespro.ru>
Date: Tue, 12 May 2020 08:30:46 +0500
Subject: [PATCH 2/3] Global-snapshots-v3

---
 src/backend/access/transam/Makefile           |   1 +
 src/backend/access/transam/global_snapshot.c  | 755 ++++++++++++++++++
 src/backend/access/transam/twophase.c         | 156 ++++
 src/backend/access/transam/xact.c             |  29 +
 src/backend/access/transam/xlog.c             |   2 +
 src/backend/storage/ipc/ipci.c                |   3 +
 src/backend/storage/ipc/procarray.c           |  92 ++-
 src/backend/storage/lmgr/lwlocknames.txt      |   1 +
 src/backend/storage/lmgr/proc.c               |   5 +
 src/backend/utils/misc/guc.c                  |  13 +-
 src/backend/utils/misc/postgresql.conf.sample |   2 +
 src/backend/utils/time/snapmgr.c              | 167 +++-
 src/include/access/global_snapshot.h          |  72 ++
 src/include/access/twophase.h                 |   1 +
 src/include/catalog/pg_proc.dat               |  14 +
 src/include/datatype/timestamp.h              |   3 +
 src/include/fmgr.h                            |   1 +
 src/include/portability/instr_time.h          |  10 +
 src/include/storage/proc.h                    |  15 +
 src/include/storage/procarray.h               |   8 +
 src/include/utils/snapmgr.h                   |   3 +
 src/include/utils/snapshot.h                  |   8 +
 22 files changed, 1354 insertions(+), 7 deletions(-)
 create mode 100644 src/backend/access/transam/global_snapshot.c
 create mode 100644 src/include/access/global_snapshot.h

diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 60ff8b141e..6de567a79b 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -16,6 +16,7 @@ OBJS = \
 	clog.o \
 	commit_ts.o \
 	global_csn_log.o \
+	global_snapshot.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/global_snapshot.c b/src/backend/access/transam/global_snapshot.c
new file mode 100644
index 0000000000..bac16828bb
--- /dev/null
+++ b/src/backend/access/transam/global_snapshot.c
@@ -0,0 +1,755 @@
+/*-------------------------------------------------------------------------
+ *
+ * global_snapshot.c
+ *		Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/global_snapshot.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/global_csn_log.h"
+#include "access/global_snapshot.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "portability/instr_time.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/snapmgr.h"
+#include "miscadmin.h"
+
+/* Raise a warning if imported global_csn exceeds ours by this value. */
+#define SNAP_DESYNC_COMPLAIN (1*NSECS_PER_SEC) /* 1 second */
+
+/*
+ * GlobalSnapshotState
+ *
+ * Do not trust local clocks to be strictly monotonical and save last acquired
+ * value so later we can compare next timestamp with it. Accessed through
+ * GlobalSnapshotGenerate() and GlobalSnapshotSync().
+ */
+typedef struct
+{
+	GlobalCSN		 last_global_csn;
+	volatile slock_t lock;
+} GlobalSnapshotState;
+
+static GlobalSnapshotState *gsState;
+
+
+/*
+ * GUC to delay advance of oldestXid for this amount of time. Also determines
+ * the size GlobalSnapshotXidMap circular buffer.
+ */
+int global_snapshot_defer_time;
+
+/*
+ * Enables this module.
+ */
+extern bool track_global_snapshots;
+
+/*
+ * GlobalSnapshotXidMap
+ *
+ * To be able to install global snapshot that points to past we need to keep
+ * old versions of tuples and therefore delay advance of oldestXid.  Here we
+ * keep track of correspondence between snapshot's global_csn and oldestXid
+ * that was set at the time when the snapshot was taken.  Much like the
+ * snapshot too old's OldSnapshotControlData does, but with finer granularity
+ * to seconds.
+ *
+ * Different strategies can be employed to hold oldestXid (e.g. we can track
+ * oldest global_csn-based snapshot among cluster nodes and map it oldestXid
+ * on each node) but here implemented one that tries to avoid cross-node
+ * communications which are tricky in case of postgres_fdw.
+ *
+ * On each snapshot acquisition GlobalSnapshotMapXmin() is called and stores
+ * correspondence between current global_csn and oldestXmin in a sparse way:
+ * global_csn is rounded to seconds (and here we use the fact that global_csn
+ * is just a timestamp) and oldestXmin is stored in the circular buffer where
+ * rounded global_csn acts as an offset from current circular buffer head.
+ * Size of the circular buffer is controlled by global_snapshot_defer_time GUC.
+ *
+ * When global snapshot arrives from different node we check that its
+ * global_csn is still in our map, otherwise we'll error out with "snapshot too
+ * old" message.  If global_csn is successfully mapped to oldestXid we move
+ * backend's pgxact->xmin to proc->originalXmin and fill pgxact->xmin to
+ * mapped oldestXid.  That way GetOldestXmin() can take into account backends
+ * with imported global snapshot and old tuple versions will be preserved.
+ *
+ * Also while calculating oldestXmin for our map in presence of imported
+ * global snapshots we should use proc->originalXmin instead of pgxact->xmin
+ * that was set during import.  Otherwise, we can create a feedback loop:
+ * xmin's of imported global snapshots were calculated using our map and new
+ * entries in map going to be calculated based on that xmin's, and there is
+ * a risk to stuck forever with one non-increasing oldestXmin.  All other
+ * callers of GetOldestXmin() are using pgxact->xmin so the old tuple versions
+ * are preserved.
+ */
+typedef struct GlobalSnapshotXidMap
+{
+	int				 head;				/* offset of current freshest value */
+	int				 size;				/* total size of circular buffer */
+	GlobalCSN_atomic last_csn_seconds;	/* last rounded global_csn that changed
+										 * xmin_by_second[] */
+	TransactionId   *xmin_by_second;	/* circular buffer of oldestXmin's */
+}
+GlobalSnapshotXidMap;
+
+static GlobalSnapshotXidMap *gsXidMap;
+
+
+/* Estimate shared memory space needed */
+Size
+GlobalSnapshotShmemSize(void)
+{
+	Size	size = 0;
+
+	if (track_global_snapshots || global_snapshot_defer_time > 0)
+	{
+		size += MAXALIGN(sizeof(GlobalSnapshotState));
+	}
+
+	if (global_snapshot_defer_time > 0)
+	{
+		size += sizeof(GlobalSnapshotXidMap);
+		size += global_snapshot_defer_time*sizeof(TransactionId);
+		size = MAXALIGN(size);
+	}
+
+	return size;
+}
+
+/* Init shared memory structures */
+void
+GlobalSnapshotShmemInit()
+{
+	bool found;
+
+	if (track_global_snapshots || global_snapshot_defer_time > 0)
+	{
+		gsState = ShmemInitStruct("gsState",
+								sizeof(GlobalSnapshotState),
+								&found);
+		if (!found)
+		{
+			gsState->last_global_csn = 0;
+			SpinLockInit(&gsState->lock);
+		}
+	}
+
+	if (global_snapshot_defer_time > 0)
+	{
+		gsXidMap = ShmemInitStruct("gsXidMap",
+								   sizeof(GlobalSnapshotXidMap),
+								   &found);
+		if (!found)
+		{
+			int i;
+
+			pg_atomic_init_u64(&gsXidMap->last_csn_seconds, 0);
+			gsXidMap->head = 0;
+			gsXidMap->size = global_snapshot_defer_time;
+			gsXidMap->xmin_by_second =
+							ShmemAlloc(sizeof(TransactionId)*gsXidMap->size);
+
+			for (i = 0; i < gsXidMap->size; i++)
+				gsXidMap->xmin_by_second[i] = InvalidTransactionId;
+		}
+	}
+}
+
+/*
+ * GlobalSnapshotStartup
+ *
+ * Set gsXidMap entries to oldestActiveXID during startup.
+ */
+void
+GlobalSnapshotStartup(TransactionId oldestActiveXID)
+{
+	/*
+	 * Run only if we have initialized shared memory and gsXidMap
+	 * is enabled.
+	 */
+	if (IsNormalProcessingMode() &&
+		track_global_snapshots && global_snapshot_defer_time > 0)
+	{
+		int i;
+
+		Assert(TransactionIdIsValid(oldestActiveXID));
+		for (i = 0; i < gsXidMap->size; i++)
+			gsXidMap->xmin_by_second[i] = oldestActiveXID;
+		ProcArraySetGlobalSnapshotXmin(oldestActiveXID);
+	}
+}
+
+/*
+ * GlobalSnapshotMapXmin
+ *
+ * Maintain circular buffer of oldestXmins for several seconds in past. This
+ * buffer allows to shift oldestXmin in the past when backend is importing
+ * global transaction. Otherwise old versions of tuples that were needed for
+ * this transaction can be recycled by other processes (vacuum, HOT, etc).
+ *
+ * Locking here is not trivial. Called upon each snapshot creation after
+ * ProcArrayLock is released. Such usage creates several race conditions. It
+ * is possible that backend who got global_csn called GlobalSnapshotMapXmin()
+ * only after other backends managed to get snapshot and complete
+ * GlobalSnapshotMapXmin() call, or even committed. This is safe because
+ *
+ *		* We already hold our xmin in MyPgXact, so our snapshot will not be
+ *		  harmed even though ProcArrayLock is released.
+ *
+ *		* snapshot_global_csn is always pessmistically rounded up to the next
+ *		  second.
+ *
+ *		* For performance reasons, xmin value for particular second is filled
+ *		  only once. Because of that instead of writing to buffer just our
+ *		  xmin (which is enough for our snapshot), we bump oldestXmin there --
+ *		  it mitigates the possibility of damaging someone else's snapshot by
+ *		  writing to the buffer too advanced value in case of slowness of
+ *		  another backend who generated csn earlier, but didn't manage to
+ *		  insert it before us.
+ *
+ *		* if GlobalSnapshotMapXmin() founds a gap in several seconds between
+ *		  current call and latest completed call then it should fill that gap
+ *		  with latest known values instead of new one. Otherwise it is
+ *		  possible (however highly unlikely) that this gap also happend
+ *		  between taking snapshot and call to GlobalSnapshotMapXmin() for some
+ *		  backend. And we are at risk to fill circullar buffer with
+ *		  oldestXmin's that are bigger then they actually were.
+ */
+void
+GlobalSnapshotMapXmin(GlobalCSN snapshot_global_csn)
+{
+	int offset, gap, i;
+	GlobalCSN csn_seconds;
+	GlobalCSN last_csn_seconds;
+	volatile TransactionId oldest_deferred_xmin;
+	TransactionId current_oldest_xmin, previous_oldest_xmin;
+
+	/* Callers should check config values */
+	Assert(global_snapshot_defer_time > 0);
+	Assert(gsXidMap != NULL);
+
+	/*
+	 * Round up global_csn to the next second -- pessimistically and safely.
+	 */
+	csn_seconds = (snapshot_global_csn / NSECS_PER_SEC + 1);
+
+	/*
+	 * Fast-path check. Avoid taking exclusive GlobalSnapshotXidMapLock lock
+	 * if oldestXid was already written to xmin_by_second[] for this rounded
+	 * global_csn.
+	 */
+	if (pg_atomic_read_u64(&gsXidMap->last_csn_seconds) >= csn_seconds)
+		return;
+
+	/* Ok, we have new entry (or entries) */
+	LWLockAcquire(GlobalSnapshotXidMapLock, LW_EXCLUSIVE);
+
+	/* Re-check last_csn_seconds under lock */
+	last_csn_seconds = pg_atomic_read_u64(&gsXidMap->last_csn_seconds);
+	if (last_csn_seconds >= csn_seconds)
+	{
+		LWLockRelease(GlobalSnapshotXidMapLock);
+		return;
+	}
+	pg_atomic_write_u64(&gsXidMap->last_csn_seconds, csn_seconds);
+
+	/*
+	 * Count oldest_xmin.
+	 *
+	 * It was possible to calculate oldest_xmin during corresponding snapshot
+	 * creation, but GetSnapshotData() intentionally reads only PgXact, but not
+	 * PgProc. And we need info about originalXmin (see comment to gsXidMap)
+	 * which is stored in PgProc because of threats in comments around PgXact
+	 * about extending it with new fields. So just calculate oldest_xmin again,
+	 * that anyway happens quite rarely.
+	 */
+	current_oldest_xmin = GetOldestXmin(NULL, PROCARRAY_NON_IMPORTED_XMIN);
+
+	previous_oldest_xmin = gsXidMap->xmin_by_second[gsXidMap->head];
+
+	Assert(TransactionIdIsNormal(current_oldest_xmin));
+	Assert(TransactionIdIsNormal(previous_oldest_xmin) || !track_global_snapshots);
+
+	gap = csn_seconds - last_csn_seconds;
+	offset = csn_seconds % gsXidMap->size;
+
+	/* Sanity check before we update head and gap */
+	Assert( gap >= 1 );
+	Assert( (gsXidMap->head + gap) % gsXidMap->size == offset );
+
+	gap = gap > gsXidMap->size ? gsXidMap->size : gap;
+	gsXidMap->head = offset;
+
+	/* Fill new entry with current_oldest_xmin */
+	gsXidMap->xmin_by_second[offset] = current_oldest_xmin;
+
+	/*
+	 * If we have gap then fill it with previous_oldest_xmin for reasons
+	 * outlined in comment above this function.
+	 */
+	for (i = 1; i < gap; i++)
+	{
+		offset = (offset + gsXidMap->size - 1) % gsXidMap->size;
+		gsXidMap->xmin_by_second[offset] = previous_oldest_xmin;
+	}
+
+	oldest_deferred_xmin =
+		gsXidMap->xmin_by_second[ (gsXidMap->head + 1) % gsXidMap->size ];
+
+	LWLockRelease(GlobalSnapshotXidMapLock);
+
+	/*
+	 * Advance procArray->global_snapshot_xmin after we released
+	 * GlobalSnapshotXidMapLock. Since we gather not xmin but oldestXmin, it
+	 * never goes backwards regardless of how slow we can do that.
+	 */
+	Assert(TransactionIdFollowsOrEquals(oldest_deferred_xmin,
+										ProcArrayGetGlobalSnapshotXmin()));
+	ProcArraySetGlobalSnapshotXmin(oldest_deferred_xmin);
+}
+
+
+/*
+ * GlobalSnapshotToXmin
+ *
+ * Get oldestXmin that took place when snapshot_global_csn was taken.
+ */
+TransactionId
+GlobalSnapshotToXmin(GlobalCSN snapshot_global_csn)
+{
+	TransactionId xmin;
+	GlobalCSN csn_seconds;
+	volatile GlobalCSN last_csn_seconds;
+
+	/* Callers should check config values */
+	Assert(global_snapshot_defer_time > 0);
+	Assert(gsXidMap != NULL);
+
+	/* Round down to get conservative estimates */
+	csn_seconds = (snapshot_global_csn / NSECS_PER_SEC);
+
+	LWLockAcquire(GlobalSnapshotXidMapLock, LW_SHARED);
+	last_csn_seconds = pg_atomic_read_u64(&gsXidMap->last_csn_seconds);
+	if (csn_seconds > last_csn_seconds)
+	{
+		/* we don't have entry for this global_csn yet, return latest known */
+		xmin = gsXidMap->xmin_by_second[gsXidMap->head];
+	}
+	else if (last_csn_seconds - csn_seconds < gsXidMap->size)
+	{
+		/* we are good, retrieve value from our map */
+		Assert(last_csn_seconds % gsXidMap->size == gsXidMap->head);
+		xmin = gsXidMap->xmin_by_second[csn_seconds % gsXidMap->size];
+	}
+	else
+	{
+		/* requested global_csn is too old, let caller know */
+		xmin = InvalidTransactionId;
+	}
+	LWLockRelease(GlobalSnapshotXidMapLock);
+
+	return xmin;
+}
+
+/*
+ * GlobalSnapshotGenerate
+ *
+ * Generate GlobalCSN which is actually a local time. Also we are forcing
+ * this time to be always increasing. Since now it is not uncommon to have
+ * millions of read transactions per second we are trying to use nanoseconds
+ * if such time resolution is available.
+ */
+GlobalCSN
+GlobalSnapshotGenerate(bool locked)
+{
+	instr_time	current_time;
+	GlobalCSN	global_csn;
+
+	Assert(track_global_snapshots || global_snapshot_defer_time > 0);
+
+	/*
+	 * TODO: create some macro that add small random shift to current time.
+	 */
+	INSTR_TIME_SET_CURRENT(current_time);
+	global_csn = (GlobalCSN) INSTR_TIME_GET_NANOSEC(current_time);
+
+	/* TODO: change to atomics? */
+	if (!locked)
+		SpinLockAcquire(&gsState->lock);
+
+	if (global_csn <= gsState->last_global_csn)
+		global_csn = ++gsState->last_global_csn;
+	else
+		gsState->last_global_csn = global_csn;
+
+	if (!locked)
+		SpinLockRelease(&gsState->lock);
+
+	return global_csn;
+}
+
+/*
+ * GlobalSnapshotSync
+ *
+ * Due to time desynchronization on different nodes we can receive global_csn
+ * which is greater than global_csn on this node. To preserve proper isolation
+ * this node needs to wait when such global_csn comes on local clock.
+ *
+ * This should happend relatively rare if nodes have running NTP/PTP/etc.
+ * Complain if wait time is more than SNAP_SYNC_COMPLAIN.
+ */
+void
+GlobalSnapshotSync(GlobalCSN remote_gcsn)
+{
+	GlobalCSN	local_gcsn;
+	GlobalCSN	delta;
+
+	Assert(track_global_snapshots);
+
+	for(;;)
+	{
+		SpinLockAcquire(&gsState->lock);
+		if (gsState->last_global_csn > remote_gcsn)
+		{
+			/* Everything is fine */
+			SpinLockRelease(&gsState->lock);
+			return;
+		}
+		else if ((local_gcsn = GlobalSnapshotGenerate(true)) >= remote_gcsn)
+		{
+			/*
+			 * Everything is fine too, but last_global_csn wasn't updated for
+			 * some time.
+			 */
+			SpinLockRelease(&gsState->lock);
+			return;
+		}
+		SpinLockRelease(&gsState->lock);
+
+		/* Okay we need to sleep now */
+		delta = remote_gcsn - local_gcsn;
+		if (delta > SNAP_DESYNC_COMPLAIN)
+			ereport(WARNING,
+				(errmsg("remote global snapshot exceeds ours by more than a second"),
+				 errhint("Consider running NTPd on servers participating in global transaction")));
+
+		/* TODO: report this sleeptime somewhere? */
+		pg_usleep((long) (delta/NSECS_PER_USEC));
+
+		/*
+		 * Loop that checks to ensure that we actually slept for specified
+		 * amount of time.
+		 */
+	}
+
+	Assert(false); /* Should not happend */
+	return;
+}
+
+/*
+ * TransactionIdGetGlobalCSN
+ *
+ * Get GlobalCSN for specified TransactionId taking care about special xids,
+ * xids beyond TransactionXmin and InDoubt states.
+ */
+GlobalCSN
+TransactionIdGetGlobalCSN(TransactionId xid)
+{
+	GlobalCSN global_csn;
+
+	Assert(track_global_snapshots);
+
+	/* Handle permanent TransactionId's for which we don't have mapping */
+	if (!TransactionIdIsNormal(xid))
+	{
+		if (xid == InvalidTransactionId)
+			return AbortedGlobalCSN;
+		if (xid == FrozenTransactionId || xid == BootstrapTransactionId)
+			return FrozenGlobalCSN;
+		Assert(false); /* Should not happend */
+	}
+
+	/*
+	 * For xids which less then TransactionXmin GlobalCSNLog can be already
+	 * trimmed but we know that such transaction is definetly not concurrently
+	 * running according to any snapshot including timetravel ones. Callers
+	 * should check TransactionDidCommit after.
+	 */
+	if (TransactionIdPrecedes(xid, TransactionXmin))
+		return FrozenGlobalCSN;
+
+	/* Read GlobalCSN from SLRU */
+	global_csn = GlobalCSNLogGetCSN(xid);
+
+	/*
+	 * If we faced InDoubt state then transaction is beeing committed and we
+	 * should wait until GlobalCSN will be assigned so that visibility check
+	 * could decide whether tuple is in snapshot. See also comments in
+	 * GlobalSnapshotPrecommit().
+	 */
+	if (GlobalCSNIsInDoubt(global_csn))
+	{
+		XactLockTableWait(xid, NULL, NULL, XLTW_None);
+		global_csn = GlobalCSNLogGetCSN(xid);
+		Assert(GlobalCSNIsNormal(global_csn) ||
+				GlobalCSNIsAborted(global_csn));
+	}
+
+	Assert(GlobalCSNIsNormal(global_csn) ||
+			GlobalCSNIsInProgress(global_csn) ||
+			GlobalCSNIsAborted(global_csn));
+
+	return global_csn;
+}
+
+/*
+ * XidInvisibleInGlobalSnapshot
+ *
+ * Version of XidInMVCCSnapshot for global transactions. For non-imported
+ * global snapshots this should give same results as XidInLocalMVCCSnapshot
+ * (except that aborts will be shown as invisible without going to clog) and to
+ * ensure such behaviour XidInMVCCSnapshot is coated with asserts that checks
+ * identicalness of XidInvisibleInGlobalSnapshot/XidInLocalMVCCSnapshot in
+ * case of ordinary snapshot.
+ */
+bool
+XidInvisibleInGlobalSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	GlobalCSN csn;
+
+	Assert(track_global_snapshots);
+
+	csn = TransactionIdGetGlobalCSN(xid);
+
+	if (GlobalCSNIsNormal(csn))
+	{
+		if (csn < snapshot->global_csn)
+			return false;
+		else
+			return true;
+	}
+	else if (GlobalCSNIsFrozen(csn))
+	{
+		/* It is bootstrap or frozen transaction */
+		return false;
+	}
+	else
+	{
+		/* It is aborted or in-progress */
+		Assert(GlobalCSNIsAborted(csn) || GlobalCSNIsInProgress(csn));
+		if (GlobalCSNIsAborted(csn))
+			Assert(TransactionIdDidAbort(xid));
+		return true;
+	}
+}
+
+
+/*****************************************************************************
+ * Functions to handle distributed commit on transaction coordinator:
+ * GlobalSnapshotPrepareCurrent() / GlobalSnapshotAssignCsnCurrent().
+ * Correspoding functions for remote nodes are defined in twophase.c:
+ * pg_global_snapshot_prepare/pg_global_snapshot_assign.
+ *****************************************************************************/
+
+
+/*
+ * GlobalSnapshotPrepareCurrent
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+GlobalCSN
+GlobalSnapshotPrepareCurrent()
+{
+	TransactionId xid = GetCurrentTransactionIdIfAny();
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	if (TransactionIdIsValid(xid))
+	{
+		TransactionId *subxids;
+		int nsubxids = xactGetCommittedChildren(&subxids);
+		GlobalCSNLogSetCSN(xid, nsubxids,
+									subxids, InDoubtGlobalCSN);
+	}
+
+	/* Nothing to write if we don't have xid */
+
+	return GlobalSnapshotGenerate(false);
+}
+
+/*
+ * GlobalSnapshotAssignCsnCurrent
+ *
+ * Asign GlobalCSN for currently active transaction. GlobalCSN is supposedly
+ * maximal among of values returned by GlobalSnapshotPrepareCurrent and
+ * pg_global_snapshot_prepare.
+ */
+void
+GlobalSnapshotAssignCsnCurrent(GlobalCSN global_csn)
+{
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	if (!GlobalCSNIsNormal(global_csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_global_snapshot_assign expects normal global_csn")));
+
+	/* Skip emtpty transactions */
+	if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+		return;
+
+	/* Set global_csn and defuse ProcArrayEndTransaction from assigning one */
+	pg_atomic_write_u64(&MyProc->assignedGlobalCsn, global_csn);
+}
+
+
+/*****************************************************************************
+ * Functions to handle global and local transactions commit.
+ *
+ * For local transactions GlobalSnapshotPrecommit sets InDoubt state before
+ * ProcArrayEndTransaction is called and transaction data potetntially becomes
+ * visible to other backends. ProcArrayEndTransaction (or ProcArrayRemove in
+ * twophase case) then acquires global_csn under ProcArray lock and stores it
+ * in proc->assignedGlobalCsn. It's important that global_csn for commit is
+ * generated under ProcArray lock, otherwise global and local snapshots won't
+ * be equivalent. Consequent call to GlobalSnapshotCommit will write
+ * proc->assignedGlobalCsn to GlobalCSNLog.
+ *
+ * Same rules applies to global transaction, except that global_csn is already
+ * assigned by GlobalSnapshotAssignCsnCurrent/pg_global_snapshot_assign and
+ * GlobalSnapshotPrecommit is basically no-op.
+ *
+ * GlobalSnapshotAbort is slightly different comparing to commit because abort
+ * can skip InDoubt phase and can be called for transaction subtree.
+ *****************************************************************************/
+
+
+/*
+ * GlobalSnapshotAbort
+ *
+ * Abort transaction in GlobalCsnLog. We can skip InDoubt state for aborts
+ * since no concurrent transactions allowed to see aborted data anyway.
+ */
+void
+GlobalSnapshotAbort(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	if (!track_global_snapshots)
+		return;
+
+	GlobalCSNLogSetCSN(xid, nsubxids, subxids, AbortedGlobalCSN);
+
+	/*
+	 * Clean assignedGlobalCsn anyway, as it was possibly set in
+	 * GlobalSnapshotAssignCsnCurrent.
+	 */
+	pg_atomic_write_u64(&proc->assignedGlobalCsn, InProgressGlobalCSN);
+}
+
+/*
+ * GlobalSnapshotPrecommit
+ *
+ * Set InDoubt status for local transaction that we are going to commit.
+ * This step is needed to achieve consistency between local snapshots and
+ * global csn-based snapshots. We don't hold ProcArray lock while writing
+ * csn for transaction in SLRU but instead we set InDoubt status before
+ * transaction is deleted from ProcArray so the readers who will read csn
+ * in the gap between ProcArray removal and GlobalCSN assignment can wait
+ * until GlobalCSN is finally assigned. See also TransactionIdGetGlobalCSN().
+ *
+ * For global transaction this does nothing as InDoubt state was written
+ * earlier.
+ *
+ * This should be called only from parallel group leader before backend is
+ * deleted from ProcArray.
+ */
+void
+GlobalSnapshotPrecommit(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	GlobalCSN oldAssignedGlobalCsn = InProgressGlobalCSN;
+	bool in_progress;
+
+	if (!track_global_snapshots)
+		return;
+
+	/* Set InDoubt status if it is local transaction */
+	in_progress = pg_atomic_compare_exchange_u64(&proc->assignedGlobalCsn,
+												 &oldAssignedGlobalCsn,
+												 InDoubtGlobalCSN);
+	if (in_progress)
+	{
+		Assert(GlobalCSNIsInProgress(oldAssignedGlobalCsn));
+		GlobalCSNLogSetCSN(xid, nsubxids,
+						   subxids, InDoubtGlobalCSN);
+	}
+	else
+	{
+		/* Otherwise we should have valid GlobalCSN by this time */
+		Assert(GlobalCSNIsNormal(oldAssignedGlobalCsn));
+		/* Also global transaction should already be in InDoubt state */
+		Assert(GlobalCSNIsInDoubt(GlobalCSNLogGetCSN(xid)));
+	}
+}
+
+/*
+ * GlobalSnapshotCommit
+ *
+ * Write GlobalCSN that were acquired earlier to GlobalCsnLog. Should be
+ * preceded by GlobalSnapshotPrecommit() so readers can wait until we finally
+ * finished writing to SLRU.
+ *
+ * Should be called after ProcArrayEndTransaction, but before releasing
+ * transaction locks, so that TransactionIdGetGlobalCSN can wait on this
+ * lock for GlobalCSN.
+ */
+void
+GlobalSnapshotCommit(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	volatile GlobalCSN assigned_global_csn;
+
+	if (!track_global_snapshots)
+		return;
+
+	if (!TransactionIdIsValid(xid))
+	{
+		assigned_global_csn = pg_atomic_read_u64(&proc->assignedGlobalCsn);
+		Assert(GlobalCSNIsInProgress(assigned_global_csn));
+		return;
+	}
+
+	/* Finally write resulting GlobalCSN in SLRU */
+	assigned_global_csn = pg_atomic_read_u64(&proc->assignedGlobalCsn);
+	Assert(GlobalCSNIsNormal(assigned_global_csn));
+	GlobalCSNLogSetCSN(xid, nsubxids,
+						   subxids, assigned_global_csn);
+
+	/* Reset for next transaction */
+	pg_atomic_write_u64(&proc->assignedGlobalCsn, InProgressGlobalCSN);
+}
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 0ecc02a3dd..c5d59fa2f2 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/global_snapshot.h"
 #include "access/global_csn_log.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
@@ -1477,8 +1478,34 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 									   hdr->nabortrels, abortrels,
 									   gid);
 
+	/*
+	 * GlobalSnapshot callbacks that should be called right before we are
+	 * going to become visible. Details in comments to this functions.
+	 */
+	if (isCommit)
+		GlobalSnapshotPrecommit(proc, xid, hdr->nsubxacts, children);
+	else
+		GlobalSnapshotAbort(proc, xid, hdr->nsubxacts, children);
+
+
 	ProcArrayRemove(proc, latestXid);
 
+	/*
+	 * Stamp our transaction with GlobalCSN in GlobalCsnLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks, since TransactionIdGetGlobalCSN relies on
+	 * XactLockTableWait to await global_csn.
+	 */
+	if (isCommit)
+	{
+		GlobalSnapshotCommit(proc, xid, hdr->nsubxacts, children);
+	}
+	else
+	{
+		Assert(GlobalCSNIsInProgress(
+				   pg_atomic_read_u64(&proc->assignedGlobalCsn)));
+	}
+
 	/*
 	 * In case we fail while running the callbacks, mark the gxact invalid so
 	 * no one else will try to commit/rollback, and so it will be recycled if
@@ -2439,3 +2466,132 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
 		RemoveTwoPhaseFile(xid, giveWarning);
 	RemoveGXact(gxact);
 }
+
+/*
+ * GlobalSnapshotPrepareTwophase
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ *
+ * This function is a counterpart of GlobalSnapshotPrepareCurrent() for
+ * twophase transactions.
+ */
+static GlobalCSN
+GlobalSnapshotPrepareTwophase(const char *gid)
+{
+	GlobalTransaction gxact;
+	PGXACT	   *pgxact;
+	char	   *buf;
+	TransactionId xid;
+	xl_xact_parsed_prepare parsed;
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
+	xid = pgxact->xid;
+
+	if (gxact->ondisk)
+		buf = ReadTwoPhaseFile(xid, true);
+	else
+		XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
+
+	ParsePrepareRecord(0, (xl_xact_prepare *) buf, &parsed);
+
+	GlobalCSNLogSetCSN(xid, parsed.nsubxacts,
+					parsed.subxacts, InDoubtGlobalCSN);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+
+	pfree(buf);
+
+	return GlobalSnapshotGenerate(false);
+}
+
+/*
+ * SQL interface to GlobalSnapshotPrepareTwophase()
+ *
+ * TODO: Rewrite this as PREPARE TRANSACTION 'gid' RETURNING SNAPSHOT
+ */
+Datum
+pg_global_snapshot_prepare(PG_FUNCTION_ARGS)
+{
+	const char *gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	GlobalCSN	global_csn;
+
+	global_csn = GlobalSnapshotPrepareTwophase(gid);
+
+	PG_RETURN_INT64(global_csn);
+}
+
+
+/*
+ * TwoPhaseAssignGlobalCsn
+ *
+ * Asign GlobalCSN for currently active transaction. GlobalCSN is supposedly
+ * maximal among of values returned by GlobalSnapshotPrepareCurrent and
+ * pg_global_snapshot_prepare.
+ *
+ * This function is a counterpart of GlobalSnapshotAssignCsnCurrent() for
+ * twophase transactions.
+ */
+static void
+GlobalSnapshotAssignCsnTwoPhase(const char *gid, GlobalCSN global_csn)
+{
+	GlobalTransaction gxact;
+	PGPROC	   *proc;
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	if (!GlobalCSNIsNormal(global_csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_global_snapshot_assign expects normal global_csn")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+	/* Set global_csn and defuse ProcArrayRemove from assigning one. */
+	pg_atomic_write_u64(&proc->assignedGlobalCsn, global_csn);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+}
+
+/*
+ * SQL interface to GlobalSnapshotAssignCsnTwoPhase()
+ *
+ * TODO: Rewrite this as COMMIT PREPARED 'gid' SNAPSHOT 'global_csn'
+ */
+Datum
+pg_global_snapshot_assign(PG_FUNCTION_ARGS)
+{
+	const char *gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	GlobalCSN	global_csn = PG_GETARG_INT64(1);
+
+	GlobalSnapshotAssignCsnTwoPhase(gid, global_csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 3984dd3e1a..8fddb6edaf 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/global_snapshot.h"
 #include "access/multixact.h"
 #include "access/parallel.h"
 #include "access/subtrans.h"
@@ -1433,6 +1434,14 @@ RecordTransactionCommit(void)
 
 	/* Reset XactLastRecEnd until the next transaction writes something */
 	XactLastRecEnd = 0;
+
+	/*
+	 * Mark our transaction as InDoubt in GlobalCsnLog and get ready for
+	 * commit.
+	 */
+	if (markXidCommitted)
+		GlobalSnapshotPrecommit(MyProc, xid, nchildren, children);
+
 cleanup:
 	/* Clean up local data */
 	if (rels)
@@ -1694,6 +1703,11 @@ RecordTransactionAbort(bool isSubXact)
 	 */
 	TransactionIdAbortTree(xid, nchildren, children);
 
+	/*
+	 * Mark our transaction as Aborted in GlobalCsnLog.
+	 */
+	GlobalSnapshotAbort(MyProc, xid, nchildren, children);
+
 	END_CRIT_SECTION();
 
 	/* Compute latestXid while we have the child XIDs handy */
@@ -2183,6 +2197,21 @@ CommitTransaction(void)
 	 */
 	ProcArrayEndTransaction(MyProc, latestXid);
 
+	/*
+	 * Stamp our transaction with GlobalCSN in GlobalCsnLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks.
+	 */
+	if (!is_parallel_worker)
+	{
+		TransactionId  xid = GetTopTransactionIdIfAny();
+		TransactionId *subxids;
+		int			   nsubxids;
+
+		nsubxids = xactGetCommittedChildren(&subxids);
+		GlobalSnapshotCommit(MyProc, xid, nsubxids, subxids);
+	}
+
 	/*
 	 * This is all post-commit cleanup.  Note that if an error is raised here,
 	 * it's too late to abort the transaction.  This should be just
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 285d9d442e..b485db5456 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7058,6 +7058,7 @@ StartupXLOG(void)
 			StartupCLOG();
 			StartupGlobalCSNLog(oldestActiveXID);
 			StartupSUBTRANS(oldestActiveXID);
+			GlobalSnapshotStartup(oldestActiveXID);
 
 			/*
 			 * If we're beginning at a shutdown checkpoint, we know that
@@ -7876,6 +7877,7 @@ StartupXLOG(void)
 		StartupCLOG();
 		StartupGlobalCSNLog(oldestActiveXID);
 		StartupSUBTRANS(oldestActiveXID);
+		GlobalSnapshotStartup(oldestActiveXID);
 	}
 
 	/*
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index dc2d2959c4..d1819dc2c8 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -22,6 +22,7 @@
 #include "access/nbtree.h"
 #include "access/subtrans.h"
 #include "access/twophase.h"
+#include "access/global_snapshot.h"
 #include "commands/async.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -145,6 +146,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, WalSndShmemSize());
 		size = add_size(size, WalRcvShmemSize());
 		size = add_size(size, ApplyLauncherShmemSize());
+		size = add_size(size, GlobalSnapshotShmemSize());
 		size = add_size(size, SnapMgrShmemSize());
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
@@ -266,6 +268,7 @@ CreateSharedMemoryAndSemaphores(void)
 	BTreeShmemInit();
 	SyncScanShmemInit();
 	AsyncShmemInit();
+	GlobalSnapshotShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 8ae4906474..23db5039a4 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -47,6 +47,7 @@
 
 #include "access/clog.h"
 #include "access/global_csn_log.h"
+#include "access/global_snapshot.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -95,6 +96,8 @@ typedef struct ProcArrayStruct
 	TransactionId replication_slot_xmin;
 	/* oldest catalog xmin of any replication slot */
 	TransactionId replication_slot_catalog_xmin;
+	/* xmin of oldest active global snapshot */
+	TransactionId global_snapshot_xmin;
 
 	/* indexes into allPgXact[], has PROCARRAY_MAXPROCS entries */
 	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
@@ -250,6 +253,7 @@ CreateSharedProcArray(void)
 		procArray->lastOverflowedXid = InvalidTransactionId;
 		procArray->replication_slot_xmin = InvalidTransactionId;
 		procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+		procArray->global_snapshot_xmin = InvalidTransactionId;
 	}
 
 	allProcs = ProcGlobal->allProcs;
@@ -355,6 +359,17 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 		if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
 								  latestXid))
 			ShmemVariableCache->latestCompletedXid = latestXid;
+
+		/*
+		 * Assign global csn while holding ProcArrayLock for non-global
+		 * COMMIT PREPARED. After lock is released consequent
+		 * GlobalSnapshotCommit() will write this value to GlobalCsnLog.
+		 *
+		 * In case of global commit proc->assignedGlobalCsn is already set
+		 * by prior AssignGlobalCsn().
+		 */
+		if (GlobalCSNIsInDoubt(pg_atomic_read_u64(&proc->assignedGlobalCsn)))
+			pg_atomic_write_u64(&proc->assignedGlobalCsn, GlobalSnapshotGenerate(false));
 	}
 	else
 	{
@@ -435,6 +450,8 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 
 		proc->lxid = InvalidLocalTransactionId;
 		pgxact->xmin = InvalidTransactionId;
+		proc->originalXmin = InvalidTransactionId;
+
 		/* must be cleared with xid/xmin: */
 		pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
 		proc->delayChkpt = false; /* be sure this is cleared in abort */
@@ -457,6 +474,8 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
 	pgxact->xid = InvalidTransactionId;
 	proc->lxid = InvalidLocalTransactionId;
 	pgxact->xmin = InvalidTransactionId;
+	proc->originalXmin = InvalidTransactionId;
+
 	/* must be cleared with xid/xmin: */
 	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
 	proc->delayChkpt = false; /* be sure this is cleared in abort */
@@ -470,6 +489,20 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
 	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
 							  latestXid))
 		ShmemVariableCache->latestCompletedXid = latestXid;
+
+	/*
+	 * Assign global csn while holding ProcArrayLock for non-global
+	 * COMMIT. After lock is released consequent GlobalSnapshotFinish() will
+	 * write this value to GlobalCsnLog.
+	 *
+	 * In case of global commit MyProc->assignedGlobalCsn is already set
+	 * by prior AssignGlobalCsn().
+	 *
+	 * TODO: in case of group commit we can generate one GlobalSnapshot for
+	 * whole group to save time on timestamp aquisition.
+	 */
+	if (GlobalCSNIsInDoubt(pg_atomic_read_u64(&proc->assignedGlobalCsn)))
+		pg_atomic_write_u64(&proc->assignedGlobalCsn, GlobalSnapshotGenerate(false));
 }
 
 /*
@@ -613,6 +646,7 @@ ProcArrayClearTransaction(PGPROC *proc)
 	pgxact->xid = InvalidTransactionId;
 	proc->lxid = InvalidLocalTransactionId;
 	pgxact->xmin = InvalidTransactionId;
+	proc->originalXmin = InvalidTransactionId;
 	proc->recoveryConflictPending = false;
 
 	/* redundant, but just in case */
@@ -1315,6 +1349,7 @@ GetOldestXmin(Relation rel, int flags)
 
 	TransactionId replication_slot_xmin = InvalidTransactionId;
 	TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+	TransactionId global_snapshot_xmin = InvalidTransactionId;
 
 	/*
 	 * If we're not computing a relation specific limit, or if a shared
@@ -1351,8 +1386,9 @@ GetOldestXmin(Relation rel, int flags)
 			proc->databaseId == MyDatabaseId ||
 			proc->databaseId == 0)	/* always include WalSender */
 		{
-			/* Fetch xid just once - see GetNewTransactionId */
+			/* Fetch both xids just once - see GetNewTransactionId */
 			TransactionId xid = UINT32_ACCESS_ONCE(pgxact->xid);
+			TransactionId original_xmin = UINT32_ACCESS_ONCE(proc->originalXmin);
 
 			/* First consider the transaction's own Xid, if any */
 			if (TransactionIdIsNormal(xid) &&
@@ -1365,8 +1401,17 @@ GetOldestXmin(Relation rel, int flags)
 			 * We must check both Xid and Xmin because a transaction might
 			 * have an Xmin but not (yet) an Xid; conversely, if it has an
 			 * Xid, that could determine some not-yet-set Xmin.
+			 *
+			 * In case of oldestXmin calculation for GlobalSnapshotMapXmin()
+			 * pgxact->xmin should be changed to proc->originalXmin. Details
+			 * in commets to GlobalSnapshotMapXmin.
 			 */
-			xid = UINT32_ACCESS_ONCE(pgxact->xmin);
+			if ((flags & PROCARRAY_NON_IMPORTED_XMIN) &&
+					TransactionIdIsValid(original_xmin))
+				xid = original_xmin;
+			else
+				xid = UINT32_ACCESS_ONCE(pgxact->xmin);
+
 			if (TransactionIdIsNormal(xid) &&
 				TransactionIdPrecedes(xid, result))
 				result = xid;
@@ -1380,6 +1425,7 @@ GetOldestXmin(Relation rel, int flags)
 	 */
 	replication_slot_xmin = procArray->replication_slot_xmin;
 	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	global_snapshot_xmin = ProcArrayGetGlobalSnapshotXmin();
 
 	if (RecoveryInProgress())
 	{
@@ -1421,6 +1467,11 @@ GetOldestXmin(Relation rel, int flags)
 			result = FirstNormalTransactionId;
 	}
 
+	if (!(flags & PROCARRAY_NON_IMPORTED_XMIN) &&
+		TransactionIdIsValid(global_snapshot_xmin) &&
+		NormalTransactionIdPrecedes(global_snapshot_xmin, result))
+		result = global_snapshot_xmin;
+
 	/*
 	 * Check whether there are replication slots requiring an older xmin.
 	 */
@@ -1515,8 +1566,10 @@ GetSnapshotData(Snapshot snapshot)
 	int			count = 0;
 	int			subcount = 0;
 	bool		suboverflowed = false;
+	GlobalCSN	global_csn = FrozenGlobalCSN;
 	TransactionId replication_slot_xmin = InvalidTransactionId;
 	TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+	TransactionId global_snapshot_xmin = InvalidTransactionId;
 
 	Assert(snapshot != NULL);
 
@@ -1708,10 +1761,18 @@ GetSnapshotData(Snapshot snapshot)
 	 */
 	replication_slot_xmin = procArray->replication_slot_xmin;
 	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	global_snapshot_xmin = ProcArrayGetGlobalSnapshotXmin();
 
 	if (!TransactionIdIsValid(MyPgXact->xmin))
 		MyPgXact->xmin = TransactionXmin = xmin;
 
+	/*
+	 * Take GlobalCSN under ProcArrayLock so the local/global snapshot stays
+	 * synchronized.
+	 */
+	if (track_global_snapshots)
+		global_csn = GlobalSnapshotGenerate(false);
+
 	LWLockRelease(ProcArrayLock);
 
 	/*
@@ -1727,6 +1788,10 @@ GetSnapshotData(Snapshot snapshot)
 	if (!TransactionIdIsNormal(RecentGlobalXmin))
 		RecentGlobalXmin = FirstNormalTransactionId;
 
+	if (/*track_global_snapshots && */TransactionIdIsValid(global_snapshot_xmin) &&
+		TransactionIdPrecedes(global_snapshot_xmin, RecentGlobalXmin))
+		RecentGlobalXmin = global_snapshot_xmin;
+
 	/* Check whether there's a replication slot requiring an older xmin. */
 	if (TransactionIdIsValid(replication_slot_xmin) &&
 		NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin))
@@ -1782,6 +1847,11 @@ GetSnapshotData(Snapshot snapshot)
 		MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin);
 	}
 
+	snapshot->imported_global_csn = false;
+	snapshot->global_csn = global_csn;
+	if (global_snapshot_defer_time > 0 && IsUnderPostmaster)
+		GlobalSnapshotMapXmin(snapshot->global_csn);
+
 	return snapshot;
 }
 
@@ -3129,6 +3199,24 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 	LWLockRelease(ProcArrayLock);
 }
 
+/*
+ * ProcArraySetGlobalSnapshotXmin
+ */
+void
+ProcArraySetGlobalSnapshotXmin(TransactionId xmin)
+{
+	/* We rely on atomic fetch/store of xid */
+	procArray->global_snapshot_xmin = xmin;
+}
+
+/*
+ * ProcArrayGetGlobalSnapshotXmin
+ */
+TransactionId
+ProcArrayGetGlobalSnapshotXmin(void)
+{
+	return procArray->global_snapshot_xmin;
+}
 
 #define XidCacheRemove(i) \
 	do { \
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index fe18c93b61..86d0a0acae 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,4 @@ OldSnapshotTimeMapLock				42
 LogicalRepWorkerLock				43
 CLogTruncationLock					44
 GlobalCSNLogControlLock				45
+GlobalSnapshotXidMapLock			46
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 5aa19d3f78..8a47a2d375 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -37,6 +37,7 @@
 
 #include "access/transam.h"
 #include "access/twophase.h"
+#include "access/global_snapshot.h"
 #include "access/xact.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -441,6 +442,9 @@ InitProcess(void)
 	MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
 	Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
 
+	MyProc->originalXmin = InvalidTransactionId;
+	pg_atomic_init_u64(&MyProc->assignedGlobalCsn, InProgressGlobalCSN);
+
 	/*
 	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
 	 * on it.  That allows us to repoint the process latch, which so far
@@ -584,6 +588,7 @@ InitAuxiliaryProcess(void)
 	MyProc->lwWaitMode = 0;
 	MyProc->waitLock = NULL;
 	MyProc->waitProcLock = NULL;
+	MyProc->originalXmin = InvalidTransactionId;
 #ifdef USE_ASSERT_CHECKING
 	{
 		int			i;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 0ca331c6f9..7154c3499e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -28,6 +28,7 @@
 
 #include "access/commit_ts.h"
 #include "access/gin.h"
+#include "access/global_snapshot.h"
 #include "access/rmgr.h"
 #include "access/tableam.h"
 #include "access/transam.h"
@@ -1181,7 +1182,7 @@ static struct config_bool ConfigureNamesBool[] =
 			gettext_noop("Used to achieve REPEATEBLE READ isolation level for postgres_fdw transactions.")
 		},
 		&track_global_snapshots,
-		true, /* XXX: set true to simplify tesing. XXX2: Seems that RESOURCES_MEM isn't the best catagory */
+		false, /* XXX: Seems that RESOURCES_MEM isn't the best catagory */
 		NULL, NULL, NULL
 	},
 	{
@@ -2495,6 +2496,16 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"global_snapshot_defer_time", PGC_POSTMASTER, REPLICATION_MASTER,
+			gettext_noop("Minimal age of records which allowed to be vacuumed, in seconds."),
+			NULL
+		},
+		&global_snapshot_defer_time,
+		5, 0, INT_MAX,
+		NULL, NULL, NULL
+	},
+
 	/*
 	 * See also CheckRequiredParameterValues() if this parameter changes
 	 */
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 995b6ca155..0fd7d8501c 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -306,6 +306,8 @@
 				# and comma-separated list of application_name
 				# from standby(s); '*' = all
 #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+#global_snapshot_defer_time = 0	# minimal age of records which allowed to be
+				# vacuumed, in seconds
 
 # - Standby Servers -
 
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 1c063c592c..3d925a7866 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -48,6 +48,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/global_snapshot.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -247,6 +248,8 @@ typedef struct SerializedSnapshotData
 	CommandId	curcid;
 	TimestampTz whenTaken;
 	XLogRecPtr	lsn;
+	GlobalCSN	global_csn;
+	bool		imported_global_csn;
 } SerializedSnapshotData;
 
 Size
@@ -1024,7 +1027,9 @@ SnapshotResetXmin(void)
 										pairingheap_first(&RegisteredSnapshots));
 
 	if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin))
+	{
 		MyPgXact->xmin = minSnapshot->xmin;
+	}
 }
 
 /*
@@ -2115,6 +2120,8 @@ SerializeSnapshot(Snapshot snapshot, char *start_address)
 	serialized_snapshot.curcid = snapshot->curcid;
 	serialized_snapshot.whenTaken = snapshot->whenTaken;
 	serialized_snapshot.lsn = snapshot->lsn;
+	serialized_snapshot.global_csn = snapshot->global_csn;
+	serialized_snapshot.imported_global_csn = snapshot->imported_global_csn;
 
 	/*
 	 * Ignore the SubXID array if it has overflowed, unless the snapshot was
@@ -2189,6 +2196,8 @@ RestoreSnapshot(char *start_address)
 	snapshot->curcid = serialized_snapshot.curcid;
 	snapshot->whenTaken = serialized_snapshot.whenTaken;
 	snapshot->lsn = serialized_snapshot.lsn;
+	snapshot->global_csn = serialized_snapshot.global_csn;
+	snapshot->imported_global_csn = serialized_snapshot.imported_global_csn;
 
 	/* Copy XIDs, if present. */
 	if (serialized_snapshot.xcnt > 0)
@@ -2228,8 +2237,8 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc)
 }
 
 /*
- * XidInMVCCSnapshot
- *		Is the given XID still-in-progress according to the snapshot?
+ * XidInLocalMVCCSnapshot
+ *		Is the given XID still-in-progress according to the local snapshot?
  *
  * Note: GetSnapshotData never stores either top xid or subxids of our own
  * backend into a snapshot, so these xids will not be reported as "running"
@@ -2237,8 +2246,8 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc)
  * TransactionIdIsCurrentTransactionId first, except when it's known the
  * XID could not be ours anyway.
  */
-bool
-XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+static bool
+XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
 	uint32		i;
 
@@ -2348,3 +2357,153 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 
 	return false;
 }
+
+/*
+ * XidInMVCCSnapshot
+ *
+ * Check whether this xid is in snapshot, taking into account fact that
+ * snapshot can be global. When track_global_snapshots is switched off
+ * just call XidInLocalMVCCSnapshot().
+ */
+bool
+XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	bool in_snapshot;
+
+	if (snapshot->imported_global_csn)
+	{
+		Assert(track_global_snapshots);
+		/* No point to using snapshot info except CSN */
+		return XidInvisibleInGlobalSnapshot(xid, snapshot);
+	}
+
+	in_snapshot = XidInLocalMVCCSnapshot(xid, snapshot);
+
+	if (!track_global_snapshots)
+	{
+		Assert(GlobalCSNIsFrozen(snapshot->global_csn));
+		return in_snapshot;
+	}
+
+	if (in_snapshot)
+	{
+		/*
+		 * This xid may be already in unknown state and in that case
+		 * we must wait and recheck.
+		 *
+		 * TODO: this check can be skipped if we know for sure that there were
+		 * no global transactions when this snapshot was taken. That requires
+		 * some changes to mechanisms of global snapshots exprot/import (if
+		 * backend set xmin then we should have a-priori knowledge that this
+		 * transaction going to be global or local -- right now this is not
+		 * enforced). Leave that for future and don't complicate this patch.
+		 */
+		return XidInvisibleInGlobalSnapshot(xid, snapshot);
+	}
+	else
+	{
+#ifdef USE_ASSERT_CHECKING
+		/* Check that global snapshot gives the same results as local one */
+		if (XidInvisibleInGlobalSnapshot(xid, snapshot))
+		{
+			GlobalCSN gcsn = TransactionIdGetGlobalCSN(xid);
+			Assert(GlobalCSNIsAborted(gcsn));
+		}
+#endif
+		return false;
+	}
+}
+
+/*
+ * ExportGlobalSnapshot
+ *
+ * Export global_csn so that caller can expand this transaction to other
+ * nodes.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+GlobalCSN
+ExportGlobalSnapshot()
+{
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not export global snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "track_global_snapshots")));
+
+	return CurrentSnapshot->global_csn;
+}
+
+/* SQL accessor to ExportGlobalSnapshot() */
+Datum
+pg_global_snapshot_export(PG_FUNCTION_ARGS)
+{
+	GlobalCSN	global_csn = ExportGlobalSnapshot();
+	PG_RETURN_UINT64(global_csn);
+}
+
+/*
+ * ImportGlobalSnapshot
+ *
+ * Import global_csn and retract this backends xmin to the value that was
+ * actual when we had such global_csn.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+void
+ImportGlobalSnapshot(GlobalCSN snap_global_csn)
+{
+	volatile TransactionId xmin;
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import global snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "track_global_snapshots")));
+
+	if (global_snapshot_defer_time <= 0)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import global snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is positive.",
+					 "global_snapshot_defer_time")));
+
+	/*
+	 * Call GlobalSnapshotToXmin under ProcArrayLock to avoid situation that
+	 * resulting xmin will be evicted from map before we will set it into our
+	 * backend's xmin.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	xmin = GlobalSnapshotToXmin(snap_global_csn);
+	if (!TransactionIdIsValid(xmin))
+	{
+		LWLockRelease(ProcArrayLock);
+		elog(ERROR, "GlobalSnapshotToXmin: global snapshot too old");
+	}
+	MyProc->originalXmin = MyPgXact->xmin;
+	MyPgXact->xmin = TransactionXmin = xmin;
+	LWLockRelease(ProcArrayLock);
+
+	CurrentSnapshot->xmin = xmin; /* defuse SnapshotResetXmin() */
+	CurrentSnapshot->global_csn = snap_global_csn;
+	CurrentSnapshot->imported_global_csn = true;
+	GlobalSnapshotSync(snap_global_csn);
+
+	Assert(TransactionIdPrecedesOrEquals(RecentGlobalXmin, xmin));
+	Assert(TransactionIdPrecedesOrEquals(RecentGlobalDataXmin, xmin));
+}
+
+/* SQL accessor to ImportGlobalSnapshot() */
+Datum
+pg_global_snapshot_import(PG_FUNCTION_ARGS)
+{
+	GlobalCSN	global_csn = PG_GETARG_UINT64(0);
+	ImportGlobalSnapshot(global_csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/include/access/global_snapshot.h b/src/include/access/global_snapshot.h
new file mode 100644
index 0000000000..246b180cfd
--- /dev/null
+++ b/src/include/access/global_snapshot.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * global_snapshot.h
+ *	  Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/global_snapshot.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GLOBAL_SNAPSHOT_H
+#define GLOBAL_SNAPSHOT_H
+
+#include "port/atomics.h"
+#include "storage/lock.h"
+#include "utils/snapshot.h"
+#include "utils/guc.h"
+
+/*
+ * snapshot.h is used in frontend code so atomic variant of GlobalCSN type
+ * is defined here.
+ */
+typedef pg_atomic_uint64 GlobalCSN_atomic;
+
+#define InProgressGlobalCSN	 UINT64CONST(0x0)
+#define AbortedGlobalCSN	 UINT64CONST(0x1)
+#define FrozenGlobalCSN		 UINT64CONST(0x2)
+#define InDoubtGlobalCSN	 UINT64CONST(0x3)
+#define FirstNormalGlobalCSN UINT64CONST(0x4)
+
+#define GlobalCSNIsInProgress(csn)	((csn) == InProgressGlobalCSN)
+#define GlobalCSNIsAborted(csn)		((csn) == AbortedGlobalCSN)
+#define GlobalCSNIsFrozen(csn)		((csn) == FrozenGlobalCSN)
+#define GlobalCSNIsInDoubt(csn)		((csn) == InDoubtGlobalCSN)
+#define GlobalCSNIsNormal(csn)		((csn) >= FirstNormalGlobalCSN)
+
+
+extern int global_snapshot_defer_time;
+
+
+extern Size GlobalSnapshotShmemSize(void);
+extern void GlobalSnapshotShmemInit(void);
+extern void GlobalSnapshotStartup(TransactionId oldestActiveXID);
+
+extern void GlobalSnapshotMapXmin(GlobalCSN snapshot_global_csn);
+extern TransactionId GlobalSnapshotToXmin(GlobalCSN snapshot_global_csn);
+
+extern GlobalCSN GlobalSnapshotGenerate(bool locked);
+
+extern bool XidInvisibleInGlobalSnapshot(TransactionId xid, Snapshot snapshot);
+
+extern void GlobalSnapshotSync(GlobalCSN remote_gcsn);
+
+extern GlobalCSN TransactionIdGetGlobalCSN(TransactionId xid);
+
+extern GlobalCSN GlobalSnapshotPrepareGlobal(const char *gid);
+extern void GlobalSnapshotAssignCsnGlobal(const char *gid,
+										  GlobalCSN global_csn);
+
+extern GlobalCSN GlobalSnapshotPrepareCurrent(void);
+extern void GlobalSnapshotAssignCsnCurrent(GlobalCSN global_csn);
+
+extern void GlobalSnapshotAbort(PGPROC *proc, TransactionId xid, int nsubxids,
+								TransactionId *subxids);
+extern void GlobalSnapshotPrecommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void GlobalSnapshotCommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+
+#endif							/* GLOBAL_SNAPSHOT_H */
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index 2ca71c3445..b4899f3754 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -18,6 +18,7 @@
 #include "access/xlogdefs.h"
 #include "datatype/timestamp.h"
 #include "storage/lock.h"
+#include "utils/snapshot.h"
 
 /*
  * GlobalTransactionData is defined in twophase.c; other places have no
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index a85c78e796..64c3c71df3 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10945,4 +10945,18 @@
   proname => 'is_normalized', prorettype => 'bool',
   proargtypes => 'text text', prosrc => 'unicode_is_normalized' },
 
+# global transaction handling
+{ oid => '4388', descr => 'export global transaction snapshot',
+  proname => 'pg_global_snapshot_export', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_global_snapshot_export' },
+{ oid => '4389', descr => 'import global transaction snapshot',
+  proname => 'pg_global_snapshot_import', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'int8', prosrc => 'pg_global_snapshot_import' },
+{ oid => '4390', descr => 'prepare distributed transaction for commit, get global_csn',
+  proname => 'pg_global_snapshot_prepare', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => 'text', prosrc => 'pg_global_snapshot_prepare' },
+{ oid => '4391', descr => 'assign global_csn to distributed transaction',
+  proname => 'pg_global_snapshot_assign', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'text int8', prosrc => 'pg_global_snapshot_assign' },
+
 ]
diff --git a/src/include/datatype/timestamp.h b/src/include/datatype/timestamp.h
index 6be6d35d1e..583b1beea5 100644
--- a/src/include/datatype/timestamp.h
+++ b/src/include/datatype/timestamp.h
@@ -93,6 +93,9 @@ typedef struct
 #define USECS_PER_MINUTE INT64CONST(60000000)
 #define USECS_PER_SEC	INT64CONST(1000000)
 
+#define NSECS_PER_SEC	INT64CONST(1000000000)
+#define NSECS_PER_USEC	INT64CONST(1000)
+
 /*
  * We allow numeric timezone offsets up to 15:59:59 either way from Greenwich.
  * Currently, the record holders for wackiest offsets in actual use are zones
diff --git a/src/include/fmgr.h b/src/include/fmgr.h
index d349510b7c..5cdf2e17cb 100644
--- a/src/include/fmgr.h
+++ b/src/include/fmgr.h
@@ -280,6 +280,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum);
 #define PG_GETARG_FLOAT4(n)  DatumGetFloat4(PG_GETARG_DATUM(n))
 #define PG_GETARG_FLOAT8(n)  DatumGetFloat8(PG_GETARG_DATUM(n))
 #define PG_GETARG_INT64(n)	 DatumGetInt64(PG_GETARG_DATUM(n))
+#define PG_GETARG_UINT64(n)	 DatumGetUInt64(PG_GETARG_DATUM(n))
 /* use this if you want the raw, possibly-toasted input datum: */
 #define PG_GETARG_RAW_VARLENA_P(n)	((struct varlena *) PG_GETARG_POINTER(n))
 /* use this if you want the input datum de-toasted: */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index d6459327cc..4ac23da654 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -141,6 +141,9 @@ typedef struct timespec instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec))
+
 #else							/* !HAVE_CLOCK_GETTIME */
 
 /* Use gettimeofday() */
@@ -205,6 +208,10 @@ typedef struct timeval instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) (t).tv_usec)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + \
+		(uint64) (t).tv_usec * (uint64) 1000)
+
 #endif							/* HAVE_CLOCK_GETTIME */
 
 #else							/* WIN32 */
@@ -237,6 +244,9 @@ typedef LARGE_INTEGER instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	((uint64) (((double) (t).QuadPart * 1000000.0) / GetTimerFrequency()))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	((uint64) (((double) (t).QuadPart * 1000000000.0) / GetTimerFrequency()))
+
 static inline double
 GetTimerFrequency(void)
 {
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index ae4f573ab4..da84dbf04c 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -15,8 +15,10 @@
 #define _PROC_H_
 
 #include "access/clog.h"
+#include "access/global_snapshot.h"
 #include "access/xlogdefs.h"
 #include "lib/ilist.h"
+#include "utils/snapshot.h"
 #include "storage/latch.h"
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
@@ -57,6 +59,7 @@ struct XidCache
 #define		PROC_IN_LOGICAL_DECODING	0x10	/* currently doing logical
 												 * decoding outside xact */
 #define		PROC_RESERVED				0x20	/* reserved for procarray */
+#define		PROC_RESERVED2				0x40	/* reserved for procarray */
 
 /* flags reset at EOXact */
 #define		PROC_VACUUM_STATE_MASK \
@@ -205,6 +208,18 @@ struct PGPROC
 	PGPROC	   *lockGroupLeader;	/* lock group leader, if I'm a member */
 	dlist_head	lockGroupMembers;	/* list of members, if I'm a leader */
 	dlist_node	lockGroupLink;	/* my member link, if I'm a member */
+
+	/*
+	 * assignedGlobalCsn holds GlobalCSN for this transaction.  It is generated
+	 * under a ProcArray lock and later is writter to a GlobalCSNLog.  This
+	 * variable defined as atomic only for case of group commit, in all other
+	 * scenarios only backend responsible for this proc entry is working with
+	 * this variable.
+	 */
+	GlobalCSN_atomic assignedGlobalCsn;
+
+	/* Original xmin of this backend before global snapshot was imported */
+	TransactionId originalXmin;
 };
 
 /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index a5c7d0c064..452ae5d547 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -36,6 +36,10 @@
 
 #define		PROCARRAY_SLOTS_XMIN			0x20	/* replication slot xmin,
 													 * catalog_xmin */
+
+#define		PROCARRAY_NON_IMPORTED_XMIN		0x40	/* use originalXmin instead
+													 * of xmin to properly
+													 * maintain gsXidMap */
 /*
  * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching
  * PGXACT->vacuumFlags. Other flags are used for different purposes and
@@ -125,4 +129,8 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
 extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 											TransactionId *catalog_xmin);
 
+extern void ProcArraySetGlobalSnapshotXmin(TransactionId xmin);
+
+extern TransactionId ProcArrayGetGlobalSnapshotXmin(void);
+
 #endif							/* PROCARRAY_H */
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index b28d13ce84..f4768bc6d4 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -127,6 +127,9 @@ extern void AtSubCommit_Snapshot(int level);
 extern void AtSubAbort_Snapshot(int level);
 extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin);
 
+extern GlobalCSN ExportGlobalSnapshot(void);
+extern void ImportGlobalSnapshot(GlobalCSN snap_global_csn);
+
 extern void ImportSnapshot(const char *idstr);
 extern bool XactHasExportedSnapshots(void);
 extern void DeleteAllExportedSnapshotFiles(void);
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 57d2dfaa67..71c92c69f4 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -204,6 +204,14 @@ typedef struct SnapshotData
 
 	TimestampTz whenTaken;		/* timestamp when snapshot was taken */
 	XLogRecPtr	lsn;			/* position in the WAL stream when taken */
+
+	/*
+	 * GlobalCSN for cross-node snapshot isolation support.
+	 * Will be used only if track_global_snapshots is enabled.
+	 */
+	GlobalCSN	global_csn;
+	/* Did we have our own global_csn or imported one from different node */
+	bool		imported_global_csn;
 } SnapshotData;
 
 #endif							/* SNAPSHOT_H */
-- 
2.17.1

0003-postgres_fdw-support-for-global-snapshots-v3.patchtext/x-patch; charset=UTF-8; name=0003-postgres_fdw-support-for-global-snapshots-v3.patchDownload

From d3b8cb68ca1bb8721f738c4993083ea4cca3d255 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov" <a.lepikhov@postgrespro.ru>
Date: Tue, 12 May 2020 08:31:59 +0500
Subject: [PATCH 3/3] postgres_fdw-support-for-global-snapshots-v3

---
 contrib/postgres_fdw/Makefile                 |   9 +
 contrib/postgres_fdw/connection.c             | 290 ++++++++++++++++--
 contrib/postgres_fdw/postgres_fdw.c           |  12 +
 contrib/postgres_fdw/postgres_fdw.h           |   2 +
 .../postgres_fdw/t/001_bank_coordinator.pl    | 264 ++++++++++++++++
 .../postgres_fdw/t/002_bank_participant.pl    | 240 +++++++++++++++
 src/test/perl/PostgresNode.pm                 |  35 +++
 7 files changed, 826 insertions(+), 26 deletions(-)
 create mode 100644 contrib/postgres_fdw/t/001_bank_coordinator.pl
 create mode 100644 contrib/postgres_fdw/t/002_bank_participant.pl

diff --git a/contrib/postgres_fdw/Makefile b/contrib/postgres_fdw/Makefile
index ee8a80a392..07091f630e 100644
--- a/contrib/postgres_fdw/Makefile
+++ b/contrib/postgres_fdw/Makefile
@@ -29,3 +29,12 @@ top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/contrib/contrib-global.mk
 endif
+
+# Global makefile will do temp-install for 'check'. Since REGRESS is defined,
+# PGXS (included from contrib-global.mk or directly) will care to add
+# postgres_fdw to it as EXTRA_INSTALL and build pg_regress. It will also
+# actually run pg_regress, so the only thing left is tap tests.
+check: tapcheck
+
+tapcheck: temp-install
+	$(prove_check)
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index e45647f3ea..8e33ae0af7 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -12,8 +12,10 @@
  */
 #include "postgres.h"
 
+#include "access/global_snapshot.h"
 #include "access/htup_details.h"
 #include "access/xact.h"
+#include "access/xlog.h" /* GetSystemIdentifier() */
 #include "catalog/pg_user_mapping.h"
 #include "commands/defrem.h"
 #include "mb/pg_wchar.h"
@@ -25,6 +27,8 @@
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
+#include "utils/snapmgr.h"
+#include "utils/snapshot.h"
 #include "utils/syscache.h"
 
 /*
@@ -65,6 +69,21 @@ typedef struct ConnCacheEntry
  */
 static HTAB *ConnectionHash = NULL;
 
+/*
+ * FdwTransactionState
+ *
+ * Holds number of open remote transactions and shared state
+ * needed for all connection entries.
+ */
+typedef struct FdwTransactionState
+{
+	char		*gid;
+	int			nparticipants;
+	GlobalCSN	global_csn;
+	bool		two_phase_commit;
+} FdwTransactionState;
+static FdwTransactionState *fdwTransState;
+
 /* for assigning cursor numbers and prepared statement numbers */
 static unsigned int cursor_number = 0;
 static unsigned int prep_stmt_number = 0;
@@ -72,6 +91,9 @@ static unsigned int prep_stmt_number = 0;
 /* tracks whether any work is needed in callback functions */
 static bool xact_got_connection = false;
 
+/* counter of prepared tx made by this backend */
+static int two_phase_xact_count = 0;
+
 /* prototypes of private functions */
 static PGconn *connect_pg_server(ForeignServer *server, UserMapping *user);
 static void disconnect_pg_server(ConnCacheEntry *entry);
@@ -80,6 +102,7 @@ static void configure_remote_session(PGconn *conn);
 static void do_sql_command(PGconn *conn, const char *sql);
 static void begin_remote_xact(ConnCacheEntry *entry);
 static void pgfdw_xact_callback(XactEvent event, void *arg);
+static void deallocate_prepared_stmts(ConnCacheEntry *entry);
 static void pgfdw_subxact_callback(SubXactEvent event,
 								   SubTransactionId mySubid,
 								   SubTransactionId parentSubid,
@@ -136,6 +159,15 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 									  pgfdw_inval_callback, (Datum) 0);
 	}
 
+	/* allocate FdwTransactionState */
+	if (fdwTransState == NULL)
+	{
+		MemoryContext oldcxt;
+		oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+		fdwTransState = palloc0(sizeof(FdwTransactionState));
+		MemoryContextSwitchTo(oldcxt);
+	}
+
 	/* Set flag that we did GetConnection during the current transaction */
 	xact_got_connection = true;
 
@@ -446,7 +478,8 @@ configure_remote_session(PGconn *conn)
 }
 
 /*
- * Convenience subroutine to issue a non-data-returning SQL command to remote
+ * Convenience subroutine to issue a non-data-returning SQL command or
+ * statement to remote node.
  */
 static void
 do_sql_command(PGconn *conn, const char *sql)
@@ -456,7 +489,8 @@ do_sql_command(PGconn *conn, const char *sql)
 	if (!PQsendQuery(conn, sql))
 		pgfdw_report_error(ERROR, NULL, conn, false, sql);
 	res = pgfdw_get_result(conn, sql);
-	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	if (PQresultStatus(res) != PGRES_COMMAND_OK &&
+			PQresultStatus(res) != PGRES_TUPLES_OK)
 		pgfdw_report_error(ERROR, res, conn, true, sql);
 	PQclear(res);
 }
@@ -484,6 +518,10 @@ begin_remote_xact(ConnCacheEntry *entry)
 		elog(DEBUG3, "starting remote transaction on connection %p",
 			 entry->conn);
 
+		if (UseGlobalSnapshots && (!IsolationUsesXactSnapshot() ||
+								   IsolationIsSerializable()))
+			elog(ERROR, "Global snapshots support only REPEATABLE READ");
+
 		if (IsolationIsSerializable())
 			sql = "START TRANSACTION ISOLATION LEVEL SERIALIZABLE";
 		else
@@ -492,6 +530,23 @@ begin_remote_xact(ConnCacheEntry *entry)
 		do_sql_command(entry->conn, sql);
 		entry->xact_depth = 1;
 		entry->changing_xact_state = false;
+
+		if (UseGlobalSnapshots)
+		{
+			char import_sql[128];
+
+			/* Export our snapshot */
+			if (fdwTransState->global_csn == 0)
+				fdwTransState->global_csn = ExportGlobalSnapshot();
+
+			snprintf(import_sql, sizeof(import_sql),
+				"SELECT pg_global_snapshot_import("UINT64_FORMAT")",
+				fdwTransState->global_csn);
+
+			do_sql_command(entry->conn, import_sql);
+		}
+
+		fdwTransState->nparticipants += 1;
 	}
 
 	/*
@@ -699,6 +754,94 @@ pgfdw_report_error(int elevel, PGresult *res, PGconn *conn,
 	PG_END_TRY();
 }
 
+/* Callback typedef for BroadcastStmt */
+typedef bool (*BroadcastCmdResHandler) (PGresult *result, void *arg);
+
+/* Broadcast sql in parallel to all ConnectionHash entries */
+static bool
+BroadcastStmt(char const * sql, unsigned expectedStatus,
+				BroadcastCmdResHandler handler, void *arg)
+{
+	HASH_SEQ_STATUS scan;
+	ConnCacheEntry *entry;
+	bool		allOk = true;
+
+	/* Broadcast sql */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		pgfdw_reject_incomplete_xact_state_change(entry);
+
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			if (!PQsendQuery(entry->conn, sql))
+			{
+				PGresult   *res = PQgetResult(entry->conn);
+
+				elog(WARNING, "Failed to send command %s", sql);
+				pgfdw_report_error(WARNING, res, entry->conn, true, sql);
+				PQclear(res);
+			}
+		}
+	}
+
+	/* Collect responses */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			PGresult   *result = PQgetResult(entry->conn);
+
+			if (PQresultStatus(result) != expectedStatus ||
+				(handler && !handler(result, arg)))
+			{
+				elog(WARNING, "Failed command %s: status=%d, expected status=%d", sql, PQresultStatus(result), expectedStatus);
+				pgfdw_report_error(ERROR, result, entry->conn, true, sql);
+				allOk = false;
+			}
+			PQclear(result);
+			PQgetResult(entry->conn);	/* consume NULL result */
+		}
+	}
+
+	return allOk;
+}
+
+/* Wrapper for broadcasting commands */
+static bool
+BroadcastCmd(char const *sql)
+{
+	return BroadcastStmt(sql, PGRES_COMMAND_OK, NULL, NULL);
+}
+
+/* Wrapper for broadcasting statements */
+static bool
+BroadcastFunc(char const *sql)
+{
+	return BroadcastStmt(sql, PGRES_TUPLES_OK, NULL, NULL);
+}
+
+/* Callback for selecting maximal csn */
+static bool
+MaxCsnCB(PGresult *result, void *arg)
+{
+	char		   *resp;
+	GlobalCSN	   *max_csn = (GlobalCSN *) arg;
+	GlobalCSN		csn = 0;
+
+	resp = PQgetvalue(result, 0, 0);
+
+	if (resp == NULL || (*resp) == '\0' ||
+			sscanf(resp, UINT64_FORMAT, &csn) != 1)
+		return false;
+
+	if (*max_csn < csn)
+		*max_csn = csn;
+
+	return true;
+}
+
 /*
  * pgfdw_xact_callback --- cleanup at main-transaction end.
  */
@@ -712,6 +855,86 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	if (!xact_got_connection)
 		return;
 
+	/* Handle possible two-phase commit */
+	if (event == XACT_EVENT_PARALLEL_PRE_COMMIT || event == XACT_EVENT_PRE_COMMIT)
+	{
+		bool include_local_tx = false;
+
+		/* Should we take into account this node? */
+		if (TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+		{
+			include_local_tx = true;
+			fdwTransState->nparticipants += 1;
+		}
+
+		/* Switch to 2PC mode there were more than one participant */
+		if (UseGlobalSnapshots && fdwTransState->nparticipants > 1)
+			fdwTransState->two_phase_commit = true;
+
+		if (fdwTransState->two_phase_commit)
+		{
+			GlobalCSN	max_csn = InProgressGlobalCSN,
+						my_csn = InProgressGlobalCSN;
+			bool	res;
+			char   *sql;
+
+			fdwTransState->gid = psprintf("pgfdw:%lld:%llu:%d:%u:%d:%d",
+										  (long long) GetCurrentTimestamp(),
+										  (long long) GetSystemIdentifier(),
+										  MyProcPid,
+										  GetCurrentTransactionIdIfAny(),
+										  ++two_phase_xact_count,
+										  fdwTransState->nparticipants);
+
+			/* Broadcast PREPARE */
+			sql = psprintf("PREPARE TRANSACTION '%s'", fdwTransState->gid);
+			res = BroadcastCmd(sql);
+			if (!res)
+				goto error;
+
+			/* Broadcast pg_global_snapshot_prepare() */
+			if (include_local_tx)
+				my_csn = GlobalSnapshotPrepareCurrent();
+
+			sql = psprintf("SELECT pg_global_snapshot_prepare('%s')",
+														fdwTransState->gid);
+			res = BroadcastStmt(sql, PGRES_TUPLES_OK, MaxCsnCB, &max_csn);
+			if (!res)
+				goto error;
+
+			/* select maximal global csn */
+			if (include_local_tx && my_csn > max_csn)
+				max_csn = my_csn;
+
+			/* Broadcast pg_global_snapshot_assign() */
+			if (include_local_tx)
+				GlobalSnapshotAssignCsnCurrent(max_csn);
+			sql = psprintf("SELECT pg_global_snapshot_assign('%s',"UINT64_FORMAT")",
+							fdwTransState->gid, max_csn);
+			res = BroadcastFunc(sql);
+
+error:
+			if (!res)
+			{
+				sql = psprintf("ABORT PREPARED '%s'", fdwTransState->gid);
+				BroadcastCmd(sql);
+				elog(ERROR, "Failed to PREPARE transaction on remote node");
+			}
+
+			/*
+			 * Do not fall down. Consequent COMMIT event will clean thing up.
+			 */
+			return;
+		}
+	}
+
+	/* COMMIT open transaction of we were doing 2PC */
+	if (fdwTransState->two_phase_commit &&
+		(event == XACT_EVENT_PARALLEL_COMMIT || event == XACT_EVENT_COMMIT))
+	{
+		BroadcastCmd(psprintf("COMMIT PREPARED '%s'", fdwTransState->gid));
+	}
+
 	/*
 	 * Scan all connection cache entries to find open remote transactions, and
 	 * close them.
@@ -719,8 +942,6 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	hash_seq_init(&scan, ConnectionHash);
 	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
 	{
-		PGresult   *res;
-
 		/* Ignore cache entry if no open connection right now */
 		if (entry->conn == NULL)
 			continue;
@@ -737,6 +958,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 			{
 				case XACT_EVENT_PARALLEL_PRE_COMMIT:
 				case XACT_EVENT_PRE_COMMIT:
+					Assert(!fdwTransState->two_phase_commit);
 
 					/*
 					 * If abort cleanup previously failed for this connection,
@@ -749,28 +971,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					do_sql_command(entry->conn, "COMMIT TRANSACTION");
 					entry->changing_xact_state = false;
 
-					/*
-					 * If there were any errors in subtransactions, and we
-					 * made prepared statements, do a DEALLOCATE ALL to make
-					 * sure we get rid of all prepared statements. This is
-					 * annoying and not terribly bulletproof, but it's
-					 * probably not worth trying harder.
-					 *
-					 * DEALLOCATE ALL only exists in 8.3 and later, so this
-					 * constrains how old a server postgres_fdw can
-					 * communicate with.  We intentionally ignore errors in
-					 * the DEALLOCATE, so that we can hobble along to some
-					 * extent with older servers (leaking prepared statements
-					 * as we go; but we don't really support update operations
-					 * pre-8.3 anyway).
-					 */
-					if (entry->have_prep_stmt && entry->have_error)
-					{
-						res = PQexec(entry->conn, "DEALLOCATE ALL");
-						PQclear(res);
-					}
-					entry->have_prep_stmt = false;
-					entry->have_error = false;
+					deallocate_prepared_stmts(entry);
 					break;
 				case XACT_EVENT_PRE_PREPARE:
 
@@ -789,6 +990,11 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					break;
 				case XACT_EVENT_PARALLEL_COMMIT:
 				case XACT_EVENT_COMMIT:
+					if (fdwTransState->two_phase_commit)
+						deallocate_prepared_stmts(entry);
+					else /* Pre-commit should have closed the open transaction */
+						elog(ERROR, "missed cleaning up connection during pre-commit");
+					break;
 				case XACT_EVENT_PREPARE:
 					/* Pre-commit should have closed the open transaction */
 					elog(ERROR, "missed cleaning up connection during pre-commit");
@@ -884,6 +1090,38 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 
 	/* Also reset cursor numbering for next transaction */
 	cursor_number = 0;
+
+	/* Reset fdwTransState */
+	memset(fdwTransState, '\0', sizeof(FdwTransactionState));
+}
+
+/*
+ * If there were any errors in subtransactions, and we
+ * made prepared statements, do a DEALLOCATE ALL to make
+ * sure we get rid of all prepared statements. This is
+ * annoying and not terribly bulletproof, but it's
+ * probably not worth trying harder.
+ *
+ * DEALLOCATE ALL only exists in 8.3 and later, so this
+ * constrains how old a server postgres_fdw can
+ * communicate with.  We intentionally ignore errors in
+ * the DEALLOCATE, so that we can hobble along to some
+ * extent with older servers (leaking prepared statements
+ * as we go; but we don't really support update operations
+ * pre-8.3 anyway).
+ */
+static void
+deallocate_prepared_stmts(ConnCacheEntry *entry)
+{
+	PGresult   *res;
+
+	if (entry->have_prep_stmt && entry->have_error)
+	{
+		res = PQexec(entry->conn, "DEALLOCATE ALL");
+		PQclear(res);
+	}
+	entry->have_prep_stmt = false;
+	entry->have_error = false;
 }
 
 /*
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 9fc53cad68..03c5b0093a 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -301,6 +301,9 @@ typedef struct
 	List	   *already_used;	/* expressions already dealt with */
 } ec_member_foreign_arg;
 
+bool		UseGlobalSnapshots;
+void		_PG_init(void);
+
 /*
  * SQL functions
  */
@@ -6584,3 +6587,12 @@ find_em_expr_for_input_target(PlannerInfo *root,
 	elog(ERROR, "could not find pathkey item to sort");
 	return NULL;				/* keep compiler quiet */
 }
+
+void
+_PG_init(void)
+{
+	DefineCustomBoolVariable("postgres_fdw.use_global_snapshots",
+							 "Use global snapshots for FDW transactions", NULL,
+							 &UseGlobalSnapshots, false, PGC_USERSET, 0, NULL,
+							 NULL, NULL);
+}
diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h
index eef410db39..9d3ea077a1 100644
--- a/contrib/postgres_fdw/postgres_fdw.h
+++ b/contrib/postgres_fdw/postgres_fdw.h
@@ -208,4 +208,6 @@ extern const char *get_jointype_name(JoinType jointype);
 extern bool is_builtin(Oid objectId);
 extern bool is_shippable(Oid objectId, Oid classId, PgFdwRelationInfo *fpinfo);
 
+extern bool UseGlobalSnapshots;
+
 #endif							/* POSTGRES_FDW_H */
diff --git a/contrib/postgres_fdw/t/001_bank_coordinator.pl b/contrib/postgres_fdw/t/001_bank_coordinator.pl
new file mode 100644
index 0000000000..1e31f33349
--- /dev/null
+++ b/contrib/postgres_fdw/t/001_bank_coordinator.pl
@@ -0,0 +1,264 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+my $master = get_new_node("master");
+$master->init;
+$master->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	log_checkpoints = true
+	postgres_fdw.use_global_snapshots = on
+	track_global_snapshots = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$master->start;
+
+my $shard1 = get_new_node("shard1");
+$shard1->init;
+$shard1->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+));
+$shard1->start;
+
+my $shard2 = get_new_node("shard2");
+$shard2->init;
+$shard2->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+));
+$shard2->start;
+
+###############################################################################
+# Prepare nodes
+###############################################################################
+
+$master->safe_psql('postgres', qq[
+	CREATE EXTENSION postgres_fdw;
+	CREATE TABLE accounts(id integer primary key, amount integer);
+	CREATE TABLE global_transactions(tx_time timestamp);
+]);
+
+foreach my $node ($shard1, $shard2)
+{
+	my $port = $node->port;
+	my $host = $node->host;
+
+	$node->safe_psql('postgres',
+			"CREATE TABLE accounts(id integer primary key, amount integer)");
+
+	$master->safe_psql('postgres', qq[
+		CREATE SERVER shard_$port FOREIGN DATA WRAPPER postgres_fdw options(dbname 'postgres', host '$host', port '$port');
+		CREATE FOREIGN TABLE accounts_fdw_$port() inherits (accounts) server shard_$port options(table_name 'accounts');
+		CREATE USER MAPPING for CURRENT_USER SERVER shard_$port;
+	])
+}
+
+$shard1->safe_psql('postgres', qq[
+	insert into accounts select 2*id-1, 0 from generate_series(1, 10010) as id;
+	CREATE TABLE local_transactions(tx_time timestamp);
+]);
+
+$shard2->safe_psql('postgres', qq[
+	insert into accounts select 2*id, 0 from generate_series(1, 10010) as id;
+	CREATE TABLE local_transactions(tx_time timestamp);
+]);
+
+diag("master: @{[$master->connstr('postgres')]}");
+diag("shard1: @{[$shard1->connstr('postgres')]}");
+diag("shard2: @{[$shard2->connstr('postgres')]}");
+
+###############################################################################
+# pgbench scripts
+###############################################################################
+
+my $bank = File::Temp->new();
+append_to_file($bank, q{
+	\set id random(1, 20000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = :id RETURNING *)
+		INSERT into global_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (:id + 1);
+	COMMIT;
+});
+
+my $bank1 = File::Temp->new();
+append_to_file($bank1, q{
+	\set id random(1, 10000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = (2*:id + 1) RETURNING *)
+		INSERT into local_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (2*:id + 3);
+	COMMIT;
+});
+
+my $bank2 = File::Temp->new();
+append_to_file($bank2, q{
+	\set id random(1, 10000)
+
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = 2*:id RETURNING *)
+		INSERT into local_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (2*:id + 2);
+	COMMIT;
+});
+
+###############################################################################
+# Helpers
+###############################################################################
+
+sub count_and_delete_rows
+{
+	my ($node, $table) = @_;
+	my $count;
+
+	$count = $node->safe_psql('postgres',"select count(*) from $table");
+	$node->safe_psql('postgres',"delete from $table");
+	diag($node->name, ": completed $count transactions");
+	return $count;
+}
+
+###############################################################################
+# Concurrent global transactions
+###############################################################################
+
+my ($err, $rc);
+my $started;
+my $seconds = 30;
+my $selects;
+my $total = '0';
+my $oldtotal = '0';
+my $isolation_errors = 0;
+
+
+my $pgb_handle;
+
+$pgb_handle = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+while (time() - $started < $seconds)
+{
+	$total = $master->safe_psql('postgres', "select sum(amount) from accounts");
+	if ( ($total ne $oldtotal) and ($total ne '') )
+	{
+		$isolation_errors++;
+		$oldtotal = $total;
+		diag("Isolation error. Total = $total");
+	}
+	if ($total ne '') { $selects++; }
+}
+
+$master->pgbench_await($pgb_handle);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction');
+
+###############################################################################
+# Concurrent global and local transactions
+###############################################################################
+
+my ($pgb_handle1, $pgb_handle2, $pgb_handle3);
+
+# global txses
+$pgb_handle1 = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+# concurrent local
+$pgb_handle2 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank1, 'postgres' );
+$pgb_handle3 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank2, 'postgres' );
+
+$started = time();
+$selects = 0;
+$oldtotal = 0;
+while (time() - $started < $seconds)
+{
+	$total = $master->safe_psql('postgres', "select sum(amount) from accounts");
+	if ( ($total ne $oldtotal) and ($total ne '') )
+	{
+		$isolation_errors++;
+		$oldtotal = $total;
+		diag("Isolation error. Total = $total");
+	}
+	if ($total ne '') { $selects++; }
+}
+
+diag("selects = $selects");
+$master->pgbench_await($pgb_handle1);
+$shard1->pgbench_await($pgb_handle2);
+$shard2->pgbench_await($pgb_handle3);
+
+diag("completed $selects selects");
+die "" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard1, 'local_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'local_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global and local transactions');
+
+
+###############################################################################
+# Snapshot stability
+###############################################################################
+
+my ($hashes, $hash1, $hash2);
+my $stability_errors = 0;
+
+# global txses
+$pgb_handle1 = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+# concurrent local
+$pgb_handle2 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank1, 'postgres' );
+$pgb_handle3 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank2, 'postgres' );
+
+$selects = 0;
+$started = time();
+while (time() - $started < $seconds)
+{
+	foreach my $node ($master, $shard1, $shard2)
+	{
+		($hash1, $_, $hash2) = split "\n", $node->safe_psql('postgres', qq[
+			begin isolation level repeatable read;
+			select md5(array_agg((t.*)::text)::text) from (select * from accounts order by id) as t;
+			select pg_sleep(3);
+			select md5(array_agg((t.*)::text)::text) from (select * from accounts order by id) as t;
+			commit;
+		]);
+
+		if ($hash1 ne $hash2)
+		{
+			diag("oops");
+			$stability_errors++;
+		}
+		elsif ($hash1 eq '' or $hash2 eq '')
+		{
+			die;
+		}
+		else
+		{
+			$selects++;
+		}
+	}
+}
+
+$master->pgbench_await($pgb_handle1);
+$shard1->pgbench_await($pgb_handle2);
+$shard2->pgbench_await($pgb_handle3);
+
+die "" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard1, 'local_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'local_transactions') > 0);
+
+is($stability_errors, 0, 'snapshot is stable during concurrent global and local transactions');
+
+$master->stop;
+$shard1->stop;
+$shard2->stop;
diff --git a/contrib/postgres_fdw/t/002_bank_participant.pl b/contrib/postgres_fdw/t/002_bank_participant.pl
new file mode 100644
index 0000000000..04a2f1ba85
--- /dev/null
+++ b/contrib/postgres_fdw/t/002_bank_participant.pl
@@ -0,0 +1,240 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+my $shard1 = get_new_node("shard1");
+$shard1->init;
+$shard1->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	postgres_fdw.use_global_snapshots = on
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$shard1->start;
+
+my $shard2 = get_new_node("shard2");
+$shard2->init;
+$shard2->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	postgres_fdw.use_global_snapshots = on
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$shard2->start;
+
+###############################################################################
+# Prepare nodes
+###############################################################################
+
+my @shards = ($shard1, $shard2);
+
+foreach my $node (@shards)
+{
+	$node->safe_psql('postgres', qq[
+		CREATE EXTENSION postgres_fdw;
+		CREATE TABLE accounts(id integer primary key, amount integer);
+		CREATE TABLE accounts_local() inherits(accounts);
+		CREATE TABLE global_transactions(tx_time timestamp);
+		CREATE TABLE local_transactions(tx_time timestamp);
+	]);
+
+	foreach my $neighbor (@shards)
+	{
+		next if ($neighbor eq $node);
+
+		my $port = $neighbor->port;
+		my $host = $neighbor->host;
+
+		$node->safe_psql('postgres', qq[
+			CREATE SERVER shard_$port FOREIGN DATA WRAPPER postgres_fdw
+					options(dbname 'postgres', host '$host', port '$port');
+			CREATE FOREIGN TABLE accounts_fdw_$port() inherits (accounts)
+					server shard_$port options(table_name 'accounts_local');
+			CREATE USER MAPPING for CURRENT_USER SERVER shard_$port;
+		]);
+	}
+}
+
+$shard1->psql('postgres', "insert into accounts_local select 2*id-1, 0 from generate_series(1, 10010) as id;");
+$shard2->psql('postgres', "insert into accounts_local select 2*id,   0 from generate_series(1, 10010) as id;");
+
+###############################################################################
+# pgbench scripts
+###############################################################################
+
+my $bank = File::Temp->new();
+append_to_file($bank, q{
+	\set id random(1, 20000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = :id RETURNING *)
+		INSERT into global_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (:id + 1);
+	COMMIT;
+});
+
+###############################################################################
+# Helpers
+###############################################################################
+
+sub count_and_delete_rows
+{
+	my ($node, $table) = @_;
+	my $count;
+
+	$count = $node->safe_psql('postgres',"select count(*) from $table");
+	$node->safe_psql('postgres',"delete from $table");
+	diag($node->name, ": completed $count transactions");
+	return $count;
+}
+
+###############################################################################
+# Concurrent global transactions
+###############################################################################
+
+my ($err, $rc);
+my $started;
+my $seconds = 30;
+my $selects;
+my $total = '0';
+my $oldtotal = '0';
+my $isolation_errors = 0;
+my $i;
+
+
+my ($pgb_handle1, $pgb_handle2);
+
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction');
+
+###############################################################################
+# And do the same after soft restart
+###############################################################################
+
+$shard1->restart;
+$shard2->restart;
+$shard1->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard1 to became online";
+$shard2->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard2 to became online";
+
+$seconds = 15;
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction after restart');
+
+###############################################################################
+# And do the same after hard restart
+###############################################################################
+
+$shard1->teardown_node;
+$shard2->teardown_node;
+$shard1->start;
+$shard2->start;
+$shard1->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard1 to became online";
+$shard2->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard2 to became online";
+
+
+$seconds = 15;
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction after hard restart');
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 1d5450758e..ef4472170c 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -2115,6 +2115,41 @@ sub pg_recvlogical_upto
 	}
 }
 
+sub pgbench()
+{
+	my ($self, $node, @args) = @_;
+	my $pgbench_handle = $self->pgbench_async($node, @args);
+	$self->pgbench_await($pgbench_handle);
+}
+
+sub pgbench_async()
+{
+	my ($self, @args) = @_;
+
+	my ($in, $out, $err, $rc);
+	$in = '';
+	$out = '';
+
+	my @pgbench_command = (
+		'pgbench',
+		-h => $self->host,
+		-p => $self->port,
+		@args
+	);
+	my $handle = IPC::Run::start(\@pgbench_command, $in, $out);
+	return $handle;
+}
+
+sub pgbench_await()
+{
+	my ($self, $pgbench_handle) = @_;
+
+	# During run some pgbench threads can exit (for example due to
+	# serialization error). That will set non-zero returning code.
+	# So don't check return code here and leave it to a caller.
+	my $rc = IPC::Run::finish($pgbench_handle);
+}
+
 =pod
 
 =back
-- 
2.17.1

Fujii Masao

masao.fujii@oss.nttdata.com

over 5 years ago

In reply to: Andrey Lepikhov (#1)

On 2020/05/12 19:24, Andrey Lepikhov wrote:

Rebased onto current master (fb544735f1).

Thanks for the patches!

These patches are no longer applied cleanly and caused the compilation failure.
So could you rebase and update them?

The patches seem not to be registered in CommitFest yet.
Are you planning to do that?

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

Andrey V. Lepikhov

a.lepikhov@postgrespro.ru

over 5 years ago

In reply to: Fujii Masao (#2)

3 attachment(s)

On 09.06.2020 11:41, Fujii Masao wrote:

On 2020/05/12 19:24, Andrey Lepikhov wrote:

Rebased onto current master (fb544735f1).

Thanks for the patches!

These patches are no longer applied cleanly and caused the compilation
failure.
So could you rebase and update them?

Rebased onto 57cb806308 (see attachment).

The patches seem not to be registered in CommitFest yet.
Are you planning to do that?

Not now. It is a sharding-related feature. I'm not sure that this
approach is fully consistent with the sharding way now.

--
Andrey Lepikhov
Postgres Professional
https://postgrespro.com

Attachments:

0001-GlobalCSNLog-SLRU.patchtext/x-patch; charset=UTF-8; name=0001-GlobalCSNLog-SLRU.patchDownload

From cd6a8585f9814b7e465abb2649ac84e80e7c726b Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Tue, 9 Jun 2020 14:55:38 +0500
Subject: [PATCH 1/3] GlobalCSNLog-SLRU

---
 src/backend/access/transam/Makefile         |   1 +
 src/backend/access/transam/global_csn_log.c | 439 ++++++++++++++++++++
 src/backend/access/transam/twophase.c       |   1 +
 src/backend/access/transam/varsup.c         |   2 +
 src/backend/access/transam/xlog.c           |  12 +
 src/backend/storage/ipc/ipci.c              |   3 +
 src/backend/storage/ipc/procarray.c         |   3 +
 src/backend/storage/lmgr/lwlocknames.txt    |   1 +
 src/backend/tcop/postgres.c                 |   1 +
 src/backend/utils/misc/guc.c                |   9 +
 src/backend/utils/probes.d                  |   2 +
 src/bin/initdb/initdb.c                     |   3 +-
 src/include/access/global_csn_log.h         |  30 ++
 src/include/storage/lwlock.h                |   1 +
 src/include/utils/snapshot.h                |   3 +
 15 files changed, 510 insertions(+), 1 deletion(-)
 create mode 100644 src/backend/access/transam/global_csn_log.c
 create mode 100644 src/include/access/global_csn_log.h

diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 595e02de72..60ff8b141e 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -15,6 +15,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	clog.o \
 	commit_ts.o \
+	global_csn_log.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/global_csn_log.c b/src/backend/access/transam/global_csn_log.c
new file mode 100644
index 0000000000..6f7fded350
--- /dev/null
+++ b/src/backend/access/transam/global_csn_log.c
@@ -0,0 +1,439 @@
+/*-----------------------------------------------------------------------------
+ *
+ * global_csn_log.c
+ *		Track global commit sequence numbers of finished transactions
+ *
+ * Implementation of cross-node transaction isolation relies on commit sequence
+ * number (CSN) based visibility rules.  This module provides SLRU to store
+ * CSN for each transaction.  This mapping need to be kept only for xid's
+ * greater then oldestXid, but that can require arbitrary large amounts of
+ * memory in case of long-lived transactions.  Because of same lifetime and
+ * persistancy requirements this module is quite similar to subtrans.c
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/global_csn_log.c
+ *
+ *-----------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/global_csn_log.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/snapmgr.h"
+
+bool track_global_snapshots;
+
+/*
+ * Defines for GlobalCSNLog page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * GlobalCSNLog page numbering also wraps around at
+ * 0xFFFFFFFF/GLOBAL_CSN_LOG_XACTS_PER_PAGE, and GlobalCSNLog segment numbering at
+ * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateGlobalCSNLog (see GlobalCSNLogPagePrecedes).
+ */
+
+/* We store the commit GlobalCSN for each xid */
+#define GCSNLOG_XACTS_PER_PAGE (BLCKSZ / sizeof(GlobalCSN))
+
+#define TransactionIdToPage(xid)	((xid) / (TransactionId) GCSNLOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) GCSNLOG_XACTS_PER_PAGE)
+
+/*
+ * Link to shared-memory data structures for CLOG control
+ */
+static SlruCtlData GlobalCSNLogCtlData;
+#define GlobalCsnlogCtl (&GlobalCSNLogCtlData)
+
+static int	ZeroGlobalCSNLogPage(int pageno);
+static bool GlobalCSNLogPagePrecedes(int page1, int page2);
+static void GlobalCSNLogSetPageStatus(TransactionId xid, int nsubxids,
+									  TransactionId *subxids,
+									  GlobalCSN csn, int pageno);
+static void GlobalCSNLogSetCSNInSlot(TransactionId xid, GlobalCSN csn,
+									  int slotno);
+
+/*
+ * GlobalCSNLogSetCSN
+ *
+ * Record GlobalCSN of transaction and its subtransaction tree.
+ *
+ * xid is a single xid to set status for. This will typically be the top level
+ * transactionid for a top level commit or abort. It can also be a
+ * subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * csn is the commit sequence number of the transaction. It should be
+ * AbortedGlobalCSN for abort cases.
+ */
+void
+GlobalCSNLogSetCSN(TransactionId xid, int nsubxids,
+					 TransactionId *subxids, GlobalCSN csn)
+{
+	int			pageno;
+	int			i = 0;
+	int			offset = 0;
+
+	/* Callers of GlobalCSNLogSetCSN() must check GUC params */
+	Assert(track_global_snapshots);
+
+	Assert(TransactionIdIsValid(xid));
+
+	pageno = TransactionIdToPage(xid);		/* get page of parent */
+	for (;;)
+	{
+		int			num_on_page = 0;
+
+		while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno)
+		{
+			num_on_page++;
+			i++;
+		}
+
+		GlobalCSNLogSetPageStatus(xid,
+							num_on_page, subxids + offset,
+							csn, pageno);
+		if (i >= nsubxids)
+			break;
+
+		offset = i;
+		pageno = TransactionIdToPage(subxids[offset]);
+		xid = InvalidTransactionId;
+	}
+}
+
+/*
+ * Record the final state of transaction entries in the csn log for
+ * all entries on a single page.  Atomic only on this page.
+ *
+ * Otherwise API is same as TransactionIdSetTreeStatus()
+ */
+static void
+GlobalCSNLogSetPageStatus(TransactionId xid, int nsubxids,
+						   TransactionId *subxids,
+						   GlobalCSN csn, int pageno)
+{
+	int			slotno;
+	int			i;
+
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	slotno = SimpleLruReadPage(GlobalCsnlogCtl, pageno, true, xid);
+
+	/* Subtransactions first, if needed ... */
+	for (i = 0; i < nsubxids; i++)
+	{
+		Assert(GlobalCsnlogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+		GlobalCSNLogSetCSNInSlot(subxids[i],	csn, slotno);
+	}
+
+	/* ... then the main transaction */
+	if (TransactionIdIsValid(xid))
+		GlobalCSNLogSetCSNInSlot(xid, csn, slotno);
+
+	GlobalCsnlogCtl->shared->page_dirty[slotno] = true;
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ */
+static void
+GlobalCSNLogSetCSNInSlot(TransactionId xid, GlobalCSN csn, int slotno)
+{
+	int			entryno = TransactionIdToPgIndex(xid);
+	GlobalCSN *ptr;
+
+	Assert(LWLockHeldByMe(GlobalCSNLogControlLock));
+
+	ptr = (GlobalCSN *) (GlobalCsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr));
+
+	*ptr = csn;
+}
+
+/*
+ * Interrogate the state of a transaction in the log.
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; TransactionIdGetGlobalCSN() in global_snapshot.c is the
+ * intended caller.
+ */
+GlobalCSN
+GlobalCSNLogGetCSN(TransactionId xid)
+{
+	int			pageno = TransactionIdToPage(xid);
+	int			entryno = TransactionIdToPgIndex(xid);
+	int			slotno;
+	GlobalCSN *ptr;
+	GlobalCSN	global_csn;
+
+	/* Callers of GlobalCSNLogGetCSN() must check GUC params */
+	Assert(track_global_snapshots);
+
+	/* Can't ask about stuff that might not be around anymore */
+	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
+
+	/* lock is acquired by SimpleLruReadPage_ReadOnly */
+
+	slotno = SimpleLruReadPage_ReadOnly(GlobalCsnlogCtl, pageno, xid);
+	ptr = (GlobalCSN *) (GlobalCsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr));
+	global_csn = *ptr;
+
+	LWLockRelease(GlobalCSNLogControlLock);
+
+	return global_csn;
+}
+
+/*
+ * Number of shared GlobalCSNLog buffers.
+ */
+static Size
+GlobalCSNLogShmemBuffers(void)
+{
+	return Min(32, Max(4, NBuffers / 512));
+}
+
+/*
+ * Reserve shared memory for GlobalCsnlogCtl.
+ */
+Size
+GlobalCSNLogShmemSize(void)
+{
+	if (!track_global_snapshots)
+		return 0;
+
+	return SimpleLruShmemSize(GlobalCSNLogShmemBuffers(), 0);
+}
+
+/*
+ * Initialization of shared memory for GlobalCSNLog.
+ */
+void
+GlobalCSNLogShmemInit(void)
+{
+	if (!track_global_snapshots)
+		return;
+
+	GlobalCsnlogCtl->PagePrecedes = GlobalCSNLogPagePrecedes;
+	SimpleLruInit(GlobalCsnlogCtl, "GlobalCSNLog Ctl", GlobalCSNLogShmemBuffers(), 0,
+				  GlobalCSNLogControlLock, "pg_global_csn", LWTRANCHE_GLOBAL_CSN_LOG_BUFFERS);
+}
+
+/*
+ * This func must be called ONCE on system install.  It creates the initial
+ * GlobalCSNLog segment.  The pg_global_csn directory is assumed to have been
+ * created by initdb, and GlobalCSNLogShmemInit must have been called already.
+ */
+void
+BootStrapGlobalCSNLog(void)
+{
+	int			slotno;
+
+	if (!track_global_snapshots)
+		return;
+
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	/* Create and zero the first page of the commit log */
+	slotno = ZeroGlobalCSNLogPage(0);
+
+	/* Make sure it's written out */
+	SimpleLruWritePage(GlobalCsnlogCtl, slotno);
+	Assert(!GlobalCsnlogCtl->shared->page_dirty[slotno]);
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * Initialize (or reinitialize) a page of GlobalCSNLog to zeroes.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroGlobalCSNLogPage(int pageno)
+{
+	Assert(LWLockHeldByMe(GlobalCSNLogControlLock));
+	return SimpleLruZeroPage(GlobalCsnlogCtl, pageno);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ *
+ * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
+ * if there are none.
+ */
+void
+StartupGlobalCSNLog(TransactionId oldestActiveXID)
+{
+	int			startPage;
+	int			endPage;
+
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * Since we don't expect pg_global_csn to be valid across crashes, we
+	 * initialize the currently-active page(s) to zeroes during startup.
+	 * Whenever we advance into a new page, ExtendGlobalCSNLog will likewise
+	 * zero the new page without regard to whatever was previously on disk.
+	 */
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	startPage = TransactionIdToPage(oldestActiveXID);
+	endPage = TransactionIdToPage(XidFromFullTransactionId(ShmemVariableCache->nextFullXid));
+
+	while (startPage != endPage)
+	{
+		(void) ZeroGlobalCSNLogPage(startPage);
+		startPage++;
+		/* must account for wraparound */
+		if (startPage > TransactionIdToPage(MaxTransactionId))
+			startPage = 0;
+	}
+	(void) ZeroGlobalCSNLogPage(startPage);
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend shutdown
+ */
+void
+ShutdownGlobalCSNLog(void)
+{
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * Flush dirty GlobalCSNLog pages to disk.
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely as a debugging aid.
+	 */
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_START(false);
+	SimpleLruFlush(GlobalCsnlogCtl, false);
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_DONE(false);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointGlobalCSNLog(void)
+{
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * Flush dirty GlobalCSNLog pages to disk.
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely to improve the odds that writing of dirty pages is done by
+	 * the checkpoint process and not by backends.
+	 */
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_START(true);
+	SimpleLruFlush(GlobalCsnlogCtl, true);
+	TRACE_POSTGRESQL_GLOBALCSNLOG_CHECKPOINT_DONE(true);
+}
+
+/*
+ * Make sure that GlobalCSNLog has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty clog or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendGlobalCSNLog(TransactionId newestXact)
+{
+	int			pageno;
+
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * No work except at first XID of a page.  But beware: just after
+	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
+	 */
+	if (TransactionIdToPgIndex(newestXact) != 0 &&
+		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
+		return;
+
+	pageno = TransactionIdToPage(newestXact);
+
+	LWLockAcquire(GlobalCSNLogControlLock, LW_EXCLUSIVE);
+
+	/* Zero the page and make an XLOG entry about it */
+	ZeroGlobalCSNLogPage(pageno);
+
+	LWLockRelease(GlobalCSNLogControlLock);
+}
+
+/*
+ * Remove all GlobalCSNLog segments before the one holding the passed
+ * transaction ID.
+ *
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest TransactionXmin of any running transaction.
+ */
+void
+TruncateGlobalCSNLog(TransactionId oldestXact)
+{
+	int			cutoffPage;
+
+	if (!track_global_snapshots)
+		return;
+
+	/*
+	 * The cutoff point is the start of the segment containing oldestXact. We
+	 * pass the *page* containing oldestXact to SimpleLruTruncate. We step
+	 * back one transaction to avoid passing a cutoff page that hasn't been
+	 * created yet in the rare case that oldestXact would be the first item on
+	 * a page and oldestXact == next XID.  In that case, if we didn't subtract
+	 * one, we'd trigger SimpleLruTruncate's wraparound detection.
+	 */
+	TransactionIdRetreat(oldestXact);
+	cutoffPage = TransactionIdToPage(oldestXact);
+
+	SimpleLruTruncate(GlobalCsnlogCtl, cutoffPage);
+}
+
+/*
+ * Decide which of two GlobalCSNLog page numbers is "older" for truncation
+ * purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */
+static bool
+GlobalCSNLogPagePrecedes(int page1, int page2)
+{
+	TransactionId xid1;
+	TransactionId xid2;
+
+	xid1 = ((TransactionId) page1) * GCSNLOG_XACTS_PER_PAGE;
+	xid1 += FirstNormalTransactionId;
+	xid2 = ((TransactionId) page2) * GCSNLOG_XACTS_PER_PAGE;
+	xid2 += FirstNormalTransactionId;
+
+	return TransactionIdPrecedes(xid1, xid2);
+}
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index e1904877fa..9a69fc1e09 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index e14b53bf9e..3de3d99683 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -15,6 +15,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -173,6 +174,7 @@ GetNewTransactionId(bool isSubXact)
 	 * Extend pg_subtrans and pg_commit_ts too.
 	 */
 	ExtendCLOG(xid);
+	ExtendGlobalCSNLog(xid);
 	ExtendCommitTs(xid);
 	ExtendSUBTRANS(xid);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 55cac186dc..4ffe4aad03 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -24,6 +24,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/heaptoast.h"
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
@@ -5342,6 +5343,7 @@ BootStrapXLOG(void)
 
 	/* Bootstrap the commit log, too */
 	BootStrapCLOG();
+	BootStrapGlobalCSNLog();
 	BootStrapCommitTs();
 	BootStrapSUBTRANS();
 	BootStrapMultiXact();
@@ -7059,6 +7061,7 @@ StartupXLOG(void)
 			 * maintained during recovery and need not be started yet.
 			 */
 			StartupCLOG();
+			StartupGlobalCSNLog(oldestActiveXID);
 			StartupSUBTRANS(oldestActiveXID);
 
 			/*
@@ -7876,6 +7879,7 @@ StartupXLOG(void)
 	if (standbyState == STANDBY_DISABLED)
 	{
 		StartupCLOG();
+		StartupGlobalCSNLog(oldestActiveXID);
 		StartupSUBTRANS(oldestActiveXID);
 	}
 
@@ -8523,6 +8527,7 @@ ShutdownXLOG(int code, Datum arg)
 		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
 	}
 	ShutdownCLOG();
+	ShutdownGlobalCSNLog();
 	ShutdownCommitTs();
 	ShutdownSUBTRANS();
 	ShutdownMultiXact();
@@ -9095,7 +9100,10 @@ CreateCheckPoint(int flags)
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
 	if (!RecoveryInProgress())
+	{
 		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+		TruncateGlobalCSNLog(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+	}
 
 	/* Real work is done, but log and update stats before releasing lock. */
 	LogCheckpointEnd(false);
@@ -9171,6 +9179,7 @@ static void
 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
 	CheckPointCLOG();
+	CheckPointGlobalCSNLog();
 	CheckPointCommitTs();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
@@ -9455,7 +9464,10 @@ CreateRestartPoint(int flags)
 	 * this because StartupSUBTRANS hasn't been called yet.
 	 */
 	if (EnableHotStandby)
+	{
 		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+		TruncateGlobalCSNLog(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+	}
 
 	/* Real work is done, but log and update before releasing lock. */
 	LogCheckpointEnd(true);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 427b0d59cd..dc2d2959c4 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -16,6 +16,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/global_csn_log.h"
 #include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -125,6 +126,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, ProcGlobalShmemSize());
 		size = add_size(size, XLOGShmemSize());
 		size = add_size(size, CLOGShmemSize());
+		size = add_size(size, GlobalCSNLogShmemSize());
 		size = add_size(size, CommitTsShmemSize());
 		size = add_size(size, SUBTRANSShmemSize());
 		size = add_size(size, TwoPhaseShmemSize());
@@ -213,6 +215,7 @@ CreateSharedMemoryAndSemaphores(void)
 	 */
 	XLOGShmemInit();
 	CLOGShmemInit();
+	GlobalCSNLogShmemInit();
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 3c2b369615..486da77f68 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -46,6 +46,7 @@
 #include <signal.h>
 
 #include "access/clog.h"
+#include "access/global_csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -833,6 +834,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
 	{
 		ExtendSUBTRANS(latestObservedXid);
+		ExtendGlobalCSNLog(latestObservedXid);
 		TransactionIdAdvance(latestObservedXid);
 	}
 	TransactionIdRetreat(latestObservedXid);	/* = running->nextXid - 1 */
@@ -3335,6 +3337,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
 		while (TransactionIdPrecedes(next_expected_xid, xid))
 		{
 			TransactionIdAdvance(next_expected_xid);
+			ExtendGlobalCSNLog(next_expected_xid);
 			ExtendSUBTRANS(next_expected_xid);
 		}
 		Assert(next_expected_xid == xid);
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index e6985e8eed..aa904b1f17 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,4 @@ MultiXactTruncationLock				41
 OldSnapshotTimeMapLock				42
 LogicalRepWorkerLock				43
 XactTruncationLock					44
+GlobalCSNLogControlLock				45
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index c9424f167c..9fec9dcd59 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -42,6 +42,7 @@
 #include "catalog/pg_type.h"
 #include "commands/async.h"
 #include "commands/prepare.h"
+#include "common/hashfn.h"
 #include "executor/spi.h"
 #include "jit/jit.h"
 #include "libpq/libpq.h"
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2f3e0a70e0..4910e4fc66 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1172,6 +1172,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_global_snapshots", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable global snapshot tracking."),
+			gettext_noop("Used to achieve REPEATEBLE READ isolation level for postgres_fdw transactions.")
+		},
+		&track_global_snapshots,
+		true, /* XXX: set true to simplify tesing. XXX2: Seems that RESOURCES_MEM isn't the best catagory */
+		NULL, NULL, NULL
+	},
 	{
 		{"ssl", PGC_SIGHUP, CONN_AUTH_SSL,
 			gettext_noop("Enables SSL connections."),
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d
index a0b0458108..f900e7f3b4 100644
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -77,6 +77,8 @@ provider postgresql {
 	probe clog__checkpoint__done(bool);
 	probe subtrans__checkpoint__start(bool);
 	probe subtrans__checkpoint__done(bool);
+	probe globalcsnlog__checkpoint__start(bool);
+	probe globalcsnlog__checkpoint__done(bool);
 	probe multixact__checkpoint__start(bool);
 	probe multixact__checkpoint__done(bool);
 	probe twophase__checkpoint__start();
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 4ff0c6c700..68c44d3f70 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -220,7 +220,8 @@ static const char *const subdirs[] = {
 	"pg_xact",
 	"pg_logical",
 	"pg_logical/snapshots",
-	"pg_logical/mappings"
+	"pg_logical/mappings",
+	"pg_global_csn"
 };
 
 
diff --git a/src/include/access/global_csn_log.h b/src/include/access/global_csn_log.h
new file mode 100644
index 0000000000..618edfc691
--- /dev/null
+++ b/src/include/access/global_csn_log.h
@@ -0,0 +1,30 @@
+/*
+ * global_csn_log.h
+ *
+ * Commit-Sequence-Number log.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/global_csn_log.h
+ */
+#ifndef CSNLOG_H
+#define CSNLOG_H
+
+#include "access/xlog.h"
+#include "utils/snapshot.h"
+
+extern void GlobalCSNLogSetCSN(TransactionId xid, int nsubxids,
+							   TransactionId *subxids, GlobalCSN csn);
+extern GlobalCSN GlobalCSNLogGetCSN(TransactionId xid);
+
+extern Size GlobalCSNLogShmemSize(void);
+extern void GlobalCSNLogShmemInit(void);
+extern void BootStrapGlobalCSNLog(void);
+extern void StartupGlobalCSNLog(TransactionId oldestActiveXID);
+extern void ShutdownGlobalCSNLog(void);
+extern void CheckPointGlobalCSNLog(void);
+extern void ExtendGlobalCSNLog(TransactionId newestXact);
+extern void TruncateGlobalCSNLog(TransactionId oldestXact);
+
+#endif   /* CSNLOG_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index c04ae97148..0d56f6de61 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -197,6 +197,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS,
 	LWTRANCHE_COMMITTS_BUFFER,
 	LWTRANCHE_SUBTRANS_BUFFER,
+	LWTRANCHE_GLOBAL_CSN_LOG_BUFFERS,
 	LWTRANCHE_MULTIXACTOFFSET_BUFFER,
 	LWTRANCHE_MULTIXACTMEMBER_BUFFER,
 	LWTRANCHE_NOTIFY_BUFFER,
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 4796edb63a..57d2dfaa67 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -20,6 +20,9 @@
 #include "storage/buf.h"
 
 
+typedef uint64 GlobalCSN;
+extern bool track_global_snapshots;
+
 /*
  * The different snapshot types.  We use SnapshotData structures to represent
  * both "regular" (MVCC) snapshots and "special" snapshots that have non-MVCC
-- 
2.25.1

0002-Global-snapshots.patchtext/x-patch; charset=UTF-8; name=0002-Global-snapshots.patchDownload

From 1e27ac62763e112810db7b6279d0862e962db403 Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Tue, 9 Jun 2020 15:02:39 +0500
Subject: [PATCH 2/3] Global-snapshots

---
 src/backend/access/transam/Makefile           |   1 +
 src/backend/access/transam/global_snapshot.c  | 755 ++++++++++++++++++
 src/backend/access/transam/twophase.c         | 156 ++++
 src/backend/access/transam/xact.c             |  29 +
 src/backend/access/transam/xlog.c             |   2 +
 src/backend/storage/ipc/ipci.c                |   3 +
 src/backend/storage/ipc/procarray.c           |  92 ++-
 src/backend/storage/lmgr/lwlocknames.txt      |   1 +
 src/backend/storage/lmgr/proc.c               |   5 +
 src/backend/utils/misc/guc.c                  |  13 +-
 src/backend/utils/misc/postgresql.conf.sample |   2 +
 src/backend/utils/time/snapmgr.c              | 167 +++-
 src/include/access/global_snapshot.h          |  72 ++
 src/include/access/twophase.h                 |   1 +
 src/include/catalog/pg_proc.dat               |  13 +
 src/include/datatype/timestamp.h              |   3 +
 src/include/fmgr.h                            |   1 +
 src/include/portability/instr_time.h          |  10 +
 src/include/storage/proc.h                    |  15 +
 src/include/storage/procarray.h               |   8 +
 src/include/utils/snapmgr.h                   |   3 +
 src/include/utils/snapshot.h                  |   8 +
 22 files changed, 1353 insertions(+), 7 deletions(-)
 create mode 100644 src/backend/access/transam/global_snapshot.c
 create mode 100644 src/include/access/global_snapshot.h

diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 60ff8b141e..6de567a79b 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -16,6 +16,7 @@ OBJS = \
 	clog.o \
 	commit_ts.o \
 	global_csn_log.o \
+	global_snapshot.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/global_snapshot.c b/src/backend/access/transam/global_snapshot.c
new file mode 100644
index 0000000000..bac16828bb
--- /dev/null
+++ b/src/backend/access/transam/global_snapshot.c
@@ -0,0 +1,755 @@
+/*-------------------------------------------------------------------------
+ *
+ * global_snapshot.c
+ *		Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/global_snapshot.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/global_csn_log.h"
+#include "access/global_snapshot.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "portability/instr_time.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/snapmgr.h"
+#include "miscadmin.h"
+
+/* Raise a warning if imported global_csn exceeds ours by this value. */
+#define SNAP_DESYNC_COMPLAIN (1*NSECS_PER_SEC) /* 1 second */
+
+/*
+ * GlobalSnapshotState
+ *
+ * Do not trust local clocks to be strictly monotonical and save last acquired
+ * value so later we can compare next timestamp with it. Accessed through
+ * GlobalSnapshotGenerate() and GlobalSnapshotSync().
+ */
+typedef struct
+{
+	GlobalCSN		 last_global_csn;
+	volatile slock_t lock;
+} GlobalSnapshotState;
+
+static GlobalSnapshotState *gsState;
+
+
+/*
+ * GUC to delay advance of oldestXid for this amount of time. Also determines
+ * the size GlobalSnapshotXidMap circular buffer.
+ */
+int global_snapshot_defer_time;
+
+/*
+ * Enables this module.
+ */
+extern bool track_global_snapshots;
+
+/*
+ * GlobalSnapshotXidMap
+ *
+ * To be able to install global snapshot that points to past we need to keep
+ * old versions of tuples and therefore delay advance of oldestXid.  Here we
+ * keep track of correspondence between snapshot's global_csn and oldestXid
+ * that was set at the time when the snapshot was taken.  Much like the
+ * snapshot too old's OldSnapshotControlData does, but with finer granularity
+ * to seconds.
+ *
+ * Different strategies can be employed to hold oldestXid (e.g. we can track
+ * oldest global_csn-based snapshot among cluster nodes and map it oldestXid
+ * on each node) but here implemented one that tries to avoid cross-node
+ * communications which are tricky in case of postgres_fdw.
+ *
+ * On each snapshot acquisition GlobalSnapshotMapXmin() is called and stores
+ * correspondence between current global_csn and oldestXmin in a sparse way:
+ * global_csn is rounded to seconds (and here we use the fact that global_csn
+ * is just a timestamp) and oldestXmin is stored in the circular buffer where
+ * rounded global_csn acts as an offset from current circular buffer head.
+ * Size of the circular buffer is controlled by global_snapshot_defer_time GUC.
+ *
+ * When global snapshot arrives from different node we check that its
+ * global_csn is still in our map, otherwise we'll error out with "snapshot too
+ * old" message.  If global_csn is successfully mapped to oldestXid we move
+ * backend's pgxact->xmin to proc->originalXmin and fill pgxact->xmin to
+ * mapped oldestXid.  That way GetOldestXmin() can take into account backends
+ * with imported global snapshot and old tuple versions will be preserved.
+ *
+ * Also while calculating oldestXmin for our map in presence of imported
+ * global snapshots we should use proc->originalXmin instead of pgxact->xmin
+ * that was set during import.  Otherwise, we can create a feedback loop:
+ * xmin's of imported global snapshots were calculated using our map and new
+ * entries in map going to be calculated based on that xmin's, and there is
+ * a risk to stuck forever with one non-increasing oldestXmin.  All other
+ * callers of GetOldestXmin() are using pgxact->xmin so the old tuple versions
+ * are preserved.
+ */
+typedef struct GlobalSnapshotXidMap
+{
+	int				 head;				/* offset of current freshest value */
+	int				 size;				/* total size of circular buffer */
+	GlobalCSN_atomic last_csn_seconds;	/* last rounded global_csn that changed
+										 * xmin_by_second[] */
+	TransactionId   *xmin_by_second;	/* circular buffer of oldestXmin's */
+}
+GlobalSnapshotXidMap;
+
+static GlobalSnapshotXidMap *gsXidMap;
+
+
+/* Estimate shared memory space needed */
+Size
+GlobalSnapshotShmemSize(void)
+{
+	Size	size = 0;
+
+	if (track_global_snapshots || global_snapshot_defer_time > 0)
+	{
+		size += MAXALIGN(sizeof(GlobalSnapshotState));
+	}
+
+	if (global_snapshot_defer_time > 0)
+	{
+		size += sizeof(GlobalSnapshotXidMap);
+		size += global_snapshot_defer_time*sizeof(TransactionId);
+		size = MAXALIGN(size);
+	}
+
+	return size;
+}
+
+/* Init shared memory structures */
+void
+GlobalSnapshotShmemInit()
+{
+	bool found;
+
+	if (track_global_snapshots || global_snapshot_defer_time > 0)
+	{
+		gsState = ShmemInitStruct("gsState",
+								sizeof(GlobalSnapshotState),
+								&found);
+		if (!found)
+		{
+			gsState->last_global_csn = 0;
+			SpinLockInit(&gsState->lock);
+		}
+	}
+
+	if (global_snapshot_defer_time > 0)
+	{
+		gsXidMap = ShmemInitStruct("gsXidMap",
+								   sizeof(GlobalSnapshotXidMap),
+								   &found);
+		if (!found)
+		{
+			int i;
+
+			pg_atomic_init_u64(&gsXidMap->last_csn_seconds, 0);
+			gsXidMap->head = 0;
+			gsXidMap->size = global_snapshot_defer_time;
+			gsXidMap->xmin_by_second =
+							ShmemAlloc(sizeof(TransactionId)*gsXidMap->size);
+
+			for (i = 0; i < gsXidMap->size; i++)
+				gsXidMap->xmin_by_second[i] = InvalidTransactionId;
+		}
+	}
+}
+
+/*
+ * GlobalSnapshotStartup
+ *
+ * Set gsXidMap entries to oldestActiveXID during startup.
+ */
+void
+GlobalSnapshotStartup(TransactionId oldestActiveXID)
+{
+	/*
+	 * Run only if we have initialized shared memory and gsXidMap
+	 * is enabled.
+	 */
+	if (IsNormalProcessingMode() &&
+		track_global_snapshots && global_snapshot_defer_time > 0)
+	{
+		int i;
+
+		Assert(TransactionIdIsValid(oldestActiveXID));
+		for (i = 0; i < gsXidMap->size; i++)
+			gsXidMap->xmin_by_second[i] = oldestActiveXID;
+		ProcArraySetGlobalSnapshotXmin(oldestActiveXID);
+	}
+}
+
+/*
+ * GlobalSnapshotMapXmin
+ *
+ * Maintain circular buffer of oldestXmins for several seconds in past. This
+ * buffer allows to shift oldestXmin in the past when backend is importing
+ * global transaction. Otherwise old versions of tuples that were needed for
+ * this transaction can be recycled by other processes (vacuum, HOT, etc).
+ *
+ * Locking here is not trivial. Called upon each snapshot creation after
+ * ProcArrayLock is released. Such usage creates several race conditions. It
+ * is possible that backend who got global_csn called GlobalSnapshotMapXmin()
+ * only after other backends managed to get snapshot and complete
+ * GlobalSnapshotMapXmin() call, or even committed. This is safe because
+ *
+ *		* We already hold our xmin in MyPgXact, so our snapshot will not be
+ *		  harmed even though ProcArrayLock is released.
+ *
+ *		* snapshot_global_csn is always pessmistically rounded up to the next
+ *		  second.
+ *
+ *		* For performance reasons, xmin value for particular second is filled
+ *		  only once. Because of that instead of writing to buffer just our
+ *		  xmin (which is enough for our snapshot), we bump oldestXmin there --
+ *		  it mitigates the possibility of damaging someone else's snapshot by
+ *		  writing to the buffer too advanced value in case of slowness of
+ *		  another backend who generated csn earlier, but didn't manage to
+ *		  insert it before us.
+ *
+ *		* if GlobalSnapshotMapXmin() founds a gap in several seconds between
+ *		  current call and latest completed call then it should fill that gap
+ *		  with latest known values instead of new one. Otherwise it is
+ *		  possible (however highly unlikely) that this gap also happend
+ *		  between taking snapshot and call to GlobalSnapshotMapXmin() for some
+ *		  backend. And we are at risk to fill circullar buffer with
+ *		  oldestXmin's that are bigger then they actually were.
+ */
+void
+GlobalSnapshotMapXmin(GlobalCSN snapshot_global_csn)
+{
+	int offset, gap, i;
+	GlobalCSN csn_seconds;
+	GlobalCSN last_csn_seconds;
+	volatile TransactionId oldest_deferred_xmin;
+	TransactionId current_oldest_xmin, previous_oldest_xmin;
+
+	/* Callers should check config values */
+	Assert(global_snapshot_defer_time > 0);
+	Assert(gsXidMap != NULL);
+
+	/*
+	 * Round up global_csn to the next second -- pessimistically and safely.
+	 */
+	csn_seconds = (snapshot_global_csn / NSECS_PER_SEC + 1);
+
+	/*
+	 * Fast-path check. Avoid taking exclusive GlobalSnapshotXidMapLock lock
+	 * if oldestXid was already written to xmin_by_second[] for this rounded
+	 * global_csn.
+	 */
+	if (pg_atomic_read_u64(&gsXidMap->last_csn_seconds) >= csn_seconds)
+		return;
+
+	/* Ok, we have new entry (or entries) */
+	LWLockAcquire(GlobalSnapshotXidMapLock, LW_EXCLUSIVE);
+
+	/* Re-check last_csn_seconds under lock */
+	last_csn_seconds = pg_atomic_read_u64(&gsXidMap->last_csn_seconds);
+	if (last_csn_seconds >= csn_seconds)
+	{
+		LWLockRelease(GlobalSnapshotXidMapLock);
+		return;
+	}
+	pg_atomic_write_u64(&gsXidMap->last_csn_seconds, csn_seconds);
+
+	/*
+	 * Count oldest_xmin.
+	 *
+	 * It was possible to calculate oldest_xmin during corresponding snapshot
+	 * creation, but GetSnapshotData() intentionally reads only PgXact, but not
+	 * PgProc. And we need info about originalXmin (see comment to gsXidMap)
+	 * which is stored in PgProc because of threats in comments around PgXact
+	 * about extending it with new fields. So just calculate oldest_xmin again,
+	 * that anyway happens quite rarely.
+	 */
+	current_oldest_xmin = GetOldestXmin(NULL, PROCARRAY_NON_IMPORTED_XMIN);
+
+	previous_oldest_xmin = gsXidMap->xmin_by_second[gsXidMap->head];
+
+	Assert(TransactionIdIsNormal(current_oldest_xmin));
+	Assert(TransactionIdIsNormal(previous_oldest_xmin) || !track_global_snapshots);
+
+	gap = csn_seconds - last_csn_seconds;
+	offset = csn_seconds % gsXidMap->size;
+
+	/* Sanity check before we update head and gap */
+	Assert( gap >= 1 );
+	Assert( (gsXidMap->head + gap) % gsXidMap->size == offset );
+
+	gap = gap > gsXidMap->size ? gsXidMap->size : gap;
+	gsXidMap->head = offset;
+
+	/* Fill new entry with current_oldest_xmin */
+	gsXidMap->xmin_by_second[offset] = current_oldest_xmin;
+
+	/*
+	 * If we have gap then fill it with previous_oldest_xmin for reasons
+	 * outlined in comment above this function.
+	 */
+	for (i = 1; i < gap; i++)
+	{
+		offset = (offset + gsXidMap->size - 1) % gsXidMap->size;
+		gsXidMap->xmin_by_second[offset] = previous_oldest_xmin;
+	}
+
+	oldest_deferred_xmin =
+		gsXidMap->xmin_by_second[ (gsXidMap->head + 1) % gsXidMap->size ];
+
+	LWLockRelease(GlobalSnapshotXidMapLock);
+
+	/*
+	 * Advance procArray->global_snapshot_xmin after we released
+	 * GlobalSnapshotXidMapLock. Since we gather not xmin but oldestXmin, it
+	 * never goes backwards regardless of how slow we can do that.
+	 */
+	Assert(TransactionIdFollowsOrEquals(oldest_deferred_xmin,
+										ProcArrayGetGlobalSnapshotXmin()));
+	ProcArraySetGlobalSnapshotXmin(oldest_deferred_xmin);
+}
+
+
+/*
+ * GlobalSnapshotToXmin
+ *
+ * Get oldestXmin that took place when snapshot_global_csn was taken.
+ */
+TransactionId
+GlobalSnapshotToXmin(GlobalCSN snapshot_global_csn)
+{
+	TransactionId xmin;
+	GlobalCSN csn_seconds;
+	volatile GlobalCSN last_csn_seconds;
+
+	/* Callers should check config values */
+	Assert(global_snapshot_defer_time > 0);
+	Assert(gsXidMap != NULL);
+
+	/* Round down to get conservative estimates */
+	csn_seconds = (snapshot_global_csn / NSECS_PER_SEC);
+
+	LWLockAcquire(GlobalSnapshotXidMapLock, LW_SHARED);
+	last_csn_seconds = pg_atomic_read_u64(&gsXidMap->last_csn_seconds);
+	if (csn_seconds > last_csn_seconds)
+	{
+		/* we don't have entry for this global_csn yet, return latest known */
+		xmin = gsXidMap->xmin_by_second[gsXidMap->head];
+	}
+	else if (last_csn_seconds - csn_seconds < gsXidMap->size)
+	{
+		/* we are good, retrieve value from our map */
+		Assert(last_csn_seconds % gsXidMap->size == gsXidMap->head);
+		xmin = gsXidMap->xmin_by_second[csn_seconds % gsXidMap->size];
+	}
+	else
+	{
+		/* requested global_csn is too old, let caller know */
+		xmin = InvalidTransactionId;
+	}
+	LWLockRelease(GlobalSnapshotXidMapLock);
+
+	return xmin;
+}
+
+/*
+ * GlobalSnapshotGenerate
+ *
+ * Generate GlobalCSN which is actually a local time. Also we are forcing
+ * this time to be always increasing. Since now it is not uncommon to have
+ * millions of read transactions per second we are trying to use nanoseconds
+ * if such time resolution is available.
+ */
+GlobalCSN
+GlobalSnapshotGenerate(bool locked)
+{
+	instr_time	current_time;
+	GlobalCSN	global_csn;
+
+	Assert(track_global_snapshots || global_snapshot_defer_time > 0);
+
+	/*
+	 * TODO: create some macro that add small random shift to current time.
+	 */
+	INSTR_TIME_SET_CURRENT(current_time);
+	global_csn = (GlobalCSN) INSTR_TIME_GET_NANOSEC(current_time);
+
+	/* TODO: change to atomics? */
+	if (!locked)
+		SpinLockAcquire(&gsState->lock);
+
+	if (global_csn <= gsState->last_global_csn)
+		global_csn = ++gsState->last_global_csn;
+	else
+		gsState->last_global_csn = global_csn;
+
+	if (!locked)
+		SpinLockRelease(&gsState->lock);
+
+	return global_csn;
+}
+
+/*
+ * GlobalSnapshotSync
+ *
+ * Due to time desynchronization on different nodes we can receive global_csn
+ * which is greater than global_csn on this node. To preserve proper isolation
+ * this node needs to wait when such global_csn comes on local clock.
+ *
+ * This should happend relatively rare if nodes have running NTP/PTP/etc.
+ * Complain if wait time is more than SNAP_SYNC_COMPLAIN.
+ */
+void
+GlobalSnapshotSync(GlobalCSN remote_gcsn)
+{
+	GlobalCSN	local_gcsn;
+	GlobalCSN	delta;
+
+	Assert(track_global_snapshots);
+
+	for(;;)
+	{
+		SpinLockAcquire(&gsState->lock);
+		if (gsState->last_global_csn > remote_gcsn)
+		{
+			/* Everything is fine */
+			SpinLockRelease(&gsState->lock);
+			return;
+		}
+		else if ((local_gcsn = GlobalSnapshotGenerate(true)) >= remote_gcsn)
+		{
+			/*
+			 * Everything is fine too, but last_global_csn wasn't updated for
+			 * some time.
+			 */
+			SpinLockRelease(&gsState->lock);
+			return;
+		}
+		SpinLockRelease(&gsState->lock);
+
+		/* Okay we need to sleep now */
+		delta = remote_gcsn - local_gcsn;
+		if (delta > SNAP_DESYNC_COMPLAIN)
+			ereport(WARNING,
+				(errmsg("remote global snapshot exceeds ours by more than a second"),
+				 errhint("Consider running NTPd on servers participating in global transaction")));
+
+		/* TODO: report this sleeptime somewhere? */
+		pg_usleep((long) (delta/NSECS_PER_USEC));
+
+		/*
+		 * Loop that checks to ensure that we actually slept for specified
+		 * amount of time.
+		 */
+	}
+
+	Assert(false); /* Should not happend */
+	return;
+}
+
+/*
+ * TransactionIdGetGlobalCSN
+ *
+ * Get GlobalCSN for specified TransactionId taking care about special xids,
+ * xids beyond TransactionXmin and InDoubt states.
+ */
+GlobalCSN
+TransactionIdGetGlobalCSN(TransactionId xid)
+{
+	GlobalCSN global_csn;
+
+	Assert(track_global_snapshots);
+
+	/* Handle permanent TransactionId's for which we don't have mapping */
+	if (!TransactionIdIsNormal(xid))
+	{
+		if (xid == InvalidTransactionId)
+			return AbortedGlobalCSN;
+		if (xid == FrozenTransactionId || xid == BootstrapTransactionId)
+			return FrozenGlobalCSN;
+		Assert(false); /* Should not happend */
+	}
+
+	/*
+	 * For xids which less then TransactionXmin GlobalCSNLog can be already
+	 * trimmed but we know that such transaction is definetly not concurrently
+	 * running according to any snapshot including timetravel ones. Callers
+	 * should check TransactionDidCommit after.
+	 */
+	if (TransactionIdPrecedes(xid, TransactionXmin))
+		return FrozenGlobalCSN;
+
+	/* Read GlobalCSN from SLRU */
+	global_csn = GlobalCSNLogGetCSN(xid);
+
+	/*
+	 * If we faced InDoubt state then transaction is beeing committed and we
+	 * should wait until GlobalCSN will be assigned so that visibility check
+	 * could decide whether tuple is in snapshot. See also comments in
+	 * GlobalSnapshotPrecommit().
+	 */
+	if (GlobalCSNIsInDoubt(global_csn))
+	{
+		XactLockTableWait(xid, NULL, NULL, XLTW_None);
+		global_csn = GlobalCSNLogGetCSN(xid);
+		Assert(GlobalCSNIsNormal(global_csn) ||
+				GlobalCSNIsAborted(global_csn));
+	}
+
+	Assert(GlobalCSNIsNormal(global_csn) ||
+			GlobalCSNIsInProgress(global_csn) ||
+			GlobalCSNIsAborted(global_csn));
+
+	return global_csn;
+}
+
+/*
+ * XidInvisibleInGlobalSnapshot
+ *
+ * Version of XidInMVCCSnapshot for global transactions. For non-imported
+ * global snapshots this should give same results as XidInLocalMVCCSnapshot
+ * (except that aborts will be shown as invisible without going to clog) and to
+ * ensure such behaviour XidInMVCCSnapshot is coated with asserts that checks
+ * identicalness of XidInvisibleInGlobalSnapshot/XidInLocalMVCCSnapshot in
+ * case of ordinary snapshot.
+ */
+bool
+XidInvisibleInGlobalSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	GlobalCSN csn;
+
+	Assert(track_global_snapshots);
+
+	csn = TransactionIdGetGlobalCSN(xid);
+
+	if (GlobalCSNIsNormal(csn))
+	{
+		if (csn < snapshot->global_csn)
+			return false;
+		else
+			return true;
+	}
+	else if (GlobalCSNIsFrozen(csn))
+	{
+		/* It is bootstrap or frozen transaction */
+		return false;
+	}
+	else
+	{
+		/* It is aborted or in-progress */
+		Assert(GlobalCSNIsAborted(csn) || GlobalCSNIsInProgress(csn));
+		if (GlobalCSNIsAborted(csn))
+			Assert(TransactionIdDidAbort(xid));
+		return true;
+	}
+}
+
+
+/*****************************************************************************
+ * Functions to handle distributed commit on transaction coordinator:
+ * GlobalSnapshotPrepareCurrent() / GlobalSnapshotAssignCsnCurrent().
+ * Correspoding functions for remote nodes are defined in twophase.c:
+ * pg_global_snapshot_prepare/pg_global_snapshot_assign.
+ *****************************************************************************/
+
+
+/*
+ * GlobalSnapshotPrepareCurrent
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+GlobalCSN
+GlobalSnapshotPrepareCurrent()
+{
+	TransactionId xid = GetCurrentTransactionIdIfAny();
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	if (TransactionIdIsValid(xid))
+	{
+		TransactionId *subxids;
+		int nsubxids = xactGetCommittedChildren(&subxids);
+		GlobalCSNLogSetCSN(xid, nsubxids,
+									subxids, InDoubtGlobalCSN);
+	}
+
+	/* Nothing to write if we don't have xid */
+
+	return GlobalSnapshotGenerate(false);
+}
+
+/*
+ * GlobalSnapshotAssignCsnCurrent
+ *
+ * Asign GlobalCSN for currently active transaction. GlobalCSN is supposedly
+ * maximal among of values returned by GlobalSnapshotPrepareCurrent and
+ * pg_global_snapshot_prepare.
+ */
+void
+GlobalSnapshotAssignCsnCurrent(GlobalCSN global_csn)
+{
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	if (!GlobalCSNIsNormal(global_csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_global_snapshot_assign expects normal global_csn")));
+
+	/* Skip emtpty transactions */
+	if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+		return;
+
+	/* Set global_csn and defuse ProcArrayEndTransaction from assigning one */
+	pg_atomic_write_u64(&MyProc->assignedGlobalCsn, global_csn);
+}
+
+
+/*****************************************************************************
+ * Functions to handle global and local transactions commit.
+ *
+ * For local transactions GlobalSnapshotPrecommit sets InDoubt state before
+ * ProcArrayEndTransaction is called and transaction data potetntially becomes
+ * visible to other backends. ProcArrayEndTransaction (or ProcArrayRemove in
+ * twophase case) then acquires global_csn under ProcArray lock and stores it
+ * in proc->assignedGlobalCsn. It's important that global_csn for commit is
+ * generated under ProcArray lock, otherwise global and local snapshots won't
+ * be equivalent. Consequent call to GlobalSnapshotCommit will write
+ * proc->assignedGlobalCsn to GlobalCSNLog.
+ *
+ * Same rules applies to global transaction, except that global_csn is already
+ * assigned by GlobalSnapshotAssignCsnCurrent/pg_global_snapshot_assign and
+ * GlobalSnapshotPrecommit is basically no-op.
+ *
+ * GlobalSnapshotAbort is slightly different comparing to commit because abort
+ * can skip InDoubt phase and can be called for transaction subtree.
+ *****************************************************************************/
+
+
+/*
+ * GlobalSnapshotAbort
+ *
+ * Abort transaction in GlobalCsnLog. We can skip InDoubt state for aborts
+ * since no concurrent transactions allowed to see aborted data anyway.
+ */
+void
+GlobalSnapshotAbort(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	if (!track_global_snapshots)
+		return;
+
+	GlobalCSNLogSetCSN(xid, nsubxids, subxids, AbortedGlobalCSN);
+
+	/*
+	 * Clean assignedGlobalCsn anyway, as it was possibly set in
+	 * GlobalSnapshotAssignCsnCurrent.
+	 */
+	pg_atomic_write_u64(&proc->assignedGlobalCsn, InProgressGlobalCSN);
+}
+
+/*
+ * GlobalSnapshotPrecommit
+ *
+ * Set InDoubt status for local transaction that we are going to commit.
+ * This step is needed to achieve consistency between local snapshots and
+ * global csn-based snapshots. We don't hold ProcArray lock while writing
+ * csn for transaction in SLRU but instead we set InDoubt status before
+ * transaction is deleted from ProcArray so the readers who will read csn
+ * in the gap between ProcArray removal and GlobalCSN assignment can wait
+ * until GlobalCSN is finally assigned. See also TransactionIdGetGlobalCSN().
+ *
+ * For global transaction this does nothing as InDoubt state was written
+ * earlier.
+ *
+ * This should be called only from parallel group leader before backend is
+ * deleted from ProcArray.
+ */
+void
+GlobalSnapshotPrecommit(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	GlobalCSN oldAssignedGlobalCsn = InProgressGlobalCSN;
+	bool in_progress;
+
+	if (!track_global_snapshots)
+		return;
+
+	/* Set InDoubt status if it is local transaction */
+	in_progress = pg_atomic_compare_exchange_u64(&proc->assignedGlobalCsn,
+												 &oldAssignedGlobalCsn,
+												 InDoubtGlobalCSN);
+	if (in_progress)
+	{
+		Assert(GlobalCSNIsInProgress(oldAssignedGlobalCsn));
+		GlobalCSNLogSetCSN(xid, nsubxids,
+						   subxids, InDoubtGlobalCSN);
+	}
+	else
+	{
+		/* Otherwise we should have valid GlobalCSN by this time */
+		Assert(GlobalCSNIsNormal(oldAssignedGlobalCsn));
+		/* Also global transaction should already be in InDoubt state */
+		Assert(GlobalCSNIsInDoubt(GlobalCSNLogGetCSN(xid)));
+	}
+}
+
+/*
+ * GlobalSnapshotCommit
+ *
+ * Write GlobalCSN that were acquired earlier to GlobalCsnLog. Should be
+ * preceded by GlobalSnapshotPrecommit() so readers can wait until we finally
+ * finished writing to SLRU.
+ *
+ * Should be called after ProcArrayEndTransaction, but before releasing
+ * transaction locks, so that TransactionIdGetGlobalCSN can wait on this
+ * lock for GlobalCSN.
+ */
+void
+GlobalSnapshotCommit(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	volatile GlobalCSN assigned_global_csn;
+
+	if (!track_global_snapshots)
+		return;
+
+	if (!TransactionIdIsValid(xid))
+	{
+		assigned_global_csn = pg_atomic_read_u64(&proc->assignedGlobalCsn);
+		Assert(GlobalCSNIsInProgress(assigned_global_csn));
+		return;
+	}
+
+	/* Finally write resulting GlobalCSN in SLRU */
+	assigned_global_csn = pg_atomic_read_u64(&proc->assignedGlobalCsn);
+	Assert(GlobalCSNIsNormal(assigned_global_csn));
+	GlobalCSNLogSetCSN(xid, nsubxids,
+						   subxids, assigned_global_csn);
+
+	/* Reset for next transaction */
+	pg_atomic_write_u64(&proc->assignedGlobalCsn, InProgressGlobalCSN);
+}
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 9a69fc1e09..c89d1005c6 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/global_snapshot.h"
 #include "access/global_csn_log.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
@@ -1480,8 +1481,34 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 									   hdr->nabortrels, abortrels,
 									   gid);
 
+	/*
+	 * GlobalSnapshot callbacks that should be called right before we are
+	 * going to become visible. Details in comments to this functions.
+	 */
+	if (isCommit)
+		GlobalSnapshotPrecommit(proc, xid, hdr->nsubxacts, children);
+	else
+		GlobalSnapshotAbort(proc, xid, hdr->nsubxacts, children);
+
+
 	ProcArrayRemove(proc, latestXid);
 
+	/*
+	 * Stamp our transaction with GlobalCSN in GlobalCsnLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks, since TransactionIdGetGlobalCSN relies on
+	 * XactLockTableWait to await global_csn.
+	 */
+	if (isCommit)
+	{
+		GlobalSnapshotCommit(proc, xid, hdr->nsubxacts, children);
+	}
+	else
+	{
+		Assert(GlobalCSNIsInProgress(
+				   pg_atomic_read_u64(&proc->assignedGlobalCsn)));
+	}
+
 	/*
 	 * In case we fail while running the callbacks, mark the gxact invalid so
 	 * no one else will try to commit/rollback, and so it will be recycled if
@@ -2442,3 +2469,132 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
 		RemoveTwoPhaseFile(xid, giveWarning);
 	RemoveGXact(gxact);
 }
+
+/*
+ * GlobalSnapshotPrepareTwophase
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ *
+ * This function is a counterpart of GlobalSnapshotPrepareCurrent() for
+ * twophase transactions.
+ */
+static GlobalCSN
+GlobalSnapshotPrepareTwophase(const char *gid)
+{
+	GlobalTransaction gxact;
+	PGXACT	   *pgxact;
+	char	   *buf;
+	TransactionId xid;
+	xl_xact_parsed_prepare parsed;
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
+	xid = pgxact->xid;
+
+	if (gxact->ondisk)
+		buf = ReadTwoPhaseFile(xid, true);
+	else
+		XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
+
+	ParsePrepareRecord(0, (xl_xact_prepare *) buf, &parsed);
+
+	GlobalCSNLogSetCSN(xid, parsed.nsubxacts,
+					parsed.subxacts, InDoubtGlobalCSN);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+
+	pfree(buf);
+
+	return GlobalSnapshotGenerate(false);
+}
+
+/*
+ * SQL interface to GlobalSnapshotPrepareTwophase()
+ *
+ * TODO: Rewrite this as PREPARE TRANSACTION 'gid' RETURNING SNAPSHOT
+ */
+Datum
+pg_global_snapshot_prepare(PG_FUNCTION_ARGS)
+{
+	const char *gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	GlobalCSN	global_csn;
+
+	global_csn = GlobalSnapshotPrepareTwophase(gid);
+
+	PG_RETURN_INT64(global_csn);
+}
+
+
+/*
+ * TwoPhaseAssignGlobalCsn
+ *
+ * Asign GlobalCSN for currently active transaction. GlobalCSN is supposedly
+ * maximal among of values returned by GlobalSnapshotPrepareCurrent and
+ * pg_global_snapshot_prepare.
+ *
+ * This function is a counterpart of GlobalSnapshotAssignCsnCurrent() for
+ * twophase transactions.
+ */
+static void
+GlobalSnapshotAssignCsnTwoPhase(const char *gid, GlobalCSN global_csn)
+{
+	GlobalTransaction gxact;
+	PGPROC	   *proc;
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"track_global_snapshots")));
+
+	if (!GlobalCSNIsNormal(global_csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_global_snapshot_assign expects normal global_csn")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+	/* Set global_csn and defuse ProcArrayRemove from assigning one. */
+	pg_atomic_write_u64(&proc->assignedGlobalCsn, global_csn);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+}
+
+/*
+ * SQL interface to GlobalSnapshotAssignCsnTwoPhase()
+ *
+ * TODO: Rewrite this as COMMIT PREPARED 'gid' SNAPSHOT 'global_csn'
+ */
+Datum
+pg_global_snapshot_assign(PG_FUNCTION_ARGS)
+{
+	const char *gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	GlobalCSN	global_csn = PG_GETARG_INT64(1);
+
+	GlobalSnapshotAssignCsnTwoPhase(gid, global_csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index cd30b62d36..042239ec0e 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/global_snapshot.h"
 #include "access/multixact.h"
 #include "access/parallel.h"
 #include "access/subtrans.h"
@@ -1433,6 +1434,14 @@ RecordTransactionCommit(void)
 
 	/* Reset XactLastRecEnd until the next transaction writes something */
 	XactLastRecEnd = 0;
+
+	/*
+	 * Mark our transaction as InDoubt in GlobalCsnLog and get ready for
+	 * commit.
+	 */
+	if (markXidCommitted)
+		GlobalSnapshotPrecommit(MyProc, xid, nchildren, children);
+
 cleanup:
 	/* Clean up local data */
 	if (rels)
@@ -1694,6 +1703,11 @@ RecordTransactionAbort(bool isSubXact)
 	 */
 	TransactionIdAbortTree(xid, nchildren, children);
 
+	/*
+	 * Mark our transaction as Aborted in GlobalCsnLog.
+	 */
+	GlobalSnapshotAbort(MyProc, xid, nchildren, children);
+
 	END_CRIT_SECTION();
 
 	/* Compute latestXid while we have the child XIDs handy */
@@ -2183,6 +2197,21 @@ CommitTransaction(void)
 	 */
 	ProcArrayEndTransaction(MyProc, latestXid);
 
+	/*
+	 * Stamp our transaction with GlobalCSN in GlobalCsnLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks.
+	 */
+	if (!is_parallel_worker)
+	{
+		TransactionId  xid = GetTopTransactionIdIfAny();
+		TransactionId *subxids;
+		int			   nsubxids;
+
+		nsubxids = xactGetCommittedChildren(&subxids);
+		GlobalSnapshotCommit(MyProc, xid, nsubxids, subxids);
+	}
+
 	/*
 	 * This is all post-commit cleanup.  Note that if an error is raised here,
 	 * it's too late to abort the transaction.  This should be just
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 4ffe4aad03..aa91526468 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7063,6 +7063,7 @@ StartupXLOG(void)
 			StartupCLOG();
 			StartupGlobalCSNLog(oldestActiveXID);
 			StartupSUBTRANS(oldestActiveXID);
+			GlobalSnapshotStartup(oldestActiveXID);
 
 			/*
 			 * If we're beginning at a shutdown checkpoint, we know that
@@ -7881,6 +7882,7 @@ StartupXLOG(void)
 		StartupCLOG();
 		StartupGlobalCSNLog(oldestActiveXID);
 		StartupSUBTRANS(oldestActiveXID);
+		GlobalSnapshotStartup(oldestActiveXID);
 	}
 
 	/*
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index dc2d2959c4..d1819dc2c8 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -22,6 +22,7 @@
 #include "access/nbtree.h"
 #include "access/subtrans.h"
 #include "access/twophase.h"
+#include "access/global_snapshot.h"
 #include "commands/async.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -145,6 +146,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, WalSndShmemSize());
 		size = add_size(size, WalRcvShmemSize());
 		size = add_size(size, ApplyLauncherShmemSize());
+		size = add_size(size, GlobalSnapshotShmemSize());
 		size = add_size(size, SnapMgrShmemSize());
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
@@ -266,6 +268,7 @@ CreateSharedMemoryAndSemaphores(void)
 	BTreeShmemInit();
 	SyncScanShmemInit();
 	AsyncShmemInit();
+	GlobalSnapshotShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 486da77f68..90c0e90b46 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -47,6 +47,7 @@
 
 #include "access/clog.h"
 #include "access/global_csn_log.h"
+#include "access/global_snapshot.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -95,6 +96,8 @@ typedef struct ProcArrayStruct
 	TransactionId replication_slot_xmin;
 	/* oldest catalog xmin of any replication slot */
 	TransactionId replication_slot_catalog_xmin;
+	/* xmin of oldest active global snapshot */
+	TransactionId global_snapshot_xmin;
 
 	/* indexes into allPgXact[], has PROCARRAY_MAXPROCS entries */
 	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
@@ -250,6 +253,7 @@ CreateSharedProcArray(void)
 		procArray->lastOverflowedXid = InvalidTransactionId;
 		procArray->replication_slot_xmin = InvalidTransactionId;
 		procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+		procArray->global_snapshot_xmin = InvalidTransactionId;
 	}
 
 	allProcs = ProcGlobal->allProcs;
@@ -353,6 +357,17 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 		if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
 								  latestXid))
 			ShmemVariableCache->latestCompletedXid = latestXid;
+
+		/*
+		 * Assign global csn while holding ProcArrayLock for non-global
+		 * COMMIT PREPARED. After lock is released consequent
+		 * GlobalSnapshotCommit() will write this value to GlobalCsnLog.
+		 *
+		 * In case of global commit proc->assignedGlobalCsn is already set
+		 * by prior AssignGlobalCsn().
+		 */
+		if (GlobalCSNIsInDoubt(pg_atomic_read_u64(&proc->assignedGlobalCsn)))
+			pg_atomic_write_u64(&proc->assignedGlobalCsn, GlobalSnapshotGenerate(false));
 	}
 	else
 	{
@@ -433,6 +448,8 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 
 		proc->lxid = InvalidLocalTransactionId;
 		pgxact->xmin = InvalidTransactionId;
+		proc->originalXmin = InvalidTransactionId;
+
 		/* must be cleared with xid/xmin: */
 		pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
 		proc->delayChkpt = false;	/* be sure this is cleared in abort */
@@ -455,6 +472,8 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
 	pgxact->xid = InvalidTransactionId;
 	proc->lxid = InvalidLocalTransactionId;
 	pgxact->xmin = InvalidTransactionId;
+	proc->originalXmin = InvalidTransactionId;
+
 	/* must be cleared with xid/xmin: */
 	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
 	proc->delayChkpt = false;	/* be sure this is cleared in abort */
@@ -468,6 +487,20 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
 	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
 							  latestXid))
 		ShmemVariableCache->latestCompletedXid = latestXid;
+
+	/*
+	 * Assign global csn while holding ProcArrayLock for non-global
+	 * COMMIT. After lock is released consequent GlobalSnapshotFinish() will
+	 * write this value to GlobalCsnLog.
+	 *
+	 * In case of global commit MyProc->assignedGlobalCsn is already set
+	 * by prior AssignGlobalCsn().
+	 *
+	 * TODO: in case of group commit we can generate one GlobalSnapshot for
+	 * whole group to save time on timestamp aquisition.
+	 */
+	if (GlobalCSNIsInDoubt(pg_atomic_read_u64(&proc->assignedGlobalCsn)))
+		pg_atomic_write_u64(&proc->assignedGlobalCsn, GlobalSnapshotGenerate(false));
 }
 
 /*
@@ -611,6 +644,7 @@ ProcArrayClearTransaction(PGPROC *proc)
 	pgxact->xid = InvalidTransactionId;
 	proc->lxid = InvalidLocalTransactionId;
 	pgxact->xmin = InvalidTransactionId;
+	proc->originalXmin = InvalidTransactionId;
 	proc->recoveryConflictPending = false;
 
 	/* redundant, but just in case */
@@ -1313,6 +1347,7 @@ GetOldestXmin(Relation rel, int flags)
 
 	TransactionId replication_slot_xmin = InvalidTransactionId;
 	TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+	TransactionId global_snapshot_xmin = InvalidTransactionId;
 
 	/*
 	 * If we're not computing a relation specific limit, or if a shared
@@ -1349,8 +1384,9 @@ GetOldestXmin(Relation rel, int flags)
 			proc->databaseId == MyDatabaseId ||
 			proc->databaseId == 0)	/* always include WalSender */
 		{
-			/* Fetch xid just once - see GetNewTransactionId */
+			/* Fetch both xids just once - see GetNewTransactionId */
 			TransactionId xid = UINT32_ACCESS_ONCE(pgxact->xid);
+			TransactionId original_xmin = UINT32_ACCESS_ONCE(proc->originalXmin);
 
 			/* First consider the transaction's own Xid, if any */
 			if (TransactionIdIsNormal(xid) &&
@@ -1363,8 +1399,17 @@ GetOldestXmin(Relation rel, int flags)
 			 * We must check both Xid and Xmin because a transaction might
 			 * have an Xmin but not (yet) an Xid; conversely, if it has an
 			 * Xid, that could determine some not-yet-set Xmin.
+			 *
+			 * In case of oldestXmin calculation for GlobalSnapshotMapXmin()
+			 * pgxact->xmin should be changed to proc->originalXmin. Details
+			 * in commets to GlobalSnapshotMapXmin.
 			 */
-			xid = UINT32_ACCESS_ONCE(pgxact->xmin);
+			if ((flags & PROCARRAY_NON_IMPORTED_XMIN) &&
+					TransactionIdIsValid(original_xmin))
+				xid = original_xmin;
+			else
+				xid = UINT32_ACCESS_ONCE(pgxact->xmin);
+
 			if (TransactionIdIsNormal(xid) &&
 				TransactionIdPrecedes(xid, result))
 				result = xid;
@@ -1378,6 +1423,7 @@ GetOldestXmin(Relation rel, int flags)
 	 */
 	replication_slot_xmin = procArray->replication_slot_xmin;
 	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	global_snapshot_xmin = ProcArrayGetGlobalSnapshotXmin();
 
 	if (RecoveryInProgress())
 	{
@@ -1419,6 +1465,11 @@ GetOldestXmin(Relation rel, int flags)
 			result = FirstNormalTransactionId;
 	}
 
+	if (!(flags & PROCARRAY_NON_IMPORTED_XMIN) &&
+		TransactionIdIsValid(global_snapshot_xmin) &&
+		NormalTransactionIdPrecedes(global_snapshot_xmin, result))
+		result = global_snapshot_xmin;
+
 	/*
 	 * Check whether there are replication slots requiring an older xmin.
 	 */
@@ -1513,8 +1564,10 @@ GetSnapshotData(Snapshot snapshot)
 	int			count = 0;
 	int			subcount = 0;
 	bool		suboverflowed = false;
+	GlobalCSN	global_csn = FrozenGlobalCSN;
 	TransactionId replication_slot_xmin = InvalidTransactionId;
 	TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+	TransactionId global_snapshot_xmin = InvalidTransactionId;
 
 	Assert(snapshot != NULL);
 
@@ -1706,10 +1759,18 @@ GetSnapshotData(Snapshot snapshot)
 	 */
 	replication_slot_xmin = procArray->replication_slot_xmin;
 	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	global_snapshot_xmin = ProcArrayGetGlobalSnapshotXmin();
 
 	if (!TransactionIdIsValid(MyPgXact->xmin))
 		MyPgXact->xmin = TransactionXmin = xmin;
 
+	/*
+	 * Take GlobalCSN under ProcArrayLock so the local/global snapshot stays
+	 * synchronized.
+	 */
+	if (track_global_snapshots)
+		global_csn = GlobalSnapshotGenerate(false);
+
 	LWLockRelease(ProcArrayLock);
 
 	/*
@@ -1725,6 +1786,10 @@ GetSnapshotData(Snapshot snapshot)
 	if (!TransactionIdIsNormal(RecentGlobalXmin))
 		RecentGlobalXmin = FirstNormalTransactionId;
 
+	if (/*track_global_snapshots && */TransactionIdIsValid(global_snapshot_xmin) &&
+		TransactionIdPrecedes(global_snapshot_xmin, RecentGlobalXmin))
+		RecentGlobalXmin = global_snapshot_xmin;
+
 	/* Check whether there's a replication slot requiring an older xmin. */
 	if (TransactionIdIsValid(replication_slot_xmin) &&
 		NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin))
@@ -1780,6 +1845,11 @@ GetSnapshotData(Snapshot snapshot)
 		MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin);
 	}
 
+	snapshot->imported_global_csn = false;
+	snapshot->global_csn = global_csn;
+	if (global_snapshot_defer_time > 0 && IsUnderPostmaster)
+		GlobalSnapshotMapXmin(snapshot->global_csn);
+
 	return snapshot;
 }
 
@@ -3127,6 +3197,24 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 	LWLockRelease(ProcArrayLock);
 }
 
+/*
+ * ProcArraySetGlobalSnapshotXmin
+ */
+void
+ProcArraySetGlobalSnapshotXmin(TransactionId xmin)
+{
+	/* We rely on atomic fetch/store of xid */
+	procArray->global_snapshot_xmin = xmin;
+}
+
+/*
+ * ProcArrayGetGlobalSnapshotXmin
+ */
+TransactionId
+ProcArrayGetGlobalSnapshotXmin(void)
+{
+	return procArray->global_snapshot_xmin;
+}
 
 #define XidCacheRemove(i) \
 	do { \
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index aa904b1f17..45d5b8e6ed 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -51,3 +51,4 @@ OldSnapshotTimeMapLock				42
 LogicalRepWorkerLock				43
 XactTruncationLock					44
 GlobalCSNLogControlLock				45
+GlobalSnapshotXidMapLock			46
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index f5eef6fa4e..cad38c18a6 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -37,6 +37,7 @@
 
 #include "access/transam.h"
 #include "access/twophase.h"
+#include "access/global_snapshot.h"
 #include "access/xact.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -441,6 +442,9 @@ InitProcess(void)
 	MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
 	Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
 
+	MyProc->originalXmin = InvalidTransactionId;
+	pg_atomic_init_u64(&MyProc->assignedGlobalCsn, InProgressGlobalCSN);
+
 	/*
 	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
 	 * on it.  That allows us to repoint the process latch, which so far
@@ -584,6 +588,7 @@ InitAuxiliaryProcess(void)
 	MyProc->lwWaitMode = 0;
 	MyProc->waitLock = NULL;
 	MyProc->waitProcLock = NULL;
+	MyProc->originalXmin = InvalidTransactionId;
 #ifdef USE_ASSERT_CHECKING
 	{
 		int			i;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 4910e4fc66..79d7123f9f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -28,6 +28,7 @@
 
 #include "access/commit_ts.h"
 #include "access/gin.h"
+#include "access/global_snapshot.h"
 #include "access/rmgr.h"
 #include "access/tableam.h"
 #include "access/transam.h"
@@ -1178,7 +1179,7 @@ static struct config_bool ConfigureNamesBool[] =
 			gettext_noop("Used to achieve REPEATEBLE READ isolation level for postgres_fdw transactions.")
 		},
 		&track_global_snapshots,
-		true, /* XXX: set true to simplify tesing. XXX2: Seems that RESOURCES_MEM isn't the best catagory */
+		false, /* XXX: Seems that RESOURCES_MEM isn't the best catagory */
 		NULL, NULL, NULL
 	},
 	{
@@ -2467,6 +2468,16 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"global_snapshot_defer_time", PGC_POSTMASTER, REPLICATION_MASTER,
+			gettext_noop("Minimal age of records which allowed to be vacuumed, in seconds."),
+			NULL
+		},
+		&global_snapshot_defer_time,
+		5, 0, INT_MAX,
+		NULL, NULL, NULL
+	},
+
 	/*
 	 * See also CheckRequiredParameterValues() if this parameter changes
 	 */
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ac02bd0c00..cbd6de119a 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -306,6 +306,8 @@
 				# and comma-separated list of application_name
 				# from standby(s); '*' = all
 #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+#global_snapshot_defer_time = 0	# minimal age of records which allowed to be
+				# vacuumed, in seconds
 
 # - Standby Servers -
 
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 1c063c592c..3d925a7866 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -48,6 +48,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/global_snapshot.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -247,6 +248,8 @@ typedef struct SerializedSnapshotData
 	CommandId	curcid;
 	TimestampTz whenTaken;
 	XLogRecPtr	lsn;
+	GlobalCSN	global_csn;
+	bool		imported_global_csn;
 } SerializedSnapshotData;
 
 Size
@@ -1024,7 +1027,9 @@ SnapshotResetXmin(void)
 										pairingheap_first(&RegisteredSnapshots));
 
 	if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin))
+	{
 		MyPgXact->xmin = minSnapshot->xmin;
+	}
 }
 
 /*
@@ -2115,6 +2120,8 @@ SerializeSnapshot(Snapshot snapshot, char *start_address)
 	serialized_snapshot.curcid = snapshot->curcid;
 	serialized_snapshot.whenTaken = snapshot->whenTaken;
 	serialized_snapshot.lsn = snapshot->lsn;
+	serialized_snapshot.global_csn = snapshot->global_csn;
+	serialized_snapshot.imported_global_csn = snapshot->imported_global_csn;
 
 	/*
 	 * Ignore the SubXID array if it has overflowed, unless the snapshot was
@@ -2189,6 +2196,8 @@ RestoreSnapshot(char *start_address)
 	snapshot->curcid = serialized_snapshot.curcid;
 	snapshot->whenTaken = serialized_snapshot.whenTaken;
 	snapshot->lsn = serialized_snapshot.lsn;
+	snapshot->global_csn = serialized_snapshot.global_csn;
+	snapshot->imported_global_csn = serialized_snapshot.imported_global_csn;
 
 	/* Copy XIDs, if present. */
 	if (serialized_snapshot.xcnt > 0)
@@ -2228,8 +2237,8 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc)
 }
 
 /*
- * XidInMVCCSnapshot
- *		Is the given XID still-in-progress according to the snapshot?
+ * XidInLocalMVCCSnapshot
+ *		Is the given XID still-in-progress according to the local snapshot?
  *
  * Note: GetSnapshotData never stores either top xid or subxids of our own
  * backend into a snapshot, so these xids will not be reported as "running"
@@ -2237,8 +2246,8 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc)
  * TransactionIdIsCurrentTransactionId first, except when it's known the
  * XID could not be ours anyway.
  */
-bool
-XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+static bool
+XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
 	uint32		i;
 
@@ -2348,3 +2357,153 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 
 	return false;
 }
+
+/*
+ * XidInMVCCSnapshot
+ *
+ * Check whether this xid is in snapshot, taking into account fact that
+ * snapshot can be global. When track_global_snapshots is switched off
+ * just call XidInLocalMVCCSnapshot().
+ */
+bool
+XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	bool in_snapshot;
+
+	if (snapshot->imported_global_csn)
+	{
+		Assert(track_global_snapshots);
+		/* No point to using snapshot info except CSN */
+		return XidInvisibleInGlobalSnapshot(xid, snapshot);
+	}
+
+	in_snapshot = XidInLocalMVCCSnapshot(xid, snapshot);
+
+	if (!track_global_snapshots)
+	{
+		Assert(GlobalCSNIsFrozen(snapshot->global_csn));
+		return in_snapshot;
+	}
+
+	if (in_snapshot)
+	{
+		/*
+		 * This xid may be already in unknown state and in that case
+		 * we must wait and recheck.
+		 *
+		 * TODO: this check can be skipped if we know for sure that there were
+		 * no global transactions when this snapshot was taken. That requires
+		 * some changes to mechanisms of global snapshots exprot/import (if
+		 * backend set xmin then we should have a-priori knowledge that this
+		 * transaction going to be global or local -- right now this is not
+		 * enforced). Leave that for future and don't complicate this patch.
+		 */
+		return XidInvisibleInGlobalSnapshot(xid, snapshot);
+	}
+	else
+	{
+#ifdef USE_ASSERT_CHECKING
+		/* Check that global snapshot gives the same results as local one */
+		if (XidInvisibleInGlobalSnapshot(xid, snapshot))
+		{
+			GlobalCSN gcsn = TransactionIdGetGlobalCSN(xid);
+			Assert(GlobalCSNIsAborted(gcsn));
+		}
+#endif
+		return false;
+	}
+}
+
+/*
+ * ExportGlobalSnapshot
+ *
+ * Export global_csn so that caller can expand this transaction to other
+ * nodes.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+GlobalCSN
+ExportGlobalSnapshot()
+{
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not export global snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "track_global_snapshots")));
+
+	return CurrentSnapshot->global_csn;
+}
+
+/* SQL accessor to ExportGlobalSnapshot() */
+Datum
+pg_global_snapshot_export(PG_FUNCTION_ARGS)
+{
+	GlobalCSN	global_csn = ExportGlobalSnapshot();
+	PG_RETURN_UINT64(global_csn);
+}
+
+/*
+ * ImportGlobalSnapshot
+ *
+ * Import global_csn and retract this backends xmin to the value that was
+ * actual when we had such global_csn.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+void
+ImportGlobalSnapshot(GlobalCSN snap_global_csn)
+{
+	volatile TransactionId xmin;
+
+	if (!track_global_snapshots)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import global snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "track_global_snapshots")));
+
+	if (global_snapshot_defer_time <= 0)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import global snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is positive.",
+					 "global_snapshot_defer_time")));
+
+	/*
+	 * Call GlobalSnapshotToXmin under ProcArrayLock to avoid situation that
+	 * resulting xmin will be evicted from map before we will set it into our
+	 * backend's xmin.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	xmin = GlobalSnapshotToXmin(snap_global_csn);
+	if (!TransactionIdIsValid(xmin))
+	{
+		LWLockRelease(ProcArrayLock);
+		elog(ERROR, "GlobalSnapshotToXmin: global snapshot too old");
+	}
+	MyProc->originalXmin = MyPgXact->xmin;
+	MyPgXact->xmin = TransactionXmin = xmin;
+	LWLockRelease(ProcArrayLock);
+
+	CurrentSnapshot->xmin = xmin; /* defuse SnapshotResetXmin() */
+	CurrentSnapshot->global_csn = snap_global_csn;
+	CurrentSnapshot->imported_global_csn = true;
+	GlobalSnapshotSync(snap_global_csn);
+
+	Assert(TransactionIdPrecedesOrEquals(RecentGlobalXmin, xmin));
+	Assert(TransactionIdPrecedesOrEquals(RecentGlobalDataXmin, xmin));
+}
+
+/* SQL accessor to ImportGlobalSnapshot() */
+Datum
+pg_global_snapshot_import(PG_FUNCTION_ARGS)
+{
+	GlobalCSN	global_csn = PG_GETARG_UINT64(0);
+	ImportGlobalSnapshot(global_csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/include/access/global_snapshot.h b/src/include/access/global_snapshot.h
new file mode 100644
index 0000000000..246b180cfd
--- /dev/null
+++ b/src/include/access/global_snapshot.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * global_snapshot.h
+ *	  Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/global_snapshot.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GLOBAL_SNAPSHOT_H
+#define GLOBAL_SNAPSHOT_H
+
+#include "port/atomics.h"
+#include "storage/lock.h"
+#include "utils/snapshot.h"
+#include "utils/guc.h"
+
+/*
+ * snapshot.h is used in frontend code so atomic variant of GlobalCSN type
+ * is defined here.
+ */
+typedef pg_atomic_uint64 GlobalCSN_atomic;
+
+#define InProgressGlobalCSN	 UINT64CONST(0x0)
+#define AbortedGlobalCSN	 UINT64CONST(0x1)
+#define FrozenGlobalCSN		 UINT64CONST(0x2)
+#define InDoubtGlobalCSN	 UINT64CONST(0x3)
+#define FirstNormalGlobalCSN UINT64CONST(0x4)
+
+#define GlobalCSNIsInProgress(csn)	((csn) == InProgressGlobalCSN)
+#define GlobalCSNIsAborted(csn)		((csn) == AbortedGlobalCSN)
+#define GlobalCSNIsFrozen(csn)		((csn) == FrozenGlobalCSN)
+#define GlobalCSNIsInDoubt(csn)		((csn) == InDoubtGlobalCSN)
+#define GlobalCSNIsNormal(csn)		((csn) >= FirstNormalGlobalCSN)
+
+
+extern int global_snapshot_defer_time;
+
+
+extern Size GlobalSnapshotShmemSize(void);
+extern void GlobalSnapshotShmemInit(void);
+extern void GlobalSnapshotStartup(TransactionId oldestActiveXID);
+
+extern void GlobalSnapshotMapXmin(GlobalCSN snapshot_global_csn);
+extern TransactionId GlobalSnapshotToXmin(GlobalCSN snapshot_global_csn);
+
+extern GlobalCSN GlobalSnapshotGenerate(bool locked);
+
+extern bool XidInvisibleInGlobalSnapshot(TransactionId xid, Snapshot snapshot);
+
+extern void GlobalSnapshotSync(GlobalCSN remote_gcsn);
+
+extern GlobalCSN TransactionIdGetGlobalCSN(TransactionId xid);
+
+extern GlobalCSN GlobalSnapshotPrepareGlobal(const char *gid);
+extern void GlobalSnapshotAssignCsnGlobal(const char *gid,
+										  GlobalCSN global_csn);
+
+extern GlobalCSN GlobalSnapshotPrepareCurrent(void);
+extern void GlobalSnapshotAssignCsnCurrent(GlobalCSN global_csn);
+
+extern void GlobalSnapshotAbort(PGPROC *proc, TransactionId xid, int nsubxids,
+								TransactionId *subxids);
+extern void GlobalSnapshotPrecommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void GlobalSnapshotCommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+
+#endif							/* GLOBAL_SNAPSHOT_H */
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index 2ca71c3445..b4899f3754 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -18,6 +18,7 @@
 #include "access/xlogdefs.h"
 #include "datatype/timestamp.h"
 #include "storage/lock.h"
+#include "utils/snapshot.h"
 
 /*
  * GlobalTransactionData is defined in twophase.c; other places have no
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 61f2c2f5b4..c76da68a0a 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10936,4 +10936,17 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+# global transaction handling
+{ oid => '4388', descr => 'export global transaction snapshot',
+  proname => 'pg_global_snapshot_export', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_global_snapshot_export' },
+{ oid => '4389', descr => 'import global transaction snapshot',
+  proname => 'pg_global_snapshot_import', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'int8', prosrc => 'pg_global_snapshot_import' },
+{ oid => '4390', descr => 'prepare distributed transaction for commit, get global_csn',
+  proname => 'pg_global_snapshot_prepare', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => 'text', prosrc => 'pg_global_snapshot_prepare' },
+{ oid => '4391', descr => 'assign global_csn to distributed transaction',
+  proname => 'pg_global_snapshot_assign', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'text int8', prosrc => 'pg_global_snapshot_assign' },
 ]
diff --git a/src/include/datatype/timestamp.h b/src/include/datatype/timestamp.h
index 6be6d35d1e..583b1beea5 100644
--- a/src/include/datatype/timestamp.h
+++ b/src/include/datatype/timestamp.h
@@ -93,6 +93,9 @@ typedef struct
 #define USECS_PER_MINUTE INT64CONST(60000000)
 #define USECS_PER_SEC	INT64CONST(1000000)
 
+#define NSECS_PER_SEC	INT64CONST(1000000000)
+#define NSECS_PER_USEC	INT64CONST(1000)
+
 /*
  * We allow numeric timezone offsets up to 15:59:59 either way from Greenwich.
  * Currently, the record holders for wackiest offsets in actual use are zones
diff --git a/src/include/fmgr.h b/src/include/fmgr.h
index d349510b7c..5cdf2e17cb 100644
--- a/src/include/fmgr.h
+++ b/src/include/fmgr.h
@@ -280,6 +280,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum);
 #define PG_GETARG_FLOAT4(n)  DatumGetFloat4(PG_GETARG_DATUM(n))
 #define PG_GETARG_FLOAT8(n)  DatumGetFloat8(PG_GETARG_DATUM(n))
 #define PG_GETARG_INT64(n)	 DatumGetInt64(PG_GETARG_DATUM(n))
+#define PG_GETARG_UINT64(n)	 DatumGetUInt64(PG_GETARG_DATUM(n))
 /* use this if you want the raw, possibly-toasted input datum: */
 #define PG_GETARG_RAW_VARLENA_P(n)	((struct varlena *) PG_GETARG_POINTER(n))
 /* use this if you want the input datum de-toasted: */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index d6459327cc..4ac23da654 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -141,6 +141,9 @@ typedef struct timespec instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec))
+
 #else							/* !HAVE_CLOCK_GETTIME */
 
 /* Use gettimeofday() */
@@ -205,6 +208,10 @@ typedef struct timeval instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) (t).tv_usec)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + \
+		(uint64) (t).tv_usec * (uint64) 1000)
+
 #endif							/* HAVE_CLOCK_GETTIME */
 
 #else							/* WIN32 */
@@ -237,6 +244,9 @@ typedef LARGE_INTEGER instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	((uint64) (((double) (t).QuadPart * 1000000.0) / GetTimerFrequency()))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	((uint64) (((double) (t).QuadPart * 1000000000.0) / GetTimerFrequency()))
+
 static inline double
 GetTimerFrequency(void)
 {
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 1ee9000b2b..aeaeb021ef 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -15,8 +15,10 @@
 #define _PROC_H_
 
 #include "access/clog.h"
+#include "access/global_snapshot.h"
 #include "access/xlogdefs.h"
 #include "lib/ilist.h"
+#include "utils/snapshot.h"
 #include "storage/latch.h"
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
@@ -57,6 +59,7 @@ struct XidCache
 #define		PROC_IN_LOGICAL_DECODING	0x10	/* currently doing logical
 												 * decoding outside xact */
 #define		PROC_RESERVED				0x20	/* reserved for procarray */
+#define		PROC_RESERVED2				0x40	/* reserved for procarray */
 
 /* flags reset at EOXact */
 #define		PROC_VACUUM_STATE_MASK \
@@ -203,6 +206,18 @@ struct PGPROC
 	PGPROC	   *lockGroupLeader;	/* lock group leader, if I'm a member */
 	dlist_head	lockGroupMembers;	/* list of members, if I'm a leader */
 	dlist_node	lockGroupLink;	/* my member link, if I'm a member */
+
+	/*
+	 * assignedGlobalCsn holds GlobalCSN for this transaction.  It is generated
+	 * under a ProcArray lock and later is writter to a GlobalCSNLog.  This
+	 * variable defined as atomic only for case of group commit, in all other
+	 * scenarios only backend responsible for this proc entry is working with
+	 * this variable.
+	 */
+	GlobalCSN_atomic assignedGlobalCsn;
+
+	/* Original xmin of this backend before global snapshot was imported */
+	TransactionId originalXmin;
 };
 
 /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index a5c7d0c064..452ae5d547 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -36,6 +36,10 @@
 
 #define		PROCARRAY_SLOTS_XMIN			0x20	/* replication slot xmin,
 													 * catalog_xmin */
+
+#define		PROCARRAY_NON_IMPORTED_XMIN		0x40	/* use originalXmin instead
+													 * of xmin to properly
+													 * maintain gsXidMap */
 /*
  * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching
  * PGXACT->vacuumFlags. Other flags are used for different purposes and
@@ -125,4 +129,8 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
 extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 											TransactionId *catalog_xmin);
 
+extern void ProcArraySetGlobalSnapshotXmin(TransactionId xmin);
+
+extern TransactionId ProcArrayGetGlobalSnapshotXmin(void);
+
 #endif							/* PROCARRAY_H */
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index b28d13ce84..f4768bc6d4 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -127,6 +127,9 @@ extern void AtSubCommit_Snapshot(int level);
 extern void AtSubAbort_Snapshot(int level);
 extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin);
 
+extern GlobalCSN ExportGlobalSnapshot(void);
+extern void ImportGlobalSnapshot(GlobalCSN snap_global_csn);
+
 extern void ImportSnapshot(const char *idstr);
 extern bool XactHasExportedSnapshots(void);
 extern void DeleteAllExportedSnapshotFiles(void);
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 57d2dfaa67..71c92c69f4 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -204,6 +204,14 @@ typedef struct SnapshotData
 
 	TimestampTz whenTaken;		/* timestamp when snapshot was taken */
 	XLogRecPtr	lsn;			/* position in the WAL stream when taken */
+
+	/*
+	 * GlobalCSN for cross-node snapshot isolation support.
+	 * Will be used only if track_global_snapshots is enabled.
+	 */
+	GlobalCSN	global_csn;
+	/* Did we have our own global_csn or imported one from different node */
+	bool		imported_global_csn;
 } SnapshotData;
 
 #endif							/* SNAPSHOT_H */
-- 
2.25.1

0003-postgres_fdw-support-for-global-snapshots.patchtext/x-patch; charset=UTF-8; name=0003-postgres_fdw-support-for-global-snapshots.patchDownload

From 64a39dc351dfd3d624ceca4332b70df3f819bad4 Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Tue, 9 Jun 2020 15:04:44 +0500
Subject: [PATCH 3/3] postgres_fdw-support-for-global-snapshots

---
 contrib/postgres_fdw/Makefile                 |   9 +
 contrib/postgres_fdw/connection.c             | 290 ++++++++++++++++--
 contrib/postgres_fdw/postgres_fdw.c           |  12 +
 contrib/postgres_fdw/postgres_fdw.h           |   2 +
 .../postgres_fdw/t/001_bank_coordinator.pl    | 264 ++++++++++++++++
 .../postgres_fdw/t/002_bank_participant.pl    | 240 +++++++++++++++
 src/test/perl/PostgresNode.pm                 |  35 +++
 7 files changed, 826 insertions(+), 26 deletions(-)
 create mode 100644 contrib/postgres_fdw/t/001_bank_coordinator.pl
 create mode 100644 contrib/postgres_fdw/t/002_bank_participant.pl

diff --git a/contrib/postgres_fdw/Makefile b/contrib/postgres_fdw/Makefile
index ee8a80a392..07091f630e 100644
--- a/contrib/postgres_fdw/Makefile
+++ b/contrib/postgres_fdw/Makefile
@@ -29,3 +29,12 @@ top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/contrib/contrib-global.mk
 endif
+
+# Global makefile will do temp-install for 'check'. Since REGRESS is defined,
+# PGXS (included from contrib-global.mk or directly) will care to add
+# postgres_fdw to it as EXTRA_INSTALL and build pg_regress. It will also
+# actually run pg_regress, so the only thing left is tap tests.
+check: tapcheck
+
+tapcheck: temp-install
+	$(prove_check)
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 52d1fe3563..6745b2ae02 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -12,8 +12,10 @@
  */
 #include "postgres.h"
 
+#include "access/global_snapshot.h"
 #include "access/htup_details.h"
 #include "access/xact.h"
+#include "access/xlog.h" /* GetSystemIdentifier() */
 #include "catalog/pg_user_mapping.h"
 #include "commands/defrem.h"
 #include "mb/pg_wchar.h"
@@ -25,6 +27,8 @@
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
+#include "utils/snapmgr.h"
+#include "utils/snapshot.h"
 #include "utils/syscache.h"
 
 /*
@@ -65,6 +69,21 @@ typedef struct ConnCacheEntry
  */
 static HTAB *ConnectionHash = NULL;
 
+/*
+ * FdwTransactionState
+ *
+ * Holds number of open remote transactions and shared state
+ * needed for all connection entries.
+ */
+typedef struct FdwTransactionState
+{
+	char		*gid;
+	int			nparticipants;
+	GlobalCSN	global_csn;
+	bool		two_phase_commit;
+} FdwTransactionState;
+static FdwTransactionState *fdwTransState;
+
 /* for assigning cursor numbers and prepared statement numbers */
 static unsigned int cursor_number = 0;
 static unsigned int prep_stmt_number = 0;
@@ -72,6 +91,9 @@ static unsigned int prep_stmt_number = 0;
 /* tracks whether any work is needed in callback functions */
 static bool xact_got_connection = false;
 
+/* counter of prepared tx made by this backend */
+static int two_phase_xact_count = 0;
+
 /* prototypes of private functions */
 static PGconn *connect_pg_server(ForeignServer *server, UserMapping *user);
 static void disconnect_pg_server(ConnCacheEntry *entry);
@@ -80,6 +102,7 @@ static void configure_remote_session(PGconn *conn);
 static void do_sql_command(PGconn *conn, const char *sql);
 static void begin_remote_xact(ConnCacheEntry *entry);
 static void pgfdw_xact_callback(XactEvent event, void *arg);
+static void deallocate_prepared_stmts(ConnCacheEntry *entry);
 static void pgfdw_subxact_callback(SubXactEvent event,
 								   SubTransactionId mySubid,
 								   SubTransactionId parentSubid,
@@ -136,6 +159,15 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 									  pgfdw_inval_callback, (Datum) 0);
 	}
 
+	/* allocate FdwTransactionState */
+	if (fdwTransState == NULL)
+	{
+		MemoryContext oldcxt;
+		oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+		fdwTransState = palloc0(sizeof(FdwTransactionState));
+		MemoryContextSwitchTo(oldcxt);
+	}
+
 	/* Set flag that we did GetConnection during the current transaction */
 	xact_got_connection = true;
 
@@ -447,7 +479,8 @@ configure_remote_session(PGconn *conn)
 }
 
 /*
- * Convenience subroutine to issue a non-data-returning SQL command to remote
+ * Convenience subroutine to issue a non-data-returning SQL command or
+ * statement to remote node.
  */
 static void
 do_sql_command(PGconn *conn, const char *sql)
@@ -457,7 +490,8 @@ do_sql_command(PGconn *conn, const char *sql)
 	if (!PQsendQuery(conn, sql))
 		pgfdw_report_error(ERROR, NULL, conn, false, sql);
 	res = pgfdw_get_result(conn, sql);
-	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	if (PQresultStatus(res) != PGRES_COMMAND_OK &&
+			PQresultStatus(res) != PGRES_TUPLES_OK)
 		pgfdw_report_error(ERROR, res, conn, true, sql);
 	PQclear(res);
 }
@@ -485,6 +519,10 @@ begin_remote_xact(ConnCacheEntry *entry)
 		elog(DEBUG3, "starting remote transaction on connection %p",
 			 entry->conn);
 
+		if (UseGlobalSnapshots && (!IsolationUsesXactSnapshot() ||
+								   IsolationIsSerializable()))
+			elog(ERROR, "Global snapshots support only REPEATABLE READ");
+
 		if (IsolationIsSerializable())
 			sql = "START TRANSACTION ISOLATION LEVEL SERIALIZABLE";
 		else
@@ -493,6 +531,23 @@ begin_remote_xact(ConnCacheEntry *entry)
 		do_sql_command(entry->conn, sql);
 		entry->xact_depth = 1;
 		entry->changing_xact_state = false;
+
+		if (UseGlobalSnapshots)
+		{
+			char import_sql[128];
+
+			/* Export our snapshot */
+			if (fdwTransState->global_csn == 0)
+				fdwTransState->global_csn = ExportGlobalSnapshot();
+
+			snprintf(import_sql, sizeof(import_sql),
+				"SELECT pg_global_snapshot_import("UINT64_FORMAT")",
+				fdwTransState->global_csn);
+
+			do_sql_command(entry->conn, import_sql);
+		}
+
+		fdwTransState->nparticipants += 1;
 	}
 
 	/*
@@ -700,6 +755,94 @@ pgfdw_report_error(int elevel, PGresult *res, PGconn *conn,
 	PG_END_TRY();
 }
 
+/* Callback typedef for BroadcastStmt */
+typedef bool (*BroadcastCmdResHandler) (PGresult *result, void *arg);
+
+/* Broadcast sql in parallel to all ConnectionHash entries */
+static bool
+BroadcastStmt(char const * sql, unsigned expectedStatus,
+				BroadcastCmdResHandler handler, void *arg)
+{
+	HASH_SEQ_STATUS scan;
+	ConnCacheEntry *entry;
+	bool		allOk = true;
+
+	/* Broadcast sql */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		pgfdw_reject_incomplete_xact_state_change(entry);
+
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			if (!PQsendQuery(entry->conn, sql))
+			{
+				PGresult   *res = PQgetResult(entry->conn);
+
+				elog(WARNING, "Failed to send command %s", sql);
+				pgfdw_report_error(WARNING, res, entry->conn, true, sql);
+				PQclear(res);
+			}
+		}
+	}
+
+	/* Collect responses */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			PGresult   *result = PQgetResult(entry->conn);
+
+			if (PQresultStatus(result) != expectedStatus ||
+				(handler && !handler(result, arg)))
+			{
+				elog(WARNING, "Failed command %s: status=%d, expected status=%d", sql, PQresultStatus(result), expectedStatus);
+				pgfdw_report_error(ERROR, result, entry->conn, true, sql);
+				allOk = false;
+			}
+			PQclear(result);
+			PQgetResult(entry->conn);	/* consume NULL result */
+		}
+	}
+
+	return allOk;
+}
+
+/* Wrapper for broadcasting commands */
+static bool
+BroadcastCmd(char const *sql)
+{
+	return BroadcastStmt(sql, PGRES_COMMAND_OK, NULL, NULL);
+}
+
+/* Wrapper for broadcasting statements */
+static bool
+BroadcastFunc(char const *sql)
+{
+	return BroadcastStmt(sql, PGRES_TUPLES_OK, NULL, NULL);
+}
+
+/* Callback for selecting maximal csn */
+static bool
+MaxCsnCB(PGresult *result, void *arg)
+{
+	char		   *resp;
+	GlobalCSN	   *max_csn = (GlobalCSN *) arg;
+	GlobalCSN		csn = 0;
+
+	resp = PQgetvalue(result, 0, 0);
+
+	if (resp == NULL || (*resp) == '\0' ||
+			sscanf(resp, UINT64_FORMAT, &csn) != 1)
+		return false;
+
+	if (*max_csn < csn)
+		*max_csn = csn;
+
+	return true;
+}
+
 /*
  * pgfdw_xact_callback --- cleanup at main-transaction end.
  */
@@ -713,6 +856,86 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	if (!xact_got_connection)
 		return;
 
+	/* Handle possible two-phase commit */
+	if (event == XACT_EVENT_PARALLEL_PRE_COMMIT || event == XACT_EVENT_PRE_COMMIT)
+	{
+		bool include_local_tx = false;
+
+		/* Should we take into account this node? */
+		if (TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+		{
+			include_local_tx = true;
+			fdwTransState->nparticipants += 1;
+		}
+
+		/* Switch to 2PC mode there were more than one participant */
+		if (UseGlobalSnapshots && fdwTransState->nparticipants > 1)
+			fdwTransState->two_phase_commit = true;
+
+		if (fdwTransState->two_phase_commit)
+		{
+			GlobalCSN	max_csn = InProgressGlobalCSN,
+						my_csn = InProgressGlobalCSN;
+			bool	res;
+			char   *sql;
+
+			fdwTransState->gid = psprintf("pgfdw:%lld:%llu:%d:%u:%d:%d",
+										  (long long) GetCurrentTimestamp(),
+										  (long long) GetSystemIdentifier(),
+										  MyProcPid,
+										  GetCurrentTransactionIdIfAny(),
+										  ++two_phase_xact_count,
+										  fdwTransState->nparticipants);
+
+			/* Broadcast PREPARE */
+			sql = psprintf("PREPARE TRANSACTION '%s'", fdwTransState->gid);
+			res = BroadcastCmd(sql);
+			if (!res)
+				goto error;
+
+			/* Broadcast pg_global_snapshot_prepare() */
+			if (include_local_tx)
+				my_csn = GlobalSnapshotPrepareCurrent();
+
+			sql = psprintf("SELECT pg_global_snapshot_prepare('%s')",
+														fdwTransState->gid);
+			res = BroadcastStmt(sql, PGRES_TUPLES_OK, MaxCsnCB, &max_csn);
+			if (!res)
+				goto error;
+
+			/* select maximal global csn */
+			if (include_local_tx && my_csn > max_csn)
+				max_csn = my_csn;
+
+			/* Broadcast pg_global_snapshot_assign() */
+			if (include_local_tx)
+				GlobalSnapshotAssignCsnCurrent(max_csn);
+			sql = psprintf("SELECT pg_global_snapshot_assign('%s',"UINT64_FORMAT")",
+							fdwTransState->gid, max_csn);
+			res = BroadcastFunc(sql);
+
+error:
+			if (!res)
+			{
+				sql = psprintf("ABORT PREPARED '%s'", fdwTransState->gid);
+				BroadcastCmd(sql);
+				elog(ERROR, "Failed to PREPARE transaction on remote node");
+			}
+
+			/*
+			 * Do not fall down. Consequent COMMIT event will clean thing up.
+			 */
+			return;
+		}
+	}
+
+	/* COMMIT open transaction of we were doing 2PC */
+	if (fdwTransState->two_phase_commit &&
+		(event == XACT_EVENT_PARALLEL_COMMIT || event == XACT_EVENT_COMMIT))
+	{
+		BroadcastCmd(psprintf("COMMIT PREPARED '%s'", fdwTransState->gid));
+	}
+
 	/*
 	 * Scan all connection cache entries to find open remote transactions, and
 	 * close them.
@@ -720,8 +943,6 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	hash_seq_init(&scan, ConnectionHash);
 	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
 	{
-		PGresult   *res;
-
 		/* Ignore cache entry if no open connection right now */
 		if (entry->conn == NULL)
 			continue;
@@ -738,6 +959,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 			{
 				case XACT_EVENT_PARALLEL_PRE_COMMIT:
 				case XACT_EVENT_PRE_COMMIT:
+					Assert(!fdwTransState->two_phase_commit);
 
 					/*
 					 * If abort cleanup previously failed for this connection,
@@ -750,28 +972,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					do_sql_command(entry->conn, "COMMIT TRANSACTION");
 					entry->changing_xact_state = false;
 
-					/*
-					 * If there were any errors in subtransactions, and we
-					 * made prepared statements, do a DEALLOCATE ALL to make
-					 * sure we get rid of all prepared statements. This is
-					 * annoying and not terribly bulletproof, but it's
-					 * probably not worth trying harder.
-					 *
-					 * DEALLOCATE ALL only exists in 8.3 and later, so this
-					 * constrains how old a server postgres_fdw can
-					 * communicate with.  We intentionally ignore errors in
-					 * the DEALLOCATE, so that we can hobble along to some
-					 * extent with older servers (leaking prepared statements
-					 * as we go; but we don't really support update operations
-					 * pre-8.3 anyway).
-					 */
-					if (entry->have_prep_stmt && entry->have_error)
-					{
-						res = PQexec(entry->conn, "DEALLOCATE ALL");
-						PQclear(res);
-					}
-					entry->have_prep_stmt = false;
-					entry->have_error = false;
+					deallocate_prepared_stmts(entry);
 					break;
 				case XACT_EVENT_PRE_PREPARE:
 
@@ -790,6 +991,11 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					break;
 				case XACT_EVENT_PARALLEL_COMMIT:
 				case XACT_EVENT_COMMIT:
+					if (fdwTransState->two_phase_commit)
+						deallocate_prepared_stmts(entry);
+					else /* Pre-commit should have closed the open transaction */
+						elog(ERROR, "missed cleaning up connection during pre-commit");
+					break;
 				case XACT_EVENT_PREPARE:
 					/* Pre-commit should have closed the open transaction */
 					elog(ERROR, "missed cleaning up connection during pre-commit");
@@ -885,6 +1091,38 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 
 	/* Also reset cursor numbering for next transaction */
 	cursor_number = 0;
+
+	/* Reset fdwTransState */
+	memset(fdwTransState, '\0', sizeof(FdwTransactionState));
+}
+
+/*
+ * If there were any errors in subtransactions, and we
+ * made prepared statements, do a DEALLOCATE ALL to make
+ * sure we get rid of all prepared statements. This is
+ * annoying and not terribly bulletproof, but it's
+ * probably not worth trying harder.
+ *
+ * DEALLOCATE ALL only exists in 8.3 and later, so this
+ * constrains how old a server postgres_fdw can
+ * communicate with.  We intentionally ignore errors in
+ * the DEALLOCATE, so that we can hobble along to some
+ * extent with older servers (leaking prepared statements
+ * as we go; but we don't really support update operations
+ * pre-8.3 anyway).
+ */
+static void
+deallocate_prepared_stmts(ConnCacheEntry *entry)
+{
+	PGresult   *res;
+
+	if (entry->have_prep_stmt && entry->have_error)
+	{
+		res = PQexec(entry->conn, "DEALLOCATE ALL");
+		PQclear(res);
+	}
+	entry->have_prep_stmt = false;
+	entry->have_error = false;
 }
 
 /*
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 9fc53cad68..03c5b0093a 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -301,6 +301,9 @@ typedef struct
 	List	   *already_used;	/* expressions already dealt with */
 } ec_member_foreign_arg;
 
+bool		UseGlobalSnapshots;
+void		_PG_init(void);
+
 /*
  * SQL functions
  */
@@ -6584,3 +6587,12 @@ find_em_expr_for_input_target(PlannerInfo *root,
 	elog(ERROR, "could not find pathkey item to sort");
 	return NULL;				/* keep compiler quiet */
 }
+
+void
+_PG_init(void)
+{
+	DefineCustomBoolVariable("postgres_fdw.use_global_snapshots",
+							 "Use global snapshots for FDW transactions", NULL,
+							 &UseGlobalSnapshots, false, PGC_USERSET, 0, NULL,
+							 NULL, NULL);
+}
diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h
index eef410db39..9d3ea077a1 100644
--- a/contrib/postgres_fdw/postgres_fdw.h
+++ b/contrib/postgres_fdw/postgres_fdw.h
@@ -208,4 +208,6 @@ extern const char *get_jointype_name(JoinType jointype);
 extern bool is_builtin(Oid objectId);
 extern bool is_shippable(Oid objectId, Oid classId, PgFdwRelationInfo *fpinfo);
 
+extern bool UseGlobalSnapshots;
+
 #endif							/* POSTGRES_FDW_H */
diff --git a/contrib/postgres_fdw/t/001_bank_coordinator.pl b/contrib/postgres_fdw/t/001_bank_coordinator.pl
new file mode 100644
index 0000000000..1e31f33349
--- /dev/null
+++ b/contrib/postgres_fdw/t/001_bank_coordinator.pl
@@ -0,0 +1,264 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+my $master = get_new_node("master");
+$master->init;
+$master->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	log_checkpoints = true
+	postgres_fdw.use_global_snapshots = on
+	track_global_snapshots = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$master->start;
+
+my $shard1 = get_new_node("shard1");
+$shard1->init;
+$shard1->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+));
+$shard1->start;
+
+my $shard2 = get_new_node("shard2");
+$shard2->init;
+$shard2->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+));
+$shard2->start;
+
+###############################################################################
+# Prepare nodes
+###############################################################################
+
+$master->safe_psql('postgres', qq[
+	CREATE EXTENSION postgres_fdw;
+	CREATE TABLE accounts(id integer primary key, amount integer);
+	CREATE TABLE global_transactions(tx_time timestamp);
+]);
+
+foreach my $node ($shard1, $shard2)
+{
+	my $port = $node->port;
+	my $host = $node->host;
+
+	$node->safe_psql('postgres',
+			"CREATE TABLE accounts(id integer primary key, amount integer)");
+
+	$master->safe_psql('postgres', qq[
+		CREATE SERVER shard_$port FOREIGN DATA WRAPPER postgres_fdw options(dbname 'postgres', host '$host', port '$port');
+		CREATE FOREIGN TABLE accounts_fdw_$port() inherits (accounts) server shard_$port options(table_name 'accounts');
+		CREATE USER MAPPING for CURRENT_USER SERVER shard_$port;
+	])
+}
+
+$shard1->safe_psql('postgres', qq[
+	insert into accounts select 2*id-1, 0 from generate_series(1, 10010) as id;
+	CREATE TABLE local_transactions(tx_time timestamp);
+]);
+
+$shard2->safe_psql('postgres', qq[
+	insert into accounts select 2*id, 0 from generate_series(1, 10010) as id;
+	CREATE TABLE local_transactions(tx_time timestamp);
+]);
+
+diag("master: @{[$master->connstr('postgres')]}");
+diag("shard1: @{[$shard1->connstr('postgres')]}");
+diag("shard2: @{[$shard2->connstr('postgres')]}");
+
+###############################################################################
+# pgbench scripts
+###############################################################################
+
+my $bank = File::Temp->new();
+append_to_file($bank, q{
+	\set id random(1, 20000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = :id RETURNING *)
+		INSERT into global_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (:id + 1);
+	COMMIT;
+});
+
+my $bank1 = File::Temp->new();
+append_to_file($bank1, q{
+	\set id random(1, 10000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = (2*:id + 1) RETURNING *)
+		INSERT into local_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (2*:id + 3);
+	COMMIT;
+});
+
+my $bank2 = File::Temp->new();
+append_to_file($bank2, q{
+	\set id random(1, 10000)
+
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = 2*:id RETURNING *)
+		INSERT into local_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (2*:id + 2);
+	COMMIT;
+});
+
+###############################################################################
+# Helpers
+###############################################################################
+
+sub count_and_delete_rows
+{
+	my ($node, $table) = @_;
+	my $count;
+
+	$count = $node->safe_psql('postgres',"select count(*) from $table");
+	$node->safe_psql('postgres',"delete from $table");
+	diag($node->name, ": completed $count transactions");
+	return $count;
+}
+
+###############################################################################
+# Concurrent global transactions
+###############################################################################
+
+my ($err, $rc);
+my $started;
+my $seconds = 30;
+my $selects;
+my $total = '0';
+my $oldtotal = '0';
+my $isolation_errors = 0;
+
+
+my $pgb_handle;
+
+$pgb_handle = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+while (time() - $started < $seconds)
+{
+	$total = $master->safe_psql('postgres', "select sum(amount) from accounts");
+	if ( ($total ne $oldtotal) and ($total ne '') )
+	{
+		$isolation_errors++;
+		$oldtotal = $total;
+		diag("Isolation error. Total = $total");
+	}
+	if ($total ne '') { $selects++; }
+}
+
+$master->pgbench_await($pgb_handle);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction');
+
+###############################################################################
+# Concurrent global and local transactions
+###############################################################################
+
+my ($pgb_handle1, $pgb_handle2, $pgb_handle3);
+
+# global txses
+$pgb_handle1 = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+# concurrent local
+$pgb_handle2 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank1, 'postgres' );
+$pgb_handle3 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank2, 'postgres' );
+
+$started = time();
+$selects = 0;
+$oldtotal = 0;
+while (time() - $started < $seconds)
+{
+	$total = $master->safe_psql('postgres', "select sum(amount) from accounts");
+	if ( ($total ne $oldtotal) and ($total ne '') )
+	{
+		$isolation_errors++;
+		$oldtotal = $total;
+		diag("Isolation error. Total = $total");
+	}
+	if ($total ne '') { $selects++; }
+}
+
+diag("selects = $selects");
+$master->pgbench_await($pgb_handle1);
+$shard1->pgbench_await($pgb_handle2);
+$shard2->pgbench_await($pgb_handle3);
+
+diag("completed $selects selects");
+die "" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard1, 'local_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'local_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global and local transactions');
+
+
+###############################################################################
+# Snapshot stability
+###############################################################################
+
+my ($hashes, $hash1, $hash2);
+my $stability_errors = 0;
+
+# global txses
+$pgb_handle1 = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+# concurrent local
+$pgb_handle2 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank1, 'postgres' );
+$pgb_handle3 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank2, 'postgres' );
+
+$selects = 0;
+$started = time();
+while (time() - $started < $seconds)
+{
+	foreach my $node ($master, $shard1, $shard2)
+	{
+		($hash1, $_, $hash2) = split "\n", $node->safe_psql('postgres', qq[
+			begin isolation level repeatable read;
+			select md5(array_agg((t.*)::text)::text) from (select * from accounts order by id) as t;
+			select pg_sleep(3);
+			select md5(array_agg((t.*)::text)::text) from (select * from accounts order by id) as t;
+			commit;
+		]);
+
+		if ($hash1 ne $hash2)
+		{
+			diag("oops");
+			$stability_errors++;
+		}
+		elsif ($hash1 eq '' or $hash2 eq '')
+		{
+			die;
+		}
+		else
+		{
+			$selects++;
+		}
+	}
+}
+
+$master->pgbench_await($pgb_handle1);
+$shard1->pgbench_await($pgb_handle2);
+$shard2->pgbench_await($pgb_handle3);
+
+die "" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard1, 'local_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'local_transactions') > 0);
+
+is($stability_errors, 0, 'snapshot is stable during concurrent global and local transactions');
+
+$master->stop;
+$shard1->stop;
+$shard2->stop;
diff --git a/contrib/postgres_fdw/t/002_bank_participant.pl b/contrib/postgres_fdw/t/002_bank_participant.pl
new file mode 100644
index 0000000000..04a2f1ba85
--- /dev/null
+++ b/contrib/postgres_fdw/t/002_bank_participant.pl
@@ -0,0 +1,240 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+my $shard1 = get_new_node("shard1");
+$shard1->init;
+$shard1->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	postgres_fdw.use_global_snapshots = on
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$shard1->start;
+
+my $shard2 = get_new_node("shard2");
+$shard2->init;
+$shard2->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	postgres_fdw.use_global_snapshots = on
+	global_snapshot_defer_time = 15
+	track_global_snapshots = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$shard2->start;
+
+###############################################################################
+# Prepare nodes
+###############################################################################
+
+my @shards = ($shard1, $shard2);
+
+foreach my $node (@shards)
+{
+	$node->safe_psql('postgres', qq[
+		CREATE EXTENSION postgres_fdw;
+		CREATE TABLE accounts(id integer primary key, amount integer);
+		CREATE TABLE accounts_local() inherits(accounts);
+		CREATE TABLE global_transactions(tx_time timestamp);
+		CREATE TABLE local_transactions(tx_time timestamp);
+	]);
+
+	foreach my $neighbor (@shards)
+	{
+		next if ($neighbor eq $node);
+
+		my $port = $neighbor->port;
+		my $host = $neighbor->host;
+
+		$node->safe_psql('postgres', qq[
+			CREATE SERVER shard_$port FOREIGN DATA WRAPPER postgres_fdw
+					options(dbname 'postgres', host '$host', port '$port');
+			CREATE FOREIGN TABLE accounts_fdw_$port() inherits (accounts)
+					server shard_$port options(table_name 'accounts_local');
+			CREATE USER MAPPING for CURRENT_USER SERVER shard_$port;
+		]);
+	}
+}
+
+$shard1->psql('postgres', "insert into accounts_local select 2*id-1, 0 from generate_series(1, 10010) as id;");
+$shard2->psql('postgres', "insert into accounts_local select 2*id,   0 from generate_series(1, 10010) as id;");
+
+###############################################################################
+# pgbench scripts
+###############################################################################
+
+my $bank = File::Temp->new();
+append_to_file($bank, q{
+	\set id random(1, 20000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = :id RETURNING *)
+		INSERT into global_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (:id + 1);
+	COMMIT;
+});
+
+###############################################################################
+# Helpers
+###############################################################################
+
+sub count_and_delete_rows
+{
+	my ($node, $table) = @_;
+	my $count;
+
+	$count = $node->safe_psql('postgres',"select count(*) from $table");
+	$node->safe_psql('postgres',"delete from $table");
+	diag($node->name, ": completed $count transactions");
+	return $count;
+}
+
+###############################################################################
+# Concurrent global transactions
+###############################################################################
+
+my ($err, $rc);
+my $started;
+my $seconds = 30;
+my $selects;
+my $total = '0';
+my $oldtotal = '0';
+my $isolation_errors = 0;
+my $i;
+
+
+my ($pgb_handle1, $pgb_handle2);
+
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction');
+
+###############################################################################
+# And do the same after soft restart
+###############################################################################
+
+$shard1->restart;
+$shard2->restart;
+$shard1->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard1 to became online";
+$shard2->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard2 to became online";
+
+$seconds = 15;
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction after restart');
+
+###############################################################################
+# And do the same after hard restart
+###############################################################################
+
+$shard1->teardown_node;
+$shard2->teardown_node;
+$shard1->start;
+$shard2->start;
+$shard1->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard1 to became online";
+$shard2->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard2 to became online";
+
+
+$seconds = 15;
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction after hard restart');
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 1407359aef..247a21155f 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -2129,6 +2129,41 @@ sub pg_recvlogical_upto
 	}
 }
 
+sub pgbench()
+{
+	my ($self, $node, @args) = @_;
+	my $pgbench_handle = $self->pgbench_async($node, @args);
+	$self->pgbench_await($pgbench_handle);
+}
+
+sub pgbench_async()
+{
+	my ($self, @args) = @_;
+
+	my ($in, $out, $err, $rc);
+	$in = '';
+	$out = '';
+
+	my @pgbench_command = (
+		'pgbench',
+		-h => $self->host,
+		-p => $self->port,
+		@args
+	);
+	my $handle = IPC::Run::start(\@pgbench_command, $in, $out);
+	return $handle;
+}
+
+sub pgbench_await()
+{
+	my ($self, $pgbench_handle) = @_;
+
+	# During run some pgbench threads can exit (for example due to
+	# serialization error). That will set non-zero returning code.
+	# So don't check return code here and leave it to a caller.
+	my $rc = IPC::Run::finish($pgbench_handle);
+}
+
 =pod
 
 =back
-- 
2.25.1

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Andrey V. Lepikhov (#3)

On Wed, Jun 10, 2020 at 8:36 AM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

On 09.06.2020 11:41, Fujii Masao wrote:

The patches seem not to be registered in CommitFest yet.
Are you planning to do that?

Not now. It is a sharding-related feature. I'm not sure that this
approach is fully consistent with the sharding way now.

Can you please explain in detail, why you think so? There is no
commit message explaining what each patch does so it is difficult to
understand why you said so? Also, can you let us know if this
supports 2PC in some way and if so how is it different from what the
other thread on the same topic [1]/messages/by-id/CA+fd4k4v+KdofMyN+jnOia8-7rto8tsh9Zs3dd7kncvHp12WYw@mail.gmail.com is trying to achieve? Also, I
would like to know if the patch related to CSN based snapshot [2]/messages/by-id/2020061911294657960322@highgo.ca is a
precursor for this, if not, then is it any way related to this patch
because I see the latest reply on that thread [2]/messages/by-id/2020061911294657960322@highgo.ca which says it is an
infrastructure of sharding feature but I don't understand completely
whether these patches are related?

Basically, there seem to be three threads, first, this one and then
[1]: /messages/by-id/CA+fd4k4v+KdofMyN+jnOia8-7rto8tsh9Zs3dd7kncvHp12WYw@mail.gmail.com
there is no clear explanation anywhere if these are anyway related or
whether combining all these three we are aiming for a solution for
atomic commit and atomic visibility.

I am not sure if you know answers to all these questions so I added
the people who seem to be working on the other two patches. I am also
afraid that if there is any duplicate or conflicting work going on in
these threads so we should try to find that as well.

[1]: /messages/by-id/CA+fd4k4v+KdofMyN+jnOia8-7rto8tsh9Zs3dd7kncvHp12WYw@mail.gmail.com
[2]: /messages/by-id/2020061911294657960322@highgo.ca

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

Andrey V. Lepikhov

a.lepikhov@postgrespro.ru

over 5 years ago

In reply to: Amit Kapila (#4)

On 6/19/20 11:48 AM, Amit Kapila wrote:

On Wed, Jun 10, 2020 at 8:36 AM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

On 09.06.2020 11:41, Fujii Masao wrote:

The patches seem not to be registered in CommitFest yet.
Are you planning to do that?

Not now. It is a sharding-related feature. I'm not sure that this
approach is fully consistent with the sharding way now.

Can you please explain in detail, why you think so? There is no
commit message explaining what each patch does so it is difficult to
understand why you said so?

For now I used this patch set for providing correct visibility in the
case of access to the table with foreign partitions from many nodes in
parallel. So I saw at this patch set as a sharding-related feature, but
[1]: /messages/by-id/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
CSN-based approach has weak points such as:
1. Dependency on clocks synchronization
2. Needs guarantees of monotonically increasing of the CSN in the case
of an instance restart/crash etc.
3. We need to delay increasing of OldestXmin because it can be needed
for a transaction snapshot at another node.
So I do not have full conviction that it will be better than a single
distributed transaction manager.
Also, can you let us know if this

supports 2PC in some way and if so how is it different from what the
other thread on the same topic [1] is trying to achieve?

Yes, the patch '0003-postgres_fdw-support-for-global-snapshots' contains
2PC machinery. Now I'd not judge which approach is better.
Also, I

would like to know if the patch related to CSN based snapshot [2] is a
precursor for this, if not, then is it any way related to this patch
because I see the latest reply on that thread [2] which says it is an
infrastructure of sharding feature but I don't understand completely
whether these patches are related?

I need some time to study this patch. At first sight it is different.

Basically, there seem to be three threads, first, this one and then
[1] and [2] which seems to be doing the work for sharding feature but
there is no clear explanation anywhere if these are anyway related or
whether combining all these three we are aiming for a solution for
atomic commit and atomic visibility.

It can be useful to study all approaches.

I am not sure if you know answers to all these questions so I added
the people who seem to be working on the other two patches. I am also
afraid that if there is any duplicate or conflicting work going on in
these threads so we should try to find that as well.

[1] - /messages/by-id/CA+fd4k4v+KdofMyN+jnOia8-7rto8tsh9Zs3dd7kncvHp12WYw@mail.gmail.com
[2] - /messages/by-id/2020061911294657960322@highgo.ca

[1]: /messages/by-id/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
/messages/by-id/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de

--
Andrey Lepikhov
Postgres Professional
https://postgrespro.com

movead.li@highgo.ca

over 5 years ago

In reply to: Andrey Lepikhov (#1)

would like to know if the patch related to CSN based snapshot [2] is a
precursor for this, if not, then is it any way related to this patch
because I see the latest reply on that thread [2] which says it is an
infrastructure of sharding feature but I don't understand completely
whether these patches are related?

I need some time to study this patch. At first sight it is different.

This patch[2] is almost base on [3]/messages/by-id/21BC916B-80A1-43BF-8650-3363CCDAE09C@postgrespro.ru, because I think [1] is talking about 2PC
and FDW, so this patch focus on CSN only and I detach the global snapshot
part and FDW part from the [1] patch.

I notice CSN will not survival after a restart in [1] patch, I think it may not the
right way, may be it is what in last mail "Needs guarantees of monotonically
increasing of the CSN in the case of an instance restart/crash etc" so I try to
add wal support for CSN on this patch.

That's why this thread exist.

[1] - /messages/by-id/CA+fd4k4v+KdofMyN+jnOia8-7rto8tsh9Zs3dd7kncvHp12WYw@mail.gmail.com
[2] - /messages/by-id/2020061911294657960322@highgo.ca

[3]: /messages/by-id/21BC916B-80A1-43BF-8650-3363CCDAE09C@postgrespro.ru

Regards,
Highgo Software (Canada/China/Pakistan)
URL : www.highgo.ca
EMAIL: mailto:movead(dot)li(at)highgo(dot)ca

Bruce Momjian

bruce@momjian.us

over 5 years ago

In reply to: movead.li@highgo.ca (#6)

On Fri, Jun 19, 2020 at 05:03:20PM +0800, movead.li@highgo.ca wrote:

would like to know if the patch related to CSN based snapshot [2] is a
precursor for this, if not, then is it any way related to this patch
because I see the latest reply on that thread [2] which says it is an
infrastructure of sharding feature but I don't understand completely
whether these patches are related?

I need some time to study this patch.. At first sight it is different.

This patch[2] is almost base on [3], because I think [1] is talking about 2PC
and FDW, so this patch focus on CSN only and I detach the global snapshot
part and FDW part from the [1] patch.

I notice CSN will not survival after a restart in [1] patch, I think it may not
the
right way, may be it is what in last mail "Needs guarantees of monotonically
increasing of the CSN in the case of an instance restart/crash etc" so I try to
add wal support for CSN on this patch.

That's why this thread exist.

I was certainly missing how these items fit together. Sharding needs
parallel FDWs, atomic commits, and atomic snapshots. To get atomic
snapshots, we need CSN. This new sharding wiki pages has more details:

https://wiki.postgresql.org/wiki/WIP_PostgreSQL_Sharding

After all that is done, we will need optimizer improvements and shard
management tooling.

--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EnterpriseDB https://enterprisedb.com

The usefulness of a cup is in its emptiness, Bruce Lee

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Andrey V. Lepikhov (#5)

On Fri, Jun 19, 2020 at 1:42 PM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

On 6/19/20 11:48 AM, Amit Kapila wrote:

On Wed, Jun 10, 2020 at 8:36 AM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

On 09.06.2020 11:41, Fujii Masao wrote:

The patches seem not to be registered in CommitFest yet.
Are you planning to do that?

Not now. It is a sharding-related feature. I'm not sure that this
approach is fully consistent with the sharding way now.

Can you please explain in detail, why you think so? There is no
commit message explaining what each patch does so it is difficult to
understand why you said so?

For now I used this patch set for providing correct visibility in the
case of access to the table with foreign partitions from many nodes in
parallel. So I saw at this patch set as a sharding-related feature, but
[1] shows another useful application.
CSN-based approach has weak points such as:
1. Dependency on clocks synchronization
2. Needs guarantees of monotonically increasing of the CSN in the case
of an instance restart/crash etc.
3. We need to delay increasing of OldestXmin because it can be needed
for a transaction snapshot at another node.

So, is anyone working on improving these parts of the patch. AFAICS
from what Bruce has shared [1], some people from HighGo are working on
it but I don't see any discussion of that yet.

So I do not have full conviction that it will be better than a single
distributed transaction manager.

When you say "single distributed transaction manager" do you mean
something like pg_dtm which is inspired by Postgres-XL?

Also, can you let us know if this

supports 2PC in some way and if so how is it different from what the
other thread on the same topic [1] is trying to achieve?

Yes, the patch '0003-postgres_fdw-support-for-global-snapshots' contains
2PC machinery. Now I'd not judge which approach is better.

Yeah, I have studied both the approaches a little and I feel the main
difference seems to be that in this patch atomicity is tightly coupled
with how we achieve global visibility, basically in this patch "all
running transactions are marked as InDoubt on all nodes in prepare
phase, and after that, each node commit it and stamps each xid with a
given GlobalCSN.". There are no separate APIs for
prepare/commit/rollback exposed by postgres_fdw as we do it in the
approach followed by Sawada-San's patch. It seems to me in the patch
in this email one of postgres_fdw node can be a sort of coordinator
which prepares and commit the transaction on all other nodes whereas
that is not true in Sawada-San's patch (where the coordinator is a
local Postgres node, am I right Sawada-San?). OTOH, Sawada-San's
patch has advanced concepts like a resolver process that can
commit/abort the transactions later. I couldn't still get a complete
grip of both patches so difficult to say which is better approach but
I think at the least we should have some discussion.

I feel if Sawada-San or someone involved in another patch also once
studies this approach and try to come up with some form of comparison
then we might be able to make better decision. It is possible that
there are few good things in each approach which we can use.

Also, I

would like to know if the patch related to CSN based snapshot [2] is a
precursor for this, if not, then is it any way related to this patch
because I see the latest reply on that thread [2] which says it is an
infrastructure of sharding feature but I don't understand completely
whether these patches are related?

I need some time to study this patch. At first sight it is different.

I feel the opposite. I think it has extracted some stuff from this
patch series and extended the same.

Thanks for the inputs. I feel inputs from you and others who were
involved in this project will be really helpful to move this project
forward.

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Amit Kapila (#8)

On Sat, Jun 20, 2020 at 5:51 PM Amit Kapila <amit.kapila16@gmail.com> wrote:

So, is anyone working on improving these parts of the patch. AFAICS
from what Bruce has shared [1],

oops, forgot to share the link [1] -
https://wiki.postgresql.org/wiki/WIP_PostgreSQL_Sharding

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#10

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Bruce Momjian (#7)

On Fri, Jun 19, 2020 at 6:33 PM Bruce Momjian <bruce@momjian.us> wrote:

On Fri, Jun 19, 2020 at 05:03:20PM +0800, movead.li@highgo.ca wrote:

would like to know if the patch related to CSN based snapshot [2] is a
precursor for this, if not, then is it any way related to this patch
because I see the latest reply on that thread [2] which says it is an
infrastructure of sharding feature but I don't understand completely
whether these patches are related?

I need some time to study this patch.. At first sight it is different.

This patch[2] is almost base on [3], because I think [1] is talking about 2PC
and FDW, so this patch focus on CSN only and I detach the global snapshot
part and FDW part from the [1] patch.

I notice CSN will not survival after a restart in [1] patch, I think it may not
the
right way, may be it is what in last mail "Needs guarantees of monotonically
increasing of the CSN in the case of an instance restart/crash etc" so I try to
add wal support for CSN on this patch.

That's why this thread exist.

I was certainly missing how these items fit together. Sharding needs
parallel FDWs, atomic commits, and atomic snapshots. To get atomic
snapshots, we need CSN. This new sharding wiki pages has more details:

https://wiki.postgresql.org/wiki/WIP_PostgreSQL_Sharding

Thanks for maintaining this page. It is quite helpful!

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#11

Bruce Momjian

bruce@momjian.us

over 5 years ago

In reply to: Amit Kapila (#10)

On Sat, Jun 20, 2020 at 05:54:18PM +0530, Amit Kapila wrote:

On Fri, Jun 19, 2020 at 6:33 PM Bruce Momjian <bruce@momjian.us> wrote:

On Fri, Jun 19, 2020 at 05:03:20PM +0800, movead.li@highgo.ca wrote:

would like to know if the patch related to CSN based snapshot [2] is a
precursor for this, if not, then is it any way related to this patch
because I see the latest reply on that thread [2] which says it is an
infrastructure of sharding feature but I don't understand completely
whether these patches are related?

I need some time to study this patch.. At first sight it is different.

This patch[2] is almost base on [3], because I think [1] is talking about 2PC
and FDW, so this patch focus on CSN only and I detach the global snapshot
part and FDW part from the [1] patch.

I notice CSN will not survival after a restart in [1] patch, I think it may not
the
right way, may be it is what in last mail "Needs guarantees of monotonically
increasing of the CSN in the case of an instance restart/crash etc" so I try to
add wal support for CSN on this patch.

That's why this thread exist.

I was certainly missing how these items fit together. Sharding needs
parallel FDWs, atomic commits, and atomic snapshots. To get atomic
snapshots, we need CSN. This new sharding wiki pages has more details:

https://wiki.postgresql.org/wiki/WIP_PostgreSQL_Sharding

Thanks for maintaining this page. It is quite helpful!

Ahsan Hadi <ahsan.hadi@highgo.ca> created that page, and I just made a
few wording edits. Ahsan is copying information from this older
sharding wiki page:

https://wiki.postgresql.org/wiki/Built-in_Sharding

to the new one you listed above.

--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EnterpriseDB https://enterprisedb.com

The usefulness of a cup is in its emptiness, Bruce Lee

#12

Bruce Momjian

bruce@momjian.us

over 5 years ago

In reply to: Amit Kapila (#8)

On Sat, Jun 20, 2020 at 05:51:21PM +0530, Amit Kapila wrote:

I feel if Sawada-San or someone involved in another patch also once
studies this approach and try to come up with some form of comparison
then we might be able to make better decision. It is possible that
there are few good things in each approach which we can use.

Agreed. Postgres-XL code is under the Postgres license:

Postgres-XL is released under the PostgreSQL License, a liberal Open
Source license, similar to the BSD or MIT licenses.

and even says they want it moved into Postgres core:

https://www.postgres-xl.org/2017/08/postgres-xl-9-5-r1-6-announced/

Postgres-XL is a massively parallel database built on top of,
and very closely compatible with PostgreSQL 9.5 and its set of advanced
features. Postgres-XL is fully open source and many parts of it will
feed back directly or indirectly into later releases of PostgreSQL, as
we begin to move towards a fully parallel sharded version of core PostgreSQL.

so we should understand what can be used from it.

--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EnterpriseDB https://enterprisedb.com

The usefulness of a cup is in its emptiness, Bruce Lee

#13

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Bruce Momjian (#12)

On Mon, Jun 22, 2020 at 8:36 PM Bruce Momjian <bruce@momjian.us> wrote:

On Sat, Jun 20, 2020 at 05:51:21PM +0530, Amit Kapila wrote:

I feel if Sawada-San or someone involved in another patch also once
studies this approach and try to come up with some form of comparison
then we might be able to make better decision. It is possible that
there are few good things in each approach which we can use.

Agreed. Postgres-XL code is under the Postgres license:

Postgres-XL is released under the PostgreSQL License, a liberal Open
Source license, similar to the BSD or MIT licenses.

and even says they want it moved into Postgres core:

https://www.postgres-xl.org/2017/08/postgres-xl-9-5-r1-6-announced/

Postgres-XL is a massively parallel database built on top of,
and very closely compatible with PostgreSQL 9.5 and its set of advanced
features. Postgres-XL is fully open source and many parts of it will
feed back directly or indirectly into later releases of PostgreSQL, as
we begin to move towards a fully parallel sharded version of core PostgreSQL.

so we should understand what can be used from it.

+1. I think that will be quite useful.

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#14

Masahiko Sawada

masahiko.sawada@2ndquadrant.com

over 5 years ago

In reply to: Amit Kapila (#8)

On Sat, 20 Jun 2020 at 21:21, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Fri, Jun 19, 2020 at 1:42 PM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

On 6/19/20 11:48 AM, Amit Kapila wrote:

On Wed, Jun 10, 2020 at 8:36 AM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

On 09.06.2020 11:41, Fujii Masao wrote:

The patches seem not to be registered in CommitFest yet.
Are you planning to do that?

Not now. It is a sharding-related feature. I'm not sure that this
approach is fully consistent with the sharding way now.

Can you please explain in detail, why you think so? There is no
commit message explaining what each patch does so it is difficult to
understand why you said so?

For now I used this patch set for providing correct visibility in the
case of access to the table with foreign partitions from many nodes in
parallel. So I saw at this patch set as a sharding-related feature, but
[1] shows another useful application.
CSN-based approach has weak points such as:
1. Dependency on clocks synchronization
2. Needs guarantees of monotonically increasing of the CSN in the case
of an instance restart/crash etc.
3. We need to delay increasing of OldestXmin because it can be needed
for a transaction snapshot at another node.

So, is anyone working on improving these parts of the patch. AFAICS
from what Bruce has shared [1], some people from HighGo are working on
it but I don't see any discussion of that yet.

So I do not have full conviction that it will be better than a single
distributed transaction manager.

When you say "single distributed transaction manager" do you mean
something like pg_dtm which is inspired by Postgres-XL?

Also, can you let us know if this

supports 2PC in some way and if so how is it different from what the
other thread on the same topic [1] is trying to achieve?

Yes, the patch '0003-postgres_fdw-support-for-global-snapshots' contains
2PC machinery. Now I'd not judge which approach is better.

Sorry for being late.

Yeah, I have studied both the approaches a little and I feel the main
difference seems to be that in this patch atomicity is tightly coupled
with how we achieve global visibility, basically in this patch "all
running transactions are marked as InDoubt on all nodes in prepare
phase, and after that, each node commit it and stamps each xid with a
given GlobalCSN.". There are no separate APIs for
prepare/commit/rollback exposed by postgres_fdw as we do it in the
approach followed by Sawada-San's patch. It seems to me in the patch
in this email one of postgres_fdw node can be a sort of coordinator
which prepares and commit the transaction on all other nodes whereas
that is not true in Sawada-San's patch (where the coordinator is a
local Postgres node, am I right Sawada-San?).

Yeah, where to manage foreign transactions is different: postgres_fdw
manages foreign transactions in this patch whereas the PostgreSQL core
does that in that 2PC patch.

I feel if Sawada-San or someone involved in another patch also once
studies this approach and try to come up with some form of comparison
then we might be able to make better decision. It is possible that
there are few good things in each approach which we can use.

I studied this patch and did a simple comparison between this patch
(0002 patch) and my 2PC patch.

In terms of atomic commit, the features that are not implemented in
this patch but in the 2PC patch are:

* Crash safe.
* PREPARE TRANSACTION command support.
* Query cancel during waiting for the commit.
* Automatically in-doubt transaction resolution.

On the other hand, the feature that is implemented in this patch but
not in the 2PC patch is:

* Executing PREPARE TRANSACTION (and other commands) in parallel

When the 2PC patch was proposed, IIRC it was like this patch (0002
patch). I mean, it changed only postgres_fdw to support 2PC. But after
discussion, we changed the approach to have the core manage foreign
transaction for crash-safe. From my perspective, this patch has a
minimum implementation of 2PC to work the global snapshot feature and
has some missing features important for supporting crash-safe atomic
commit. So I personally think we should consider how to integrate this
global snapshot feature with the 2PC patch, rather than improving this
patch if we want crash-safe atomic commit.

Looking at the commit procedure with this patch:

When starting a new transaction on a foreign server, postgres_fdw
executes pg_global_snapshot_import() to import the global snapshot.
After some work, in pre-commit phase we do:

1. generate global transaction id, say 'gid'
2. execute PREPARE TRANSACTION 'gid' on all participants.
3. prepare global snapshot locally, if the local node also involves
the transaction
4. execute pg_global_snapshot_prepare('gid') for all participants

During step 2 to 4, we calculate the maximum CSN from the CSNs
returned from each pg_global_snapshot_prepare() executions.

5. assign global snapshot locally, if the local node also involves the
transaction
6. execute pg_global_snapshot_assign('gid', max-csn) on all participants.

Then, we commit locally (i.g. mark the current transaction as
committed in clog).

After that, in post-commit phase, execute COMMIT PREPARED 'gid' on all
participants.

Considering how to integrate this global snapshot feature with the 2PC
patch, what the 2PC patch needs to at least change is to allow FDW to
store an FDW-private data that is passed to subsequent FDW transaction
API calls. Currently, in the current 2PC patch, we call Prepare API
for each participant servers one by one, and the core pass only
metadata such as ForeignServer, UserMapping, and global transaction
identifier. So it's not easy to calculate the maximum CSN across
multiple transaction API calls. I think we can change the 2PC patch to
add a void pointer into FdwXactRslvState, struct passed from the core,
in order to store FDW-private data. It's going to be the maximum CSN
in this case. That way, at the first Prepare API calls postgres_fdw
allocates the space and stores CSN to that space. And at subsequent
Prepare API calls it can calculate the maximum of csn, and then is
able to the step 3 to 6 when preparing the transaction on the last
participant. Another idea would be to change 2PC patch so that the
core passes a bunch of participants grouped by FDW.

I’ve not read this patch deeply yet and have considered it without any
coding but my first feeling is not hard to integrate this feature with
the 2PC patch.

Regards,

--
Masahiko Sawada http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#15

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Masahiko Sawada (#14)

On Fri, Jul 3, 2020 at 12:18 PM Masahiko Sawada
<masahiko.sawada@2ndquadrant.com> wrote:

On Sat, 20 Jun 2020 at 21:21, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Fri, Jun 19, 2020 at 1:42 PM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

Also, can you let us know if this

supports 2PC in some way and if so how is it different from what the
other thread on the same topic [1] is trying to achieve?

Yes, the patch '0003-postgres_fdw-support-for-global-snapshots' contains
2PC machinery. Now I'd not judge which approach is better.

Sorry for being late.

No problem, your summarization, and comparisons of both approaches are
quite helpful.

I studied this patch and did a simple comparison between this patch
(0002 patch) and my 2PC patch.

In terms of atomic commit, the features that are not implemented in
this patch but in the 2PC patch are:

* Crash safe.
* PREPARE TRANSACTION command support.
* Query cancel during waiting for the commit.
* Automatically in-doubt transaction resolution.

On the other hand, the feature that is implemented in this patch but
not in the 2PC patch is:

* Executing PREPARE TRANSACTION (and other commands) in parallel

When the 2PC patch was proposed, IIRC it was like this patch (0002
patch). I mean, it changed only postgres_fdw to support 2PC. But after
discussion, we changed the approach to have the core manage foreign
transaction for crash-safe. From my perspective, this patch has a
minimum implementation of 2PC to work the global snapshot feature and
has some missing features important for supporting crash-safe atomic
commit. So I personally think we should consider how to integrate this
global snapshot feature with the 2PC patch, rather than improving this
patch if we want crash-safe atomic commit.

Okay, but isn't there some advantage with this approach (manage 2PC at
postgres_fdw level) as well which is that any node will be capable of
handling global transactions rather than doing them via central
coordinator? I mean any node can do writes or reads rather than
probably routing them (at least writes) via coordinator node. Now, I
agree that even if this advantage is there in the current approach, we
can't lose the crash-safety aspect of other approach. Will you be
able to summarize what was the problem w.r.t crash-safety and how your
patch has dealt it?

Looking at the commit procedure with this patch:

When starting a new transaction on a foreign server, postgres_fdw
executes pg_global_snapshot_import() to import the global snapshot.
After some work, in pre-commit phase we do:

1. generate global transaction id, say 'gid'
2. execute PREPARE TRANSACTION 'gid' on all participants.
3. prepare global snapshot locally, if the local node also involves
the transaction
4. execute pg_global_snapshot_prepare('gid') for all participants

During step 2 to 4, we calculate the maximum CSN from the CSNs
returned from each pg_global_snapshot_prepare() executions.

5. assign global snapshot locally, if the local node also involves the
transaction
6. execute pg_global_snapshot_assign('gid', max-csn) on all participants.

Then, we commit locally (i.g. mark the current transaction as
committed in clog).

After that, in post-commit phase, execute COMMIT PREPARED 'gid' on all
participants.

As per my current understanding, the overall idea is as follows. For
global transactions, pg_global_snapshot_prepare('gid') will set the
transaction status as InDoubt and generate CSN (let's call it NodeCSN)
at the node where that function is executed, it also returns the
NodeCSN to the coordinator. Then the coordinator (the current
postgres_fdw node on which write transaction is being executed)
computes MaxCSN based on the return value (NodeCSN) of prepare
(pg_global_snapshot_prepare) from all nodes. It then assigns MaxCSN
to each node. Finally, when Commit Prepared is issued for each node
that MaxCSN will be written to each node including the current node.
So, with this idea, each node will have the same view of CSN value
corresponding to any particular transaction.

For Snapshot management, the node which receives the query generates a
CSN (CurrentCSN) and follows the simple rule that the tuple having a
xid with CSN lesser than CurrentCSN will be visible. Now, it is
possible that when we are examining a tuple, the CSN corresponding to
xid that has written the tuple has a value as INDOUBT which will
indicate that the transaction is yet not committed on all nodes. And
we wait till we get the valid CSN value corresponding to xid and then
use it to check if the tuple is visible.

Now, one thing to note here is that for global transactions we
primarily rely on CSN value corresponding to a transaction for its
visibility even though we still maintain CLOG for local transaction
status.

Leaving aside the incomplete parts and or flaws of the current patch,
does the above match the top-level idea of this patch? I am not sure
if my understanding of this patch at this stage is completely correct
or whether we want to follow the approach of this patch but I think at
least lets first be sure if such a top-level idea can achieve what we
want to do here.

Considering how to integrate this global snapshot feature with the 2PC
patch, what the 2PC patch needs to at least change is to allow FDW to
store an FDW-private data that is passed to subsequent FDW transaction
API calls. Currently, in the current 2PC patch, we call Prepare API
for each participant servers one by one, and the core pass only
metadata such as ForeignServer, UserMapping, and global transaction
identifier. So it's not easy to calculate the maximum CSN across
multiple transaction API calls. I think we can change the 2PC patch to
add a void pointer into FdwXactRslvState, struct passed from the core,
in order to store FDW-private data. It's going to be the maximum CSN
in this case. That way, at the first Prepare API calls postgres_fdw
allocates the space and stores CSN to that space. And at subsequent
Prepare API calls it can calculate the maximum of csn, and then is
able to the step 3 to 6 when preparing the transaction on the last
participant. Another idea would be to change 2PC patch so that the
core passes a bunch of participants grouped by FDW.

IIUC with this the coordinator needs the communication with the nodes
twice at the prepare stage, once to prepare the transaction in each
node and get CSN from each node and then to communicate MaxCSN to each
node? Also, we probably need InDoubt CSN status at prepare phase to
make snapshots and global visibility work.

I’ve not read this patch deeply yet and have considered it without any
coding but my first feeling is not hard to integrate this feature with
the 2PC patch.

Okay.

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#16

Masahiko Sawada

masahiko.sawada@2ndquadrant.com

over 5 years ago

In reply to: Amit Kapila (#15)

On Tue, 7 Jul 2020 at 15:40, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Fri, Jul 3, 2020 at 12:18 PM Masahiko Sawada
<masahiko.sawada@2ndquadrant.com> wrote:

On Sat, 20 Jun 2020 at 21:21, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Fri, Jun 19, 2020 at 1:42 PM Andrey V. Lepikhov
<a.lepikhov@postgrespro.ru> wrote:

Also, can you let us know if this

supports 2PC in some way and if so how is it different from what the
other thread on the same topic [1] is trying to achieve?

Yes, the patch '0003-postgres_fdw-support-for-global-snapshots' contains
2PC machinery. Now I'd not judge which approach is better.

Sorry for being late.

No problem, your summarization, and comparisons of both approaches are
quite helpful.

I studied this patch and did a simple comparison between this patch
(0002 patch) and my 2PC patch.

In terms of atomic commit, the features that are not implemented in
this patch but in the 2PC patch are:

* Crash safe.
* PREPARE TRANSACTION command support.
* Query cancel during waiting for the commit.
* Automatically in-doubt transaction resolution.

On the other hand, the feature that is implemented in this patch but
not in the 2PC patch is:

* Executing PREPARE TRANSACTION (and other commands) in parallel

When the 2PC patch was proposed, IIRC it was like this patch (0002
patch). I mean, it changed only postgres_fdw to support 2PC. But after
discussion, we changed the approach to have the core manage foreign
transaction for crash-safe. From my perspective, this patch has a
minimum implementation of 2PC to work the global snapshot feature and
has some missing features important for supporting crash-safe atomic
commit. So I personally think we should consider how to integrate this
global snapshot feature with the 2PC patch, rather than improving this
patch if we want crash-safe atomic commit.

Okay, but isn't there some advantage with this approach (manage 2PC at
postgres_fdw level) as well which is that any node will be capable of
handling global transactions rather than doing them via central
coordinator? I mean any node can do writes or reads rather than
probably routing them (at least writes) via coordinator node.

The postgres server where the client started the transaction works as
the coordinator node. I think this is true for both this patch and
that 2PC patch. From the perspective of atomic commit, any node will
be capable of handling global transactions in both approaches.

Now, I
agree that even if this advantage is there in the current approach, we
can't lose the crash-safety aspect of other approach. Will you be
able to summarize what was the problem w.r.t crash-safety and how your
patch has dealt it?

Since this patch proceeds 2PC without any logging, foreign
transactions prepared on foreign servers are left over without any
clues if the coordinator crashes during commit. Therefore, after
restart, the user will need to find and resolve in-doubt foreign
transactions manually.

In that 2PC patch, the information of foreign transactions is WAL
logged before PREPARE TRANSACTION. So even if the coordinator crashes
after preparing some foreign transactions, the prepared foreign
transactions are recovered during crash recovery, and then the
transaction resolver resolves them automatically or the user also can
resolve them. The user doesn't need to check other participants node
to resolve in-doubt foreign transactions. Also, since the foreign
transaction information is replicated to physical standbys the new
master can take over resolving in-doubt transactions.

Looking at the commit procedure with this patch:

When starting a new transaction on a foreign server, postgres_fdw
executes pg_global_snapshot_import() to import the global snapshot.
After some work, in pre-commit phase we do:

1. generate global transaction id, say 'gid'
2. execute PREPARE TRANSACTION 'gid' on all participants.
3. prepare global snapshot locally, if the local node also involves
the transaction
4. execute pg_global_snapshot_prepare('gid') for all participants

During step 2 to 4, we calculate the maximum CSN from the CSNs
returned from each pg_global_snapshot_prepare() executions.

5. assign global snapshot locally, if the local node also involves the
transaction
6. execute pg_global_snapshot_assign('gid', max-csn) on all participants.

Then, we commit locally (i.g. mark the current transaction as
committed in clog).

After that, in post-commit phase, execute COMMIT PREPARED 'gid' on all
participants.

As per my current understanding, the overall idea is as follows. For
global transactions, pg_global_snapshot_prepare('gid') will set the
transaction status as InDoubt and generate CSN (let's call it NodeCSN)
at the node where that function is executed, it also returns the
NodeCSN to the coordinator. Then the coordinator (the current
postgres_fdw node on which write transaction is being executed)
computes MaxCSN based on the return value (NodeCSN) of prepare
(pg_global_snapshot_prepare) from all nodes. It then assigns MaxCSN
to each node. Finally, when Commit Prepared is issued for each node
that MaxCSN will be written to each node including the current node.
So, with this idea, each node will have the same view of CSN value
corresponding to any particular transaction.

For Snapshot management, the node which receives the query generates a
CSN (CurrentCSN) and follows the simple rule that the tuple having a
xid with CSN lesser than CurrentCSN will be visible. Now, it is
possible that when we are examining a tuple, the CSN corresponding to
xid that has written the tuple has a value as INDOUBT which will
indicate that the transaction is yet not committed on all nodes. And
we wait till we get the valid CSN value corresponding to xid and then
use it to check if the tuple is visible.

Now, one thing to note here is that for global transactions we
primarily rely on CSN value corresponding to a transaction for its
visibility even though we still maintain CLOG for local transaction
status.

Leaving aside the incomplete parts and or flaws of the current patch,
does the above match the top-level idea of this patch?

I'm still studying this patch but your understanding seems right to me.

I am not sure
if my understanding of this patch at this stage is completely correct
or whether we want to follow the approach of this patch but I think at
least lets first be sure if such a top-level idea can achieve what we
want to do here.

Considering how to integrate this global snapshot feature with the 2PC
patch, what the 2PC patch needs to at least change is to allow FDW to
store an FDW-private data that is passed to subsequent FDW transaction
API calls. Currently, in the current 2PC patch, we call Prepare API
for each participant servers one by one, and the core pass only
metadata such as ForeignServer, UserMapping, and global transaction
identifier. So it's not easy to calculate the maximum CSN across
multiple transaction API calls. I think we can change the 2PC patch to
add a void pointer into FdwXactRslvState, struct passed from the core,
in order to store FDW-private data. It's going to be the maximum CSN
in this case. That way, at the first Prepare API calls postgres_fdw
allocates the space and stores CSN to that space. And at subsequent
Prepare API calls it can calculate the maximum of csn, and then is
able to the step 3 to 6 when preparing the transaction on the last
participant. Another idea would be to change 2PC patch so that the
core passes a bunch of participants grouped by FDW.

IIUC with this the coordinator needs the communication with the nodes
twice at the prepare stage, once to prepare the transaction in each
node and get CSN from each node and then to communicate MaxCSN to each
node?

Yes, I think so too.

Also, we probably need InDoubt CSN status at prepare phase to
make snapshots and global visibility work.

I think it depends on how global CSN feature works.

For instance, in that 2PC patch, if the coordinator crashes during
preparing a foreign transaction, the global transaction manager
recovers and regards it as "prepared" regardless of the foreign
transaction actually having been prepared. And it sends ROLLBACK
PREPARED after recovery completed. With global CSN patch, as you
mentioned, at prepare phase the coordinator needs to communicate
participants twice other than sending PREPARE TRANSACTION:
pg_global_snapshot_prepare() and pg_global_snapshot_assign().

If global CSN patch needs different cleanup work depending on the CSN
status, we will need InDoubt CSN status so that the global transaction
manager can distinguish between a foreign transaction that has
executed pg_global_snapshot_prepare() and the one that has executed
pg_global_snapshot_assign().

On the other hand, if it's enough to just send ROLLBACK or ROLLBACK
PREPARED in that case, I think we don't need InDoubt CSN status. There
is no difference between those foreign transactions from the global
transaction manager perspective.

As far as I read the patch, on failure postgres_fdw simply send
ROLLBACK PREPARED to participants, and there seems no additional work
other than that. I might be missing something.

Regards,

--
Masahiko Sawada http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#17

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Masahiko Sawada (#16)

On Wed, Jul 8, 2020 at 11:16 AM Masahiko Sawada
<masahiko.sawada@2ndquadrant.com> wrote:

On Tue, 7 Jul 2020 at 15:40, Amit Kapila <amit.kapila16@gmail.com> wrote:

Okay, but isn't there some advantage with this approach (manage 2PC at
postgres_fdw level) as well which is that any node will be capable of
handling global transactions rather than doing them via central
coordinator? I mean any node can do writes or reads rather than
probably routing them (at least writes) via coordinator node.

The postgres server where the client started the transaction works as
the coordinator node. I think this is true for both this patch and
that 2PC patch. From the perspective of atomic commit, any node will
be capable of handling global transactions in both approaches.

Okay, but then probably we need to ensure that GID has to be unique
even if that gets generated on different nodes? I don't know if that
is ensured.

Now, I
agree that even if this advantage is there in the current approach, we
can't lose the crash-safety aspect of other approach. Will you be
able to summarize what was the problem w.r.t crash-safety and how your
patch has dealt it?

Since this patch proceeds 2PC without any logging, foreign
transactions prepared on foreign servers are left over without any
clues if the coordinator crashes during commit. Therefore, after
restart, the user will need to find and resolve in-doubt foreign
transactions manually.

Okay, but is it because we can't directly WAL log in postgres_fdw or
there is some other reason for not doing so?

Looking at the commit procedure with this patch:

When starting a new transaction on a foreign server, postgres_fdw
executes pg_global_snapshot_import() to import the global snapshot.
After some work, in pre-commit phase we do:

1. generate global transaction id, say 'gid'
2. execute PREPARE TRANSACTION 'gid' on all participants.
3. prepare global snapshot locally, if the local node also involves
the transaction
4. execute pg_global_snapshot_prepare('gid') for all participants

During step 2 to 4, we calculate the maximum CSN from the CSNs
returned from each pg_global_snapshot_prepare() executions.

5. assign global snapshot locally, if the local node also involves the
transaction
6. execute pg_global_snapshot_assign('gid', max-csn) on all participants.

Then, we commit locally (i.g. mark the current transaction as
committed in clog).

After that, in post-commit phase, execute COMMIT PREPARED 'gid' on all
participants.

As per my current understanding, the overall idea is as follows. For
global transactions, pg_global_snapshot_prepare('gid') will set the
transaction status as InDoubt and generate CSN (let's call it NodeCSN)
at the node where that function is executed, it also returns the
NodeCSN to the coordinator. Then the coordinator (the current
postgres_fdw node on which write transaction is being executed)
computes MaxCSN based on the return value (NodeCSN) of prepare
(pg_global_snapshot_prepare) from all nodes. It then assigns MaxCSN
to each node. Finally, when Commit Prepared is issued for each node
that MaxCSN will be written to each node including the current node.
So, with this idea, each node will have the same view of CSN value
corresponding to any particular transaction.

For Snapshot management, the node which receives the query generates a
CSN (CurrentCSN) and follows the simple rule that the tuple having a
xid with CSN lesser than CurrentCSN will be visible. Now, it is
possible that when we are examining a tuple, the CSN corresponding to
xid that has written the tuple has a value as INDOUBT which will
indicate that the transaction is yet not committed on all nodes. And
we wait till we get the valid CSN value corresponding to xid and then
use it to check if the tuple is visible.

Now, one thing to note here is that for global transactions we
primarily rely on CSN value corresponding to a transaction for its
visibility even though we still maintain CLOG for local transaction
status.

Leaving aside the incomplete parts and or flaws of the current patch,
does the above match the top-level idea of this patch?

I'm still studying this patch but your understanding seems right to me.

Cool. While studying, if you can try to think whether this approach is
different from the global coordinator based approach then it would be
great. Here is my initial thought apart from other reasons the global
coordinator based design can help us to do the global transaction
management and snapshots. It can allocate xids for each transaction
and then collect the list of running xacts (or CSN) from each node and
then prepare a global snapshot that can be used to perform any
transaction.

OTOH, in the design proposed in this patch, we don't need any
coordinator to manage transactions and snapshots because each node's
current CSN will be sufficient for snapshot and visibility as
explained above. Now, sure this assumes that there is no clock skew
on different nodes or somehow we take care of the same (Note that in
the proposed patch the CSN is a timestamp.).

I am not sure
if my understanding of this patch at this stage is completely correct
or whether we want to follow the approach of this patch but I think at
least lets first be sure if such a top-level idea can achieve what we
want to do here.

Considering how to integrate this global snapshot feature with the 2PC
patch, what the 2PC patch needs to at least change is to allow FDW to
store an FDW-private data that is passed to subsequent FDW transaction
API calls. Currently, in the current 2PC patch, we call Prepare API
for each participant servers one by one, and the core pass only
metadata such as ForeignServer, UserMapping, and global transaction
identifier. So it's not easy to calculate the maximum CSN across
multiple transaction API calls. I think we can change the 2PC patch to
add a void pointer into FdwXactRslvState, struct passed from the core,
in order to store FDW-private data. It's going to be the maximum CSN
in this case. That way, at the first Prepare API calls postgres_fdw
allocates the space and stores CSN to that space. And at subsequent
Prepare API calls it can calculate the maximum of csn, and then is
able to the step 3 to 6 when preparing the transaction on the last
participant. Another idea would be to change 2PC patch so that the
core passes a bunch of participants grouped by FDW.

IIUC with this the coordinator needs the communication with the nodes
twice at the prepare stage, once to prepare the transaction in each
node and get CSN from each node and then to communicate MaxCSN to each
node?

Yes, I think so too.

Also, we probably need InDoubt CSN status at prepare phase to
make snapshots and global visibility work.

I think it depends on how global CSN feature works.

For instance, in that 2PC patch, if the coordinator crashes during
preparing a foreign transaction, the global transaction manager
recovers and regards it as "prepared" regardless of the foreign
transaction actually having been prepared. And it sends ROLLBACK
PREPARED after recovery completed. With global CSN patch, as you
mentioned, at prepare phase the coordinator needs to communicate
participants twice other than sending PREPARE TRANSACTION:
pg_global_snapshot_prepare() and pg_global_snapshot_assign().

If global CSN patch needs different cleanup work depending on the CSN
status, we will need InDoubt CSN status so that the global transaction
manager can distinguish between a foreign transaction that has
executed pg_global_snapshot_prepare() and the one that has executed
pg_global_snapshot_assign().

On the other hand, if it's enough to just send ROLLBACK or ROLLBACK
PREPARED in that case, I think we don't need InDoubt CSN status. There
is no difference between those foreign transactions from the global
transaction manager perspective.

I think InDoubt status helps in checking visibility in the proposed
patch wherein if we find the status of the transaction as InDoubt, we
wait till we get some valid CSN for it as explained in my previous
email. So whether we use it for Rollback/Rollback Prepared, it is
required for this design.

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#18

Masahiko Sawada

masahiko.sawada@2ndquadrant.com

over 5 years ago

In reply to: Amit Kapila (#17)

On Wed, 8 Jul 2020 at 21:35, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Wed, Jul 8, 2020 at 11:16 AM Masahiko Sawada
<masahiko.sawada@2ndquadrant.com> wrote:

On Tue, 7 Jul 2020 at 15:40, Amit Kapila <amit.kapila16@gmail.com> wrote:

Okay, but isn't there some advantage with this approach (manage 2PC at
postgres_fdw level) as well which is that any node will be capable of
handling global transactions rather than doing them via central
coordinator? I mean any node can do writes or reads rather than
probably routing them (at least writes) via coordinator node.

The postgres server where the client started the transaction works as
the coordinator node. I think this is true for both this patch and
that 2PC patch. From the perspective of atomic commit, any node will
be capable of handling global transactions in both approaches.

Okay, but then probably we need to ensure that GID has to be unique
even if that gets generated on different nodes? I don't know if that
is ensured.

Yes, if you mean GID is global transaction id specified to PREPARE
TRANSACTION, it has to be unique. In that 2PC patch, GID is generated
in form of 'fx_<random string>_<server oid>_<user oid>'. I believe it
can ensure uniqueness in most cases. In addition, there is FDW API to
generate an arbitrary identifier.

Now, I
agree that even if this advantage is there in the current approach, we
can't lose the crash-safety aspect of other approach. Will you be
able to summarize what was the problem w.r.t crash-safety and how your
patch has dealt it?

Since this patch proceeds 2PC without any logging, foreign
transactions prepared on foreign servers are left over without any
clues if the coordinator crashes during commit. Therefore, after
restart, the user will need to find and resolve in-doubt foreign
transactions manually.

Okay, but is it because we can't directly WAL log in postgres_fdw or
there is some other reason for not doing so?

Yes, I think it is because we cannot WAL log in postgres_fdw. Maybe I
missed the point in your question. Please correct me if I missed
something.

Looking at the commit procedure with this patch:

When starting a new transaction on a foreign server, postgres_fdw
executes pg_global_snapshot_import() to import the global snapshot.
After some work, in pre-commit phase we do:

1. generate global transaction id, say 'gid'
2. execute PREPARE TRANSACTION 'gid' on all participants.
3. prepare global snapshot locally, if the local node also involves
the transaction
4. execute pg_global_snapshot_prepare('gid') for all participants

During step 2 to 4, we calculate the maximum CSN from the CSNs
returned from each pg_global_snapshot_prepare() executions.

5. assign global snapshot locally, if the local node also involves the
transaction
6. execute pg_global_snapshot_assign('gid', max-csn) on all participants.

Then, we commit locally (i.g. mark the current transaction as
committed in clog).

After that, in post-commit phase, execute COMMIT PREPARED 'gid' on all
participants.

As per my current understanding, the overall idea is as follows. For
global transactions, pg_global_snapshot_prepare('gid') will set the
transaction status as InDoubt and generate CSN (let's call it NodeCSN)
at the node where that function is executed, it also returns the
NodeCSN to the coordinator. Then the coordinator (the current
postgres_fdw node on which write transaction is being executed)
computes MaxCSN based on the return value (NodeCSN) of prepare
(pg_global_snapshot_prepare) from all nodes. It then assigns MaxCSN
to each node. Finally, when Commit Prepared is issued for each node
that MaxCSN will be written to each node including the current node.
So, with this idea, each node will have the same view of CSN value
corresponding to any particular transaction.

For Snapshot management, the node which receives the query generates a
CSN (CurrentCSN) and follows the simple rule that the tuple having a
xid with CSN lesser than CurrentCSN will be visible. Now, it is
possible that when we are examining a tuple, the CSN corresponding to
xid that has written the tuple has a value as INDOUBT which will
indicate that the transaction is yet not committed on all nodes. And
we wait till we get the valid CSN value corresponding to xid and then
use it to check if the tuple is visible.

Now, one thing to note here is that for global transactions we
primarily rely on CSN value corresponding to a transaction for its
visibility even though we still maintain CLOG for local transaction
status.

Leaving aside the incomplete parts and or flaws of the current patch,
does the above match the top-level idea of this patch?

I'm still studying this patch but your understanding seems right to me.

Cool. While studying, if you can try to think whether this approach is
different from the global coordinator based approach then it would be
great. Here is my initial thought apart from other reasons the global
coordinator based design can help us to do the global transaction
management and snapshots. It can allocate xids for each transaction
and then collect the list of running xacts (or CSN) from each node and
then prepare a global snapshot that can be used to perform any
transaction. OTOH, in the design proposed in this patch, we don't need any
coordinator to manage transactions and snapshots because each node's
current CSN will be sufficient for snapshot and visibility as
explained above.

Yeah, my thought is the same as you. Since both approaches have strong
points and weak points I cannot mention which is a better approach,
but that 2PC patch would go well together with the design proposed in
this patch.

Now, sure this assumes that there is no clock skew
on different nodes or somehow we take care of the same (Note that in
the proposed patch the CSN is a timestamp.).

As far as I read Clock-SI paper, we take care of the clock skew by
putting some waits on the transaction start and reading tuples on the
remote node.

I am not sure
if my understanding of this patch at this stage is completely correct
or whether we want to follow the approach of this patch but I think at
least lets first be sure if such a top-level idea can achieve what we
want to do here.

Considering how to integrate this global snapshot feature with the 2PC
patch, what the 2PC patch needs to at least change is to allow FDW to
store an FDW-private data that is passed to subsequent FDW transaction
API calls. Currently, in the current 2PC patch, we call Prepare API
for each participant servers one by one, and the core pass only
metadata such as ForeignServer, UserMapping, and global transaction
identifier. So it's not easy to calculate the maximum CSN across
multiple transaction API calls. I think we can change the 2PC patch to
add a void pointer into FdwXactRslvState, struct passed from the core,
in order to store FDW-private data. It's going to be the maximum CSN
in this case. That way, at the first Prepare API calls postgres_fdw
allocates the space and stores CSN to that space. And at subsequent
Prepare API calls it can calculate the maximum of csn, and then is
able to the step 3 to 6 when preparing the transaction on the last
participant. Another idea would be to change 2PC patch so that the
core passes a bunch of participants grouped by FDW.

IIUC with this the coordinator needs the communication with the nodes
twice at the prepare stage, once to prepare the transaction in each
node and get CSN from each node and then to communicate MaxCSN to each
node?

Yes, I think so too.

Also, we probably need InDoubt CSN status at prepare phase to
make snapshots and global visibility work.

I think it depends on how global CSN feature works.

For instance, in that 2PC patch, if the coordinator crashes during
preparing a foreign transaction, the global transaction manager
recovers and regards it as "prepared" regardless of the foreign
transaction actually having been prepared. And it sends ROLLBACK
PREPARED after recovery completed. With global CSN patch, as you
mentioned, at prepare phase the coordinator needs to communicate
participants twice other than sending PREPARE TRANSACTION:
pg_global_snapshot_prepare() and pg_global_snapshot_assign().

If global CSN patch needs different cleanup work depending on the CSN
status, we will need InDoubt CSN status so that the global transaction
manager can distinguish between a foreign transaction that has
executed pg_global_snapshot_prepare() and the one that has executed
pg_global_snapshot_assign().

On the other hand, if it's enough to just send ROLLBACK or ROLLBACK
PREPARED in that case, I think we don't need InDoubt CSN status. There
is no difference between those foreign transactions from the global
transaction manager perspective.

I think InDoubt status helps in checking visibility in the proposed
patch wherein if we find the status of the transaction as InDoubt, we
wait till we get some valid CSN for it as explained in my previous
email. So whether we use it for Rollback/Rollback Prepared, it is
required for this design.

Yes, InDoubt status is required for checking visibility. My comment
was it's not necessary from the perspective of atomic commit.

Regards,

--
Masahiko Sawada http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#19

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Masahiko Sawada (#18)

On Fri, Jul 10, 2020 at 8:46 AM Masahiko Sawada
<masahiko.sawada@2ndquadrant.com> wrote:

On Wed, 8 Jul 2020 at 21:35, Amit Kapila <amit.kapila16@gmail.com> wrote:

Cool. While studying, if you can try to think whether this approach is
different from the global coordinator based approach then it would be
great. Here is my initial thought apart from other reasons the global
coordinator based design can help us to do the global transaction
management and snapshots. It can allocate xids for each transaction
and then collect the list of running xacts (or CSN) from each node and
then prepare a global snapshot that can be used to perform any
transaction. OTOH, in the design proposed in this patch, we don't need any
coordinator to manage transactions and snapshots because each node's
current CSN will be sufficient for snapshot and visibility as
explained above.

Yeah, my thought is the same as you. Since both approaches have strong
points and weak points I cannot mention which is a better approach,
but that 2PC patch would go well together with the design proposed in
this patch.

I also think with some modifications we might be able to integrate
your 2PC patch with the patches proposed here. However, if we decide
not to pursue this approach then it is uncertain whether your proposed
patch can be further enhanced for global visibility. Does it make
sense to dig the design of this approach a bit further so that we can
be somewhat more sure that pursuing your 2PC patch would be a good
idea and we can, in fact, enhance it later for global visibility?
AFAICS, Andrey has mentioned couple of problems with this approach
[1]: /messages/by-id/f23083b9-38d0-6126-eb6e-091816a78585@postgrespro.ru
can dig those it would be really great.

Now, sure this assumes that there is no clock skew
on different nodes or somehow we take care of the same (Note that in
the proposed patch the CSN is a timestamp.).

As far as I read Clock-SI paper, we take care of the clock skew by
putting some waits on the transaction start and reading tuples on the
remote node.

Oh, but I am not sure if this patch is able to solve that, and if so, how?

I think InDoubt status helps in checking visibility in the proposed
patch wherein if we find the status of the transaction as InDoubt, we
wait till we get some valid CSN for it as explained in my previous
email. So whether we use it for Rollback/Rollback Prepared, it is
required for this design.

Yes, InDoubt status is required for checking visibility. My comment
was it's not necessary from the perspective of atomic commit.

True and probably we can enhance your patch for InDoubt status if required.

Thanks for moving this work forward. I know the progress is a bit
slow due to various reasons but I think it is important to keep making
some progress.

[1]: /messages/by-id/f23083b9-38d0-6126-eb6e-091816a78585@postgrespro.ru

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#20

tsunakawa.takay@fujitsu.com

over 5 years ago

In reply to: Amit Kapila (#19)

RE: Global snapshots

Hello,

While I'm thinking of the following issues of the current approach Andrey raised, I'm getting puzzled and can't help asking certain things. Please forgive me if I'm missing some discussions in the past.

1. Dependency on clocks synchronization
2. Needs guarantees of monotonically increasing of the CSN in the case
of an instance restart/crash etc.
3. We need to delay increasing of OldestXmin because it can be needed
for a transaction snapshot at another node.

While Clock-SI seems to be considered the best promising for global serializability here,

* Why does Clock-SI gets so much attention? How did Clock-SI become the only choice?

* Clock-SI was devised in Microsoft Research. Does Microsoft or some other organization use Clock-SI?

Have anyone examined the following Multiversion Commitment Ordering (MVCO)? Although I haven't understood this yet, it insists that no concurrency control information including timestamps needs to be exchanged among the cluster nodes. I'd appreciate it if someone could give an opinion.

Commitment Ordering Based Distributed Concurrency Control for Bridging Single and Multi Version Resources.
Proceedings of the Third IEEE International Workshop on Research Issues on Data Engineering: Interoperability in Multidatabase Systems (RIDE-IMS), Vienna, Austria, pp. 189-198, April 1993. (also DEC-TR 853, July 1992)
https://ieeexplore.ieee.org/document/281924?arnumber=281924

The author of the above paper, Yoav Raz, seems to have had strong passion at least until 2011 about making people believe the mightiness of Commitment Ordering (CO) for global serializability. However, he complains (sadly) that almost all researchers ignore his theory, as written in his following site and wikipedia page for Commitment Ordering. Does anyone know why CO is ignored?

Commitment ordering (CO) - yoavraz2
https://sites.google.com/site/yoavraz2/the_principle_of_co

FWIW, some researchers including Michael Stonebraker evaluated the performance of various distributed concurrency control methods in 2017. Have anyone looked at this? (I don't mean there was some promising method that we might want to adopt.)

An Evaluation of Distributed Concurrency Control
Rachael Harding, Dana Van Aken, Andrew Pavlo, and Michael Stonebraker. 2017.
Proc. VLDB Endow. 10, 5 (January 2017), 553-564.
https://doi.org/10.14778/3055540.3055548

Regards
Takayuki Tsunakawa

#21

Masahiko Sawada

masahiko.sawada@2ndquadrant.com

over 5 years ago

In reply to: Amit Kapila (#19)

On Mon, 13 Jul 2020 at 20:18, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Fri, Jul 10, 2020 at 8:46 AM Masahiko Sawada
<masahiko.sawada@2ndquadrant.com> wrote:

On Wed, 8 Jul 2020 at 21:35, Amit Kapila <amit.kapila16@gmail.com> wrote:

Cool. While studying, if you can try to think whether this approach is
different from the global coordinator based approach then it would be
great. Here is my initial thought apart from other reasons the global
coordinator based design can help us to do the global transaction
management and snapshots. It can allocate xids for each transaction
and then collect the list of running xacts (or CSN) from each node and
then prepare a global snapshot that can be used to perform any
transaction. OTOH, in the design proposed in this patch, we don't need any
coordinator to manage transactions and snapshots because each node's
current CSN will be sufficient for snapshot and visibility as
explained above.

Yeah, my thought is the same as you. Since both approaches have strong
points and weak points I cannot mention which is a better approach,
but that 2PC patch would go well together with the design proposed in
this patch.

I also think with some modifications we might be able to integrate
your 2PC patch with the patches proposed here. However, if we decide
not to pursue this approach then it is uncertain whether your proposed
patch can be further enhanced for global visibility.

Yes. I think even if we decide not to pursue this approach it's not
the reason for not pursuing the 2PC patch. if so we would need to
consider the design of 2PC patch again so it generically resolves the
atomic commit problem.

Does it make
sense to dig the design of this approach a bit further so that we can
be somewhat more sure that pursuing your 2PC patch would be a good
idea and we can, in fact, enhance it later for global visibility?

Agreed.

AFAICS, Andrey has mentioned couple of problems with this approach
[1], the details of which I am also not sure at this stage but if we
can dig those it would be really great.

Now, sure this assumes that there is no clock skew
on different nodes or somehow we take care of the same (Note that in
the proposed patch the CSN is a timestamp.).

As far as I read Clock-SI paper, we take care of the clock skew by
putting some waits on the transaction start and reading tuples on the
remote node.

Oh, but I am not sure if this patch is able to solve that, and if so, how?

I'm not sure the details but, as far as I read the patch I guess the
transaction will sleep at GlobalSnapshotSync() when the received
global csn is greater than the local global csn.

Regards,

--
Masahiko Sawada http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#22

tsunakawa.takay@fujitsu.com

over 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#20)

RE: Global snapshots

Hi Andrey san, Movead san,

From: tsunakawa.takay@fujitsu.com <tsunakawa.takay@fujitsu.com>

While Clock-SI seems to be considered the best promising for global
serializability here,

* Why does Clock-SI gets so much attention? How did Clock-SI become the
only choice?

* Clock-SI was devised in Microsoft Research. Does Microsoft or some other
organization use Clock-SI?

Could you take a look at this patent? I'm afraid this is the Clock-SI for MVCC. Microsoft holds this until 2031. I couldn't find this with the keyword "Clock-SI.""

US8356007B2 - Distributed transaction management for database systems with multiversioning - Google Patents
https://patents.google.com/patent/US8356007

If it is, can we circumvent this patent?

Regards
Takayuki Tsunakawa

#23

Andrey V. Lepikhov

a.lepikhov@postgrespro.ru

over 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#22)

On 7/27/20 11:22 AM, tsunakawa.takay@fujitsu.com wrote:

Hi Andrey san, Movead san,

From: tsunakawa.takay@fujitsu.com <tsunakawa.takay@fujitsu.com>

While Clock-SI seems to be considered the best promising for global
serializability here,

* Why does Clock-SI gets so much attention? How did Clock-SI become the
only choice?

* Clock-SI was devised in Microsoft Research. Does Microsoft or some other
organization use Clock-SI?

Could you take a look at this patent? I'm afraid this is the Clock-SI for MVCC. Microsoft holds this until 2031. I couldn't find this with the keyword "Clock-SI.""

US8356007B2 - Distributed transaction management for database systems with multiversioning - Google Patents
https://patents.google.com/patent/US8356007

If it is, can we circumvent this patent?

Regards
Takayuki Tsunakawa

Thank you for the research (and previous links too).
I haven't seen this patent before. This should be carefully studied.

--
regards,
Andrey Lepikhov
Postgres Professional

#24

Alexey Kondratov

a.kondratov@postgrespro.ru

over 5 years ago

In reply to: Andrey V. Lepikhov (#23)

1 attachment(s)

Hi,

On 2020-07-27 09:44, Andrey V. Lepikhov wrote:

On 7/27/20 11:22 AM, tsunakawa.takay@fujitsu.com wrote:

US8356007B2 - Distributed transaction management for database systems
with multiversioning - Google Patents
https://patents.google.com/patent/US8356007

If it is, can we circumvent this patent?

Thank you for the research (and previous links too).
I haven't seen this patent before. This should be carefully studied.

I had a look on the patch set, although it is quite outdated, especially
on 0003.

Two thoughts about 0003:

First, IIUC atomicity of the distributed transaction in the postgres_fdw
is achieved by the usage of 2PC. I think that this postgres_fdw 2PC
support should be separated from global snapshots. It could be useful to
have such atomic distributed transactions even without a proper
visibility, which is guaranteed by the global snapshot. Especially
taking into account the doubts about Clock-SI and general questions
about algorithm choosing criteria above in the thread.

Thus, I propose to split 0003 into two parts and add a separate GUC
'postgres_fdw.use_twophase', which could be turned on independently from
'postgres_fdw.use_global_snapshots'. Of course if the latter is enabled,
then 2PC should be forcedly turned on as well.

Second, there are some problems with errors handling in the 0003 (thanks
to Arseny Sher for review).

+error:
+			if (!res)
+			{
+				sql = psprintf("ABORT PREPARED '%s'", fdwTransState->gid);
+				BroadcastCmd(sql);
+				elog(ERROR, "Failed to PREPARE transaction on remote node");
+			}

It seems that we should never reach this point, just because
BroadcastStmt will throw an ERROR if it fails to prepare transaction on
the foreign server:

+			if (PQresultStatus(result) != expectedStatus ||
+				(handler && !handler(result, arg)))
+			{
+				elog(WARNING, "Failed command %s: status=%d, expected status=%d", 
sql, PQresultStatus(result), expectedStatus);
+				pgfdw_report_error(ERROR, result, entry->conn, true, sql);
+				allOk = false;
+			}

Moreover, It doesn't make much sense to try to abort prepared xacts,
since if we failed to prepare it somewhere, then some foreign servers
may become unavailable already and this doesn't provide us a 100%
guarantee of clean up.

+	/* COMMIT open transaction of we were doing 2PC */
+	if (fdwTransState->two_phase_commit &&
+		(event == XACT_EVENT_PARALLEL_COMMIT || event == XACT_EVENT_COMMIT))
+	{
+		BroadcastCmd(psprintf("COMMIT PREPARED '%s'", fdwTransState->gid));
+	}

At this point, the host (local) transaction is already committed and
there is no way to abort it gracefully. However, BroadcastCmd may rise
an ERROR that will cause a PANIC, since it is non-recoverable state:

PANIC: cannot abort transaction 487, it was already committed

Attached is a patch, which implements a plain 2PC in the postgres_fdw
and adds a GUC 'postgres_fdw.use_twophase'. Also it solves these errors
handling issues above and tries to add proper comments everywhere. I
think, that 0003 should be rebased on the top of it, or it could be a
first patch in the set, since it may be used independently. What do you
think?

Regards
--
Alexey Kondratov

Postgres Professional https://www.postgrespro.com
Russian Postgres Company

Attachments:

0001-Add-postgres_fdw.use_twophase-GUC-to-use-2PC.patchtext/x-diff; name=0001-Add-postgres_fdw.use_twophase-GUC-to-use-2PC.patchDownload

From debdffade7abcdbf29031bda6c8359a89776ad36 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Fri, 7 Aug 2020 16:50:57 +0300
Subject: [PATCH] Add postgres_fdw.use_twophase GUC to use 2PC for transactions
 involving several servers.

---
 contrib/postgres_fdw/connection.c   | 234 +++++++++++++++++++++++++---
 contrib/postgres_fdw/postgres_fdw.c |  17 ++
 contrib/postgres_fdw/postgres_fdw.h |   2 +
 3 files changed, 228 insertions(+), 25 deletions(-)

diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 08daf26fdf0..d18fdd1f94e 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -66,6 +66,20 @@ typedef struct ConnCacheEntry
  */
 static HTAB *ConnectionHash = NULL;
 
+/*
+ * FdwTransactionState
+ *
+ * Holds number of open remote transactions and shared state
+ * needed for all connection entries.
+ */
+typedef struct FdwTransactionState
+{
+	char	   *gid;
+	int			nparticipants;
+	bool		two_phase_commit;
+} FdwTransactionState;
+static FdwTransactionState *fdwTransState;
+
 /* for assigning cursor numbers and prepared statement numbers */
 static unsigned int cursor_number = 0;
 static unsigned int prep_stmt_number = 0;
@@ -73,6 +87,9 @@ static unsigned int prep_stmt_number = 0;
 /* tracks whether any work is needed in callback functions */
 static bool xact_got_connection = false;
 
+/* counter of prepared tx made by this backend */
+static int two_phase_xact_count = 0;
+
 /* prototypes of private functions */
 static PGconn *connect_pg_server(ForeignServer *server, UserMapping *user);
 static void disconnect_pg_server(ConnCacheEntry *entry);
@@ -81,6 +98,7 @@ static void configure_remote_session(PGconn *conn);
 static void do_sql_command(PGconn *conn, const char *sql);
 static void begin_remote_xact(ConnCacheEntry *entry);
 static void pgfdw_xact_callback(XactEvent event, void *arg);
+static void deallocate_prepared_stmts(ConnCacheEntry *entry);
 static void pgfdw_subxact_callback(SubXactEvent event,
 								   SubTransactionId mySubid,
 								   SubTransactionId parentSubid,
@@ -137,6 +155,16 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 									  pgfdw_inval_callback, (Datum) 0);
 	}
 
+	/* Allocate FdwTransactionState */
+	if (fdwTransState == NULL)
+	{
+		MemoryContext oldcxt;
+		oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+		fdwTransState = palloc0(sizeof(FdwTransactionState));
+		fdwTransState->nparticipants = 0;
+		MemoryContextSwitchTo(oldcxt);
+	}
+
 	/* Set flag that we did GetConnection during the current transaction */
 	xact_got_connection = true;
 
@@ -448,7 +476,8 @@ configure_remote_session(PGconn *conn)
 }
 
 /*
- * Convenience subroutine to issue a non-data-returning SQL command to remote
+ * Convenience subroutine to issue a non-data-returning SQL command or
+ * statement to remote node.
  */
 static void
 do_sql_command(PGconn *conn, const char *sql)
@@ -494,6 +523,8 @@ begin_remote_xact(ConnCacheEntry *entry)
 		do_sql_command(entry->conn, sql);
 		entry->xact_depth = 1;
 		entry->changing_xact_state = false;
+
+		fdwTransState->nparticipants += 1;
 	}
 
 	/*
@@ -701,6 +732,76 @@ pgfdw_report_error(int elevel, PGresult *res, PGconn *conn,
 	PG_END_TRY();
 }
 
+/* Callback typedef for BroadcastStmt */
+typedef bool (*BroadcastCmdResHandler) (PGresult *result, void *arg);
+
+/*
+ * Broadcast sql in parallel to all ConnectionHash entries.
+ *
+ * In the case of elevel < ERROR and error occured only a elevel message
+ * will be rised and 0 (false) will be returned as a return code.  That way,
+ * it will be up to the caller to handle this situation gracefully.
+ */
+static bool
+BroadcastStmt(char const * sql, unsigned expectedStatus,
+			  int elevel, BroadcastCmdResHandler handler,
+			  void *arg)
+{
+	HASH_SEQ_STATUS scan;
+	ConnCacheEntry *entry;
+	bool		allOk = true;
+
+	/* Broadcast sql */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		pgfdw_reject_incomplete_xact_state_change(entry);
+
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			if (!PQsendQuery(entry->conn, sql))
+			{
+				PGresult   *res = PQgetResult(entry->conn);
+
+				elog(elevel < ERROR ? elevel : WARNING, "failed to send command %s", sql);
+				pgfdw_report_error(elevel, res, entry->conn, true, sql);
+				PQclear(res);
+			}
+		}
+	}
+
+	/* Collect responses */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			PGresult   *result = PQgetResult(entry->conn);
+
+			if (PQresultStatus(result) != expectedStatus ||
+				(handler && !handler(result, arg)))
+			{
+				elog(elevel < ERROR ? elevel : WARNING,
+					 "failed command %s: status=%d, expected status=%d",
+					 sql, PQresultStatus(result), expectedStatus);
+				pgfdw_report_error(elevel, result, entry->conn, true, sql);
+				allOk = false;
+			}
+			PQclear(result);
+			PQgetResult(entry->conn);	/* consume NULL result */
+		}
+	}
+
+	return allOk;
+}
+
+/* Wrapper for broadcasting commands */
+static bool
+BroadcastCmd(char const *sql, int elevel)
+{
+	return BroadcastStmt(sql, PGRES_COMMAND_OK, elevel, NULL, NULL);
+}
+
 /*
  * pgfdw_xact_callback --- cleanup at main-transaction end.
  */
@@ -714,6 +815,74 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	if (!xact_got_connection)
 		return;
 
+	/*
+	 * On PRE_COMMIT event we should figure out whether to use 2PC or not.
+	 * This decision is based on two factors:
+	 *    # postgres_fdw.use_twophase is turned on;
+	 *    # more than one server have participated in this transaction.
+	 *
+	 * If we decide to use 2PC for this xact, then we should broadcast
+	 * PREPARE to all participated foreign servers.
+	 */
+	if (event == XACT_EVENT_PARALLEL_PRE_COMMIT || event == XACT_EVENT_PRE_COMMIT)
+	{
+		/* Should we take into account this node? */
+		if (TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+			fdwTransState->nparticipants += 1;
+
+		/* Switch to 2PC mode if there were more than one participant */
+		if (Use2PC && fdwTransState->nparticipants > 1)
+			fdwTransState->two_phase_commit = true;
+
+		if (fdwTransState->two_phase_commit)
+		{
+			char   *sql;
+
+			fdwTransState->gid = psprintf("pgfdw:%lld:%llu:%d:%u:%d:%d",
+										  (long long) GetCurrentTimestamp(),
+										  (long long) GetSystemIdentifier(),
+										  MyProcPid,
+										  GetCurrentTransactionIdIfAny(),
+										  ++two_phase_xact_count,
+										  fdwTransState->nparticipants);
+
+			/* Broadcast PREPARE */
+			sql = psprintf("PREPARE TRANSACTION '%s'", fdwTransState->gid);
+
+			/*
+			 * If we got any problem, then it does not make much sence to
+			 * broadcast ABORT PREPARED in order to clean up prepared xacts
+			 * everywhere, since this method does not guarantee a 100%
+			 * success.  This is a work for external tools.  Rise an ERROR
+			 * immediately in the case of failure during broadcast.
+			 */
+			BroadcastCmd(sql, ERROR);
+
+			/*
+			 * Do not fall down. Consequent COMMIT event will clean things up.
+			 */
+			return;
+		}
+	}
+
+	/*
+	 * COMMIT event occurs when the local transaction is fully committed.
+	 * That way, we have to broadcast COMMIT PREPARED to all participated
+	 * foreign servers in order to finalize this 'distributed' transaction.
+	 * Actually, it is too late to abort the host transaction if any error
+	 * occurs during COMMIT PREPARED broadcast stage.
+	 */
+	if (fdwTransState->two_phase_commit &&
+		(event == XACT_EVENT_PARALLEL_COMMIT || event == XACT_EVENT_COMMIT))
+	{
+		if (!BroadcastCmd(psprintf("COMMIT PREPARED '%s'", fdwTransState->gid), WARNING))
+			ereport(WARNING,
+					(errcode(ERRCODE_CONNECTION_FAILURE),
+					 errmsg("canceling the wait for foreign servers to commit prepared transaction due to the error occured on one of them"),
+					 errdetail("The transaction has already committed locally, but might not have been committed on all participated foreign servers."),
+					 errhint("Consider committing it everywhere manually with COMMIT PREPARED '%s'", fdwTransState->gid)));
+	}
+
 	/*
 	 * Scan all connection cache entries to find open remote transactions, and
 	 * close them.
@@ -721,8 +890,6 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	hash_seq_init(&scan, ConnectionHash);
 	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
 	{
-		PGresult   *res;
-
 		/* Ignore cache entry if no open connection right now */
 		if (entry->conn == NULL)
 			continue;
@@ -739,6 +906,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 			{
 				case XACT_EVENT_PARALLEL_PRE_COMMIT:
 				case XACT_EVENT_PRE_COMMIT:
+					Assert(!fdwTransState->two_phase_commit);
 
 					/*
 					 * If abort cleanup previously failed for this connection,
@@ -751,28 +919,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					do_sql_command(entry->conn, "COMMIT TRANSACTION");
 					entry->changing_xact_state = false;
 
-					/*
-					 * If there were any errors in subtransactions, and we
-					 * made prepared statements, do a DEALLOCATE ALL to make
-					 * sure we get rid of all prepared statements. This is
-					 * annoying and not terribly bulletproof, but it's
-					 * probably not worth trying harder.
-					 *
-					 * DEALLOCATE ALL only exists in 8.3 and later, so this
-					 * constrains how old a server postgres_fdw can
-					 * communicate with.  We intentionally ignore errors in
-					 * the DEALLOCATE, so that we can hobble along to some
-					 * extent with older servers (leaking prepared statements
-					 * as we go; but we don't really support update operations
-					 * pre-8.3 anyway).
-					 */
-					if (entry->have_prep_stmt && entry->have_error)
-					{
-						res = PQexec(entry->conn, "DEALLOCATE ALL");
-						PQclear(res);
-					}
-					entry->have_prep_stmt = false;
-					entry->have_error = false;
+					deallocate_prepared_stmts(entry);
 					break;
 				case XACT_EVENT_PRE_PREPARE:
 
@@ -791,6 +938,11 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					break;
 				case XACT_EVENT_PARALLEL_COMMIT:
 				case XACT_EVENT_COMMIT:
+					if (fdwTransState->two_phase_commit)
+						deallocate_prepared_stmts(entry);
+					else /* Pre-commit should have closed the open transaction */
+						elog(ERROR, "missed cleaning up connection during pre-commit");
+					break;
 				case XACT_EVENT_PREPARE:
 					/* Pre-commit should have closed the open transaction */
 					elog(ERROR, "missed cleaning up connection during pre-commit");
@@ -886,6 +1038,38 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 
 	/* Also reset cursor numbering for next transaction */
 	cursor_number = 0;
+
+	/* Reset fdwTransState */
+	memset(fdwTransState, '\0', sizeof(FdwTransactionState));
+}
+
+/*
+ * If there were any errors in subtransactions, and we
+ * made prepared statements, do a DEALLOCATE ALL to make
+ * sure we get rid of all prepared statements. This is
+ * annoying and not terribly bulletproof, but it's
+ * probably not worth trying harder.
+ *
+ * DEALLOCATE ALL only exists in 8.3 and later, so this
+ * constrains how old a server postgres_fdw can
+ * communicate with.  We intentionally ignore errors in
+ * the DEALLOCATE, so that we can hobble along to some
+ * extent with older servers (leaking prepared statements
+ * as we go; but we don't really support update operations
+ * pre-8.3 anyway).
+ */
+static void
+deallocate_prepared_stmts(ConnCacheEntry *entry)
+{
+	PGresult   *res;
+
+	if (entry->have_prep_stmt && entry->have_error)
+	{
+		res = PQexec(entry->conn, "DEALLOCATE ALL");
+		PQclear(res);
+	}
+	entry->have_prep_stmt = false;
+	entry->have_error = false;
 }
 
 /*
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index a31abce7c99..7a7772f5dd3 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -301,6 +301,12 @@ typedef struct
 	List	   *already_used;	/* expressions already dealt with */
 } ec_member_foreign_arg;
 
+bool		Use2PC = false;
+
+#ifndef PG_FDW_BUILTIN
+void		_PG_init(void);
+#endif
+
 /*
  * SQL functions
  */
@@ -6583,3 +6589,14 @@ find_em_expr_for_input_target(PlannerInfo *root,
 	elog(ERROR, "could not find pathkey item to sort");
 	return NULL;				/* keep compiler quiet */
 }
+
+#ifndef PG_FDW_BUILTIN
+void
+_PG_init(void)
+{
+	DefineCustomBoolVariable("postgres_fdw.use_twophase",
+							 "Use two phase commit for distributed transactions", NULL,
+							 &Use2PC, false, PGC_USERSET, 0, NULL,
+							 NULL, NULL);
+}
+#endif
diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h
index eef410db392..3c8cadc508a 100644
--- a/contrib/postgres_fdw/postgres_fdw.h
+++ b/contrib/postgres_fdw/postgres_fdw.h
@@ -208,4 +208,6 @@ extern const char *get_jointype_name(JoinType jointype);
 extern bool is_builtin(Oid objectId);
 extern bool is_shippable(Oid objectId, Oid classId, PgFdwRelationInfo *fpinfo);
 
+extern bool Use2PC;
+
 #endif							/* POSTGRES_FDW_H */

base-commit: 49d7165117893405ae9b5b8d8e7877acff33c0e7
-- 
2.19.1

#25

Fujii Masao

masao.fujii@oss.nttdata.com

over 5 years ago

In reply to: Alexey Kondratov (#24)

On 2020/09/05 3:31, Alexey Kondratov wrote:

Hi,

On 2020-07-27 09:44, Andrey V. Lepikhov wrote:

On 7/27/20 11:22 AM, tsunakawa.takay@fujitsu.com wrote:

US8356007B2 - Distributed transaction management for database systems with multiversioning - Google Patents
https://patents.google.com/patent/US8356007

If it is, can we circumvent this patent?

Thank you for the research (and previous links too).
I haven't seen this patent before. This should be carefully studied.

I had a look on the patch set, although it is quite outdated, especially on 0003.

Two thoughts about 0003:

First, IIUC atomicity of the distributed transaction in the postgres_fdw is achieved by the usage of 2PC. I think that this postgres_fdw 2PC support should be separated from global snapshots.

Agreed.

It could be useful to have such atomic distributed transactions even without a proper visibility, which is guaranteed by the global snapshot. Especially taking into account the doubts about Clock-SI and general questions about algorithm choosing criteria above in the thread.

Thus, I propose to split 0003 into two parts and add a separate GUC 'postgres_fdw.use_twophase', which could be turned on independently from 'postgres_fdw.use_global_snapshots'. Of course if the latter is enabled, then 2PC should be forcedly turned on as well.

Second, there are some problems with errors handling in the 0003 (thanks to Arseny Sher for review).
+error:
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ if (!res)
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ {
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ sql = psprintf("ABORT PREPARED '%s'", fdwTransState->gid);
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ BroadcastCmd(sql);
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ elog(ERROR, "Failed to PREPARE transaction on remote node");
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ }
It seems that we should never reach this point, just because BroadcastStmt will throw an ERROR if it fails to prepare transaction on the foreign server:
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ if (PQresultStatus(result) != expectedStatus ||
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ (handler && !handler(result, arg)))
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ {
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ elog(WARNING, "Failed command %s: status=%d, expected status=%d", sql, PQresultStatus(result), expectedStatus);
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ pgfdw_report_error(ERROR, result, entry->conn, true, sql);
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ allOk = false;
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ }
Moreover, It doesn't make much sense to try to abort prepared xacts, since if we failed to prepare it somewhere, then some foreign servers may become unavailable already and this doesn't provide us a 100% guarantee of clean up.
+ï¿½ï¿½ï¿½ /* COMMIT open transaction of we were doing 2PC */
+ï¿½ï¿½ï¿½ if (fdwTransState->two_phase_commit &&
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ (event == XACT_EVENT_PARALLEL_COMMIT || event == XACT_EVENT_COMMIT))
+ï¿½ï¿½ï¿½ {
+ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ BroadcastCmd(psprintf("COMMIT PREPARED '%s'", fdwTransState->gid));
+ï¿½ï¿½ï¿½ }
At this point, the host (local) transaction is already committed and there is no way to abort it gracefully. However, BroadcastCmd may rise an ERROR that will cause a PANIC, since it is non-recoverable state:

PANIC:ï¿½ cannot abort transaction 487, it was already committed

Attached is a patch, which implements a plain 2PC in the postgres_fdw and adds a GUC 'postgres_fdw.use_twophase'. Also it solves these errors handling issues above and tries to add proper comments everywhere. I think, that 0003 should be rebased on the top of it, or it could be a first patch in the set, since it may be used independently. What do you think?

Thanks for the patch!

Sawada-san was proposing another 2PC patch at [1]/messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com. Do you have any thoughts
about pros and cons between your patch and Sawada-san's?

Regards,

[1]: /messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com
/messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#26

Alexey Kondratov

a.kondratov@postgrespro.ru

over 5 years ago

In reply to: Fujii Masao (#25)

On 2020-09-08 05:49, Fujii Masao wrote:

On 2020/09/05 3:31, Alexey Kondratov wrote:

Attached is a patch, which implements a plain 2PC in the postgres_fdw
and adds a GUC 'postgres_fdw.use_twophase'. Also it solves these
errors handling issues above and tries to add proper comments
everywhere. I think, that 0003 should be rebased on the top of it, or
it could be a first patch in the set, since it may be used
independently. What do you think?

Thanks for the patch!

Sawada-san was proposing another 2PC patch at [1]. Do you have any
thoughts
about pros and cons between your patch and Sawada-san's?

[1]
/messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com

Thank you for the link!

After a quick look on the Sawada-san's patch set I think that there are
two major differences:

1. There is a built-in foreign xacts resolver in the [1], which should
be much more convenient from the end-user perspective. It involves huge
in-core changes and additional complexity that is of course worth of.

However, it's still not clear for me that it is possible to resolve all
foreign prepared xacts on the Postgres' own side with a 100% guarantee.
Imagine a situation when the coordinator node is actually a HA cluster
group (primary + sync + async replica) and it failed just after PREPARE
stage of after local COMMIT. In that case all foreign xacts will be left
in the prepared state. After failover process complete synchronous
replica will become a new primary. Would it have all required info to
properly resolve orphan prepared xacts?

Probably, this situation is handled properly in the [1], but I've not
yet finished a thorough reading of the patch set, though it has a great
doc!

On the other hand, previous 0003 and my proposed patch rely on either
manual resolution of hung prepared xacts or usage of external
monitor/resolver. This approach is much simpler from the in-core
perspective, but doesn't look as complete as [1] though.

2. In the patch from this thread all 2PC logic sit in the postgres_fdw,
while [1] tries to put it into the generic fdw core, which also feels
like a more general and architecturally correct way. However, how many
from the currently available dozens of various FDWs are capable to
perform 2PC? And how many of them are maintained well enough to adopt
this new API? This is not an argument against [1] actually, since
postgres_fdw is known to be the most advanced FDW and an early adopter
of new feature, just a little doubt about a usefulness of this
preliminary generalisation.

Anyway, I think that [1] is a great work and really hope to find more
time to investigate it deeper later this year.

Regards
--
Alexey Kondratov

Postgres Professional https://www.postgrespro.com
Russian Postgres Company

#27

Fujii Masao

masao.fujii@oss.nttdata.com

over 5 years ago

In reply to: Alexey Kondratov (#26)

On 2020/09/08 19:36, Alexey Kondratov wrote:

On 2020-09-08 05:49, Fujii Masao wrote:

On 2020/09/05 3:31, Alexey Kondratov wrote:

Attached is a patch, which implements a plain 2PC in the postgres_fdw and adds a GUC 'postgres_fdw.use_twophase'. Also it solves these errors handling issues above and tries to add proper comments everywhere. I think, that 0003 should be rebased on the top of it, or it could be a first patch in the set, since it may be used independently. What do you think?

Thanks for the patch!

Sawada-san was proposing another 2PC patch at [1]. Do you have any thoughts
about pros and cons between your patch and Sawada-san's?

[1]
/messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com

Thank you for the link!

After a quick look on the Sawada-san's patch set I think that there are two major differences:

Thanks for sharing your thought! As far as I read your patch quickly,
I basically agree with your this view.

1. There is a built-in foreign xacts resolver in the [1], which should be much more convenient from the end-user perspective. It involves huge in-core changes and additional complexity that is of course worth of.

However, it's still not clear for me that it is possible to resolve all foreign prepared xacts on the Postgres' own side with a 100% guarantee. Imagine a situation when the coordinator node is actually a HA cluster group (primary + sync + async replica) and it failed just after PREPARE stage of after local COMMIT. In that case all foreign xacts will be left in the prepared state. After failover process complete synchronous replica will become a new primary. Would it have all required info to properly resolve orphan prepared xacts?

IIUC, yes, the information required for automatic resolution is
WAL-logged and the standby tries to resolve those orphan transactions
from WAL after the failover. But Sawada-san's patch provides
the special function for manual resolution, so there may be some cases
where manual resolution is necessary.

Probably, this situation is handled properly in the [1], but I've not yet finished a thorough reading of the patch set, though it has a great doc!

On the other hand, previous 0003 and my proposed patch rely on either manual resolution of hung prepared xacts or usage of external monitor/resolver. This approach is much simpler from the in-core perspective, but doesn't look as complete as [1] though.

2. In the patch from this thread all 2PC logic sit in the postgres_fdw, while [1] tries to put it into the generic fdw core, which also feels like a more general and architecturally correct way. However, how many from the currently available dozens of various FDWs are capable to perform 2PC? And how many of them are maintained well enough to adopt this new API? This is not an argument against [1] actually, since postgres_fdw is known to be the most advanced FDW and an early adopter of new feature, just a little doubt about a usefulness of this preliminary generalisation.

If we implement 2PC feature only for PostgreSQL sharding using
postgres_fdw, IMO it's ok to support only postgres_fdw.
But if we implement 2PC as the improvement on FDW independently
from PostgreSQL sharding and global visibility, I think that it's
necessary to support other FDW. I'm not sure how many FDW
actually will support this new 2PC interface. But if the interface is
not so complicated, I *guess* some FDW will support it in the near future.

Implementing 2PC feature only inside postgres_fdw seems to cause
another issue; COMMIT PREPARED is issued to the remote servers
after marking the local transaction as committed
(i.e., ProcArrayEndTransaction()). Is this safe? This issue happens
because COMMIT PREPARED is issued via
CallXactCallbacks(XACT_EVENT_COMMIT) and that CallXactCallbacks()
is called after ProcArrayEndTransaction().

Anyway, I think that [1] is a great work and really hope to find more time to investigate it deeper later this year.

I'm sure your work is also great! I hope we can discuss the design
of 2PC feature together!

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#28

Alexey Kondratov

a.kondratov@postgrespro.ru

over 5 years ago

In reply to: Fujii Masao (#27)

On 2020-09-08 14:48, Fujii Masao wrote:

On 2020/09/08 19:36, Alexey Kondratov wrote:

On 2020-09-08 05:49, Fujii Masao wrote:

On 2020/09/05 3:31, Alexey Kondratov wrote:

Attached is a patch, which implements a plain 2PC in the
postgres_fdw and adds a GUC 'postgres_fdw.use_twophase'. Also it
solves these errors handling issues above and tries to add proper
comments everywhere. I think, that 0003 should be rebased on the top
of it, or it could be a first patch in the set, since it may be used
independently. What do you think?

Thanks for the patch!

Sawada-san was proposing another 2PC patch at [1]. Do you have any
thoughts
about pros and cons between your patch and Sawada-san's?

[1]
/messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com

Thank you for the link!

After a quick look on the Sawada-san's patch set I think that there
are two major differences:

Thanks for sharing your thought! As far as I read your patch quickly,
I basically agree with your this view.

1. There is a built-in foreign xacts resolver in the [1], which should
be much more convenient from the end-user perspective. It involves
huge in-core changes and additional complexity that is of course worth
of.

However, it's still not clear for me that it is possible to resolve
all foreign prepared xacts on the Postgres' own side with a 100%
guarantee. Imagine a situation when the coordinator node is actually a
HA cluster group (primary + sync + async replica) and it failed just
after PREPARE stage of after local COMMIT. In that case all foreign
xacts will be left in the prepared state. After failover process
complete synchronous replica will become a new primary. Would it have
all required info to properly resolve orphan prepared xacts?

IIUC, yes, the information required for automatic resolution is
WAL-logged and the standby tries to resolve those orphan transactions
from WAL after the failover. But Sawada-san's patch provides
the special function for manual resolution, so there may be some cases
where manual resolution is necessary.

I've found a note about manual resolution in the v25 0002:

+After that we prepare all foreign transactions by calling
+PrepareForeignTransaction() API. If we failed on any of them we change 
to
+rollback, therefore at this time some participants might be prepared 
whereas
+some are not prepared. The former foreign transactions need to be 
resolved
+using pg_resolve_foreign_xact() manually and the latter ends 
transaction
+in one-phase by calling RollbackForeignTransaction() API.

but it's not yet clear for me.

Implementing 2PC feature only inside postgres_fdw seems to cause
another issue; COMMIT PREPARED is issued to the remote servers
after marking the local transaction as committed
(i.e., ProcArrayEndTransaction()).

According to the Sawada-san's v25 0002 the logic is pretty much the same
there:

+2. Pre-Commit phase (1st phase of two-phase commit)

+3. Commit locally
+Once we've prepared all of them, commit the transaction locally.

+4. Post-Commit Phase (2nd phase of two-phase commit)

Brief look at the code confirms this scheme. IIUC, AtEOXact_FdwXact /
FdwXactParticipantEndTransaction happens after ProcArrayEndTransaction()
in the CommitTransaction(). Thus, I don't see many difference between
these approach and CallXactCallbacks() usage regarding this point.

Is this safe? This issue happens
because COMMIT PREPARED is issued via
CallXactCallbacks(XACT_EVENT_COMMIT) and that CallXactCallbacks()
is called after ProcArrayEndTransaction().

Once the transaction is committed locally any ERROR (or higher level
message) will be escalated to PANIC. And I do see possible ERROR level
messages in the postgresCommitForeignTransaction() for example:

+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+		ereport(ERROR, (errmsg("could not commit transaction on server %s",
+							   frstate->server->servername)));

I don't think that it's very convenient to get a PANIC every time we
failed to commit one of the prepared foreign xacts, since it could be
not so rare in the distributed system. That's why I tried to get rid of
possible ERRORs as far as possible in my proposed patch.

Regards
--
Alexey Kondratov

Postgres Professional https://www.postgrespro.com
Russian Postgres Company

#29

Masahiko Sawada

masahiko.sawada@2ndquadrant.com

over 5 years ago

In reply to: Alexey Kondratov (#28)

On Wed, 9 Sep 2020 at 02:00, Alexey Kondratov
<a.kondratov@postgrespro.ru> wrote:

On 2020-09-08 14:48, Fujii Masao wrote:

On 2020/09/08 19:36, Alexey Kondratov wrote:

On 2020-09-08 05:49, Fujii Masao wrote:

On 2020/09/05 3:31, Alexey Kondratov wrote:

Attached is a patch, which implements a plain 2PC in the
postgres_fdw and adds a GUC 'postgres_fdw.use_twophase'. Also it
solves these errors handling issues above and tries to add proper
comments everywhere. I think, that 0003 should be rebased on the top
of it, or it could be a first patch in the set, since it may be used
independently. What do you think?

Thanks for the patch!

Sawada-san was proposing another 2PC patch at [1]. Do you have any
thoughts
about pros and cons between your patch and Sawada-san's?

[1]
/messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com

Thank you for the link!

After a quick look on the Sawada-san's patch set I think that there
are two major differences:

Thanks for sharing your thought! As far as I read your patch quickly,
I basically agree with your this view.

1. There is a built-in foreign xacts resolver in the [1], which should
be much more convenient from the end-user perspective. It involves
huge in-core changes and additional complexity that is of course worth
of.

However, it's still not clear for me that it is possible to resolve
all foreign prepared xacts on the Postgres' own side with a 100%
guarantee. Imagine a situation when the coordinator node is actually a
HA cluster group (primary + sync + async replica) and it failed just
after PREPARE stage of after local COMMIT. In that case all foreign
xacts will be left in the prepared state. After failover process
complete synchronous replica will become a new primary. Would it have
all required info to properly resolve orphan prepared xacts?

IIUC, yes, the information required for automatic resolution is
WAL-logged and the standby tries to resolve those orphan transactions
from WAL after the failover. But Sawada-san's patch provides
the special function for manual resolution, so there may be some cases
where manual resolution is necessary.

I've found a note about manual resolution in the v25 0002:
+After that we prepare all foreign transactions by calling
+PrepareForeignTransaction() API. If we failed on any of them we change
to
+rollback, therefore at this time some participants might be prepared
whereas
+some are not prepared. The former foreign transactions need to be
resolved
+using pg_resolve_foreign_xact() manually and the latter ends
transaction
+in one-phase by calling RollbackForeignTransaction() API.
but it's not yet clear for me.

Sorry, the above description in README is out of date. In the v25
patch, it's true that if a backend fails to prepare a transaction on a
foreign server, it’s possible that some foreign transactions are
prepared whereas others are not. But at the end of the transaction
after changing to rollback, the process does rollback (or rollback
prepared) all of them. So the use case of pg_resolve_foreign_xact() is
to resolve orphaned foreign prepared transactions or to resolve a
foreign transaction that is not resolved for some reasons, bugs etc.

Implementing 2PC feature only inside postgres_fdw seems to cause
another issue; COMMIT PREPARED is issued to the remote servers
after marking the local transaction as committed
(i.e., ProcArrayEndTransaction()).

According to the Sawada-san's v25 0002 the logic is pretty much the same
there:

+2. Pre-Commit phase (1st phase of two-phase commit)
+3. Commit locally
+Once we've prepared all of them, commit the transaction locally.
+4. Post-Commit Phase (2nd phase of two-phase commit)

Brief look at the code confirms this scheme. IIUC, AtEOXact_FdwXact /
FdwXactParticipantEndTransaction happens after ProcArrayEndTransaction()
in the CommitTransaction(). Thus, I don't see many difference between
these approach and CallXactCallbacks() usage regarding this point.

Is this safe? This issue happens
because COMMIT PREPARED is issued via
CallXactCallbacks(XACT_EVENT_COMMIT) and that CallXactCallbacks()
is called after ProcArrayEndTransaction().

Once the transaction is committed locally any ERROR (or higher level
message) will be escalated to PANIC.

I think this is true only inside the critical section and it's not
necessarily true for all errors happening after the local commit,
right?

And I do see possible ERROR level
messages in the postgresCommitForeignTransaction() for example:
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+               ereport(ERROR, (errmsg("could not commit transaction on server %s",
+                                                          frstate->server->servername)));
I don't think that it's very convenient to get a PANIC every time we
failed to commit one of the prepared foreign xacts, since it could be
not so rare in the distributed system. That's why I tried to get rid of
possible ERRORs as far as possible in my proposed patch.

In my patch, the second phase of 2PC is executed only by the resolver
process. Therefore, even if an error would happen during committing a
foreign prepared transaction, we just need to relaunch the resolver
process and trying again. During that, the backend process will be
just waiting. If a backend process raises an error after the local
commit, the client will see transaction failure despite the local
transaction having been committed. An error could happen even by
palloc. So the patch uses a background worker to commit prepared
foreign transactions, not by backend itself.

Regards,

--
Masahiko Sawada http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#30

Alexey Kondratov

a.kondratov@postgrespro.ru

over 5 years ago

In reply to: Masahiko Sawada (#29)

On 2020-09-09 08:35, Masahiko Sawada wrote:

On Wed, 9 Sep 2020 at 02:00, Alexey Kondratov
<a.kondratov@postgrespro.ru> wrote:
On 2020-09-08 14:48, Fujii Masao wrote:

IIUC, yes, the information required for automatic resolution is
WAL-logged and the standby tries to resolve those orphan transactions
from WAL after the failover. But Sawada-san's patch provides
the special function for manual resolution, so there may be some cases
where manual resolution is necessary.

I've found a note about manual resolution in the v25 0002:
+After that we prepare all foreign transactions by calling
+PrepareForeignTransaction() API. If we failed on any of them we 
change
to
+rollback, therefore at this time some participants might be prepared
whereas
+some are not prepared. The former foreign transactions need to be
resolved
+using pg_resolve_foreign_xact() manually and the latter ends
transaction
+in one-phase by calling RollbackForeignTransaction() API.
but it's not yet clear for me.
Sorry, the above description in README is out of date. In the v25
patch, it's true that if a backend fails to prepare a transaction on a
foreign server, it’s possible that some foreign transactions are
prepared whereas others are not. But at the end of the transaction
after changing to rollback, the process does rollback (or rollback
prepared) all of them. So the use case of pg_resolve_foreign_xact() is
to resolve orphaned foreign prepared transactions or to resolve a
foreign transaction that is not resolved for some reasons, bugs etc.

OK, thank you for the explanation!

Once the transaction is committed locally any ERROR (or higher level
message) will be escalated to PANIC.

I think this is true only inside the critical section and it's not
necessarily true for all errors happening after the local commit,
right?

It's not actually related to critical section errors escalation. Any
error in the backend after the local commit and
ProcArrayEndTransaction() will try to abort the current transaction and
do RecordTransactionAbort(), but it's too late to do so and PANIC will
be risen:

/*
* Check that we haven't aborted halfway through
RecordTransactionCommit.
*/
if (TransactionIdDidCommit(xid))
elog(PANIC, "cannot abort transaction %u, it was already committed",
xid);

At least that's how I understand it.

And I do see possible ERROR level
messages in the postgresCommitForeignTransaction() for example:
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+               ereport(ERROR, (errmsg("could not commit transaction 
on server %s",
+                                                          
frstate->server->servername)));
I don't think that it's very convenient to get a PANIC every time we
failed to commit one of the prepared foreign xacts, since it could be
not so rare in the distributed system. That's why I tried to get rid
of
possible ERRORs as far as possible in my proposed patch.
In my patch, the second phase of 2PC is executed only by the resolver
process. Therefore, even if an error would happen during committing a
foreign prepared transaction, we just need to relaunch the resolver
process and trying again. During that, the backend process will be
just waiting. If a backend process raises an error after the local
commit, the client will see transaction failure despite the local
transaction having been committed. An error could happen even by
palloc. So the patch uses a background worker to commit prepared
foreign transactions, not by backend itself.

Yes, if it's a background process, then it seems to be safe.

BTW, it seems that I've chosen a wrong thread for posting my patch and
staring a discussion :) Activity from this thread moved to [1]/messages/by-id/2020081009525213277261@highgo.ca and you
solution with built-in resolver is discussed [2]/messages/by-id/CAExHW5uBy9QwjdSO4j82WC4aeW-Q4n2ouoZ1z70o=8Vb0skqYQ@mail.gmail.com. I'll try to take a
look on v25 closely and write to [2]/messages/by-id/CAExHW5uBy9QwjdSO4j82WC4aeW-Q4n2ouoZ1z70o=8Vb0skqYQ@mail.gmail.com instead.

[1]: /messages/by-id/2020081009525213277261@highgo.ca
/messages/by-id/2020081009525213277261@highgo.ca

[2]: /messages/by-id/CAExHW5uBy9QwjdSO4j82WC4aeW-Q4n2ouoZ1z70o=8Vb0skqYQ@mail.gmail.com
/messages/by-id/CAExHW5uBy9QwjdSO4j82WC4aeW-Q4n2ouoZ1z70o=8Vb0skqYQ@mail.gmail.com

Regards
--
Alexey Kondratov

Postgres Professional https://www.postgrespro.com
Russian Postgres Company

#31

Fujii Masao

masao.fujii@oss.nttdata.com

over 5 years ago

In reply to: Alexey Kondratov (#28)

On 2020/09/09 2:00, Alexey Kondratov wrote:

On 2020-09-08 14:48, Fujii Masao wrote:

On 2020/09/08 19:36, Alexey Kondratov wrote:

On 2020-09-08 05:49, Fujii Masao wrote:

On 2020/09/05 3:31, Alexey Kondratov wrote:

Attached is a patch, which implements a plain 2PC in the postgres_fdw and adds a GUC 'postgres_fdw.use_twophase'. Also it solves these errors handling issues above and tries to add proper comments everywhere. I think, that 0003 should be rebased on the top of it, or it could be a first patch in the set, since it may be used independently. What do you think?

Thanks for the patch!

Sawada-san was proposing another 2PC patch at [1]. Do you have any thoughts
about pros and cons between your patch and Sawada-san's?

[1]
/messages/by-id/CA+fd4k4z6_B1ETEvQamwQhu4RX7XsrN5ORL7OhJ4B5B6sW-RgQ@mail.gmail.com

Thank you for the link!

After a quick look on the Sawada-san's patch set I think that there are two major differences:

Thanks for sharing your thought! As far as I read your patch quickly,
I basically agree with your this view.

1. There is a built-in foreign xacts resolver in the [1], which should be much more convenient from the end-user perspective. It involves huge in-core changes and additional complexity that is of course worth of.

However, it's still not clear for me that it is possible to resolve all foreign prepared xacts on the Postgres' own side with a 100% guarantee. Imagine a situation when the coordinator node is actually a HA cluster group (primary + sync + async replica) and it failed just after PREPARE stage of after local COMMIT. In that case all foreign xacts will be left in the prepared state. After failover process complete synchronous replica will become a new primary. Would it have all required info to properly resolve orphan prepared xacts?

IIUC, yes, the information required for automatic resolution is
WAL-logged and the standby tries to resolve those orphan transactions
from WAL after the failover. But Sawada-san's patch provides
the special function for manual resolution, so there may be some cases
where manual resolution is necessary.

I've found a note about manual resolution in the v25 0002:
+After that we prepare all foreign transactions by calling
+PrepareForeignTransaction() API. If we failed on any of them we change to
+rollback, therefore at this time some participants might be prepared whereas
+some are not prepared. The former foreign transactions need to be resolved
+using pg_resolve_foreign_xact() manually and the latter ends transaction
+in one-phase by calling RollbackForeignTransaction() API.
but it's not yet clear for me.

Implementing 2PC feature only inside postgres_fdw seems to cause
another issue; COMMIT PREPARED is issued to the remote servers
after marking the local transaction as committed
(i.e., ProcArrayEndTransaction()).

According to the Sawada-san's v25 0002 the logic is pretty much the same there:

+2. Pre-Commit phase (1st phase of two-phase commit)
+3. Commit locally
+Once we've prepared all of them, commit the transaction locally.
+4. Post-Commit Phase (2nd phase of two-phase commit)

Brief look at the code confirms this scheme. IIUC, AtEOXact_FdwXact / FdwXactParticipantEndTransaction happens after ProcArrayEndTransaction() in the CommitTransaction(). Thus, I don't see many difference between these approach and CallXactCallbacks() usage regarding this point.

IIUC the commit logic in Sawada-san's patch looks like

1. PreCommit_FdwXact()
PREPARE TRANSACTION command is issued

2. RecordTransactionCommit()
2-1. WAL-log the commit record
2-2. Update CLOG
2-3. Wait for sync rep
2-4. FdwXactWaitForResolution()
Wait until COMMIT PREPARED commands are issued to the remote servers and completed.

3. ProcArrayEndTransaction()
4. AtEOXact_FdwXact(true)

So ISTM that the timing of when COMMIT PREPARED is issued
to the remote server is different between the patches.
Am I missing something?

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#32

tsunakawa.takay@fujitsu.com

over 5 years ago

In reply to: Andrey V. Lepikhov (#23)

RE: Global snapshots

Hi Andrey san,

From: Andrey V. Lepikhov <a.lepikhov@postgrespro.ru>> > From: tsunakawa.takay@fujitsu.com <tsunakawa.takay@fujitsu.com>

While Clock-SI seems to be considered the best promising for global

Could you take a look at this patent? I'm afraid this is the Clock-SI for MVCC.

Microsoft holds this until 2031. I couldn't find this with the keyword
"Clock-SI.""

US8356007B2 - Distributed transaction management for database systems

with multiversioning - Google Patents

https://patents.google.com/patent/US8356007

If it is, can we circumvent this patent?

Thank you for the research (and previous links too).
I haven't seen this patent before. This should be carefully studied.

I wanted to ask about this after I've published the revised scale-out design wiki, but I'm taking too long, so could you share your study results? I think we need to make it clear about the patent before discussing the code. After we hear your opinion, we also have to check to see if Clock-SI is patented or avoid it by modifying part of the algorithm. Just in case we cannot use it, we have to proceed with thinking about alternatives.

Regards
Takayuki Tsunakawa

#33

Fujii Masao

masao.fujii@oss.nttdata.com

over 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#32)

On 2020/09/10 10:38, tsunakawa.takay@fujitsu.com wrote:

Hi Andrey san,

From: Andrey V. Lepikhov <a.lepikhov@postgrespro.ru>> > From: tsunakawa.takay@fujitsu.com <tsunakawa.takay@fujitsu.com>

While Clock-SI seems to be considered the best promising for global

Could you take a look at this patent? I'm afraid this is the Clock-SI for MVCC.

Microsoft holds this until 2031. I couldn't find this with the keyword
"Clock-SI.""

US8356007B2 - Distributed transaction management for database systems

with multiversioning - Google Patents

https://patents.google.com/patent/US8356007

If it is, can we circumvent this patent?

Thank you for the research (and previous links too).
I haven't seen this patent before. This should be carefully studied.

I wanted to ask about this after I've published the revised scale-out design wiki, but I'm taking too long, so could you share your study results? I think we need to make it clear about the patent before discussing the code.

Yes.

But I'm concerned about that it's really hard to say there is no patent risk
around that. I'm not sure who can judge there is no patent risk,
in the community. Maybe no one? Anyway, I was thinking that Google Spanner,
YugabyteDB, etc use the global transaction approach based on the clock
similar to Clock-SI. Since I've never heard they have the patent issues,
I was just thinking Clock-SI doesn't have. No? This type of *guess* is not
safe, though...

After we hear your opinion, we also have to check to see if Clock-SI is patented or avoid it by modifying part of the algorithm. Just in case we cannot use it, we have to proceed with thinking about alternatives.

One alternative is to add only hooks into PostgreSQL core so that we can
implement the global transaction management outside. This idea was
discussed before as the title "eXtensible Transaction Manager API".

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#34

tsunakawa.takay@fujitsu.com

over 5 years ago

In reply to: Fujii Masao (#33)

RE: Global snapshots

From: Fujii Masao <masao.fujii@oss.nttdata.com>

But I'm concerned about that it's really hard to say there is no patent risk
around that. I'm not sure who can judge there is no patent risk,
in the community. Maybe no one? Anyway, I was thinking that Google Spanner,
YugabyteDB, etc use the global transaction approach based on the clock
similar to Clock-SI. Since I've never heard they have the patent issues,
I was just thinking Clock-SI doesn't have. No? This type of *guess* is not
safe, though...

Hm, it may be difficult to be sure that the algorithm does not violate a patent. But it may not be difficult to know if the algorithm apparently violates a patent or is highly likely (for those who know Clock-SI well.) At least, Andrey-san seems to have felt that it needs careful study, so I guess he had some hunch.

I understand this community is sensitive to patents. After the discussions at and after PGCon 2018, the community concluded that it won't accept patented technology. In the distant past, the community released Postgres 8.0 that contains an IBM's pending patent ARC, and removed it in 8.0.2. I wonder how could this could be detected, and how hard to cope with the patent issue. Bruce warned that we should be careful not to violate Greenplum's patents.

E.25. Release 8.0.2
https://www.postgresql.org/docs/8.0/release-8-0-2.html
--------------------------------------------------
New cache management algorithm 2Q replaces ARC (Tom)
This was done to avoid a pending US patent on ARC. The 2Q code might be a few percentage points slower than ARC for some work loads. A better cache management algorithm will appear in 8.1.
--------------------------------------------------

I think I'll try to contact the people listed in Clock-SI paper and the Microsoft patent to ask about this. I'm going to have a late summer vacation next week, so this is my summer homework?

One alternative is to add only hooks into PostgreSQL core so that we can
implement the global transaction management outside. This idea was
discussed before as the title "eXtensible Transaction Manager API".

Yeah, I read that discussion. And I remember Robert Haas and Postgres Pro people said it's not good...

Regards
Takayuki Tsunakawa

#35

Fujii Masao

masao.fujii@oss.nttdata.com

over 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#34)

On 2020/09/10 18:01, tsunakawa.takay@fujitsu.com wrote:

From: Fujii Masao <masao.fujii@oss.nttdata.com>

But I'm concerned about that it's really hard to say there is no patent risk
around that. I'm not sure who can judge there is no patent risk,
in the community. Maybe no one? Anyway, I was thinking that Google Spanner,
YugabyteDB, etc use the global transaction approach based on the clock
similar to Clock-SI. Since I've never heard they have the patent issues,
I was just thinking Clock-SI doesn't have. No? This type of *guess* is not
safe, though...

Hm, it may be difficult to be sure that the algorithm does not violate a patent. But it may not be difficult to know if the algorithm apparently violates a patent or is highly likely (for those who know Clock-SI well.) At least, Andrey-san seems to have felt that it needs careful study, so I guess he had some hunch.

I understand this community is sensitive to patents. After the discussions at and after PGCon 2018, the community concluded that it won't accept patented technology. In the distant past, the community released Postgres 8.0 that contains an IBM's pending patent ARC, and removed it in 8.0.2. I wonder how could this could be detected, and how hard to cope with the patent issue. Bruce warned that we should be careful not to violate Greenplum's patents.

E.25. Release 8.0.2
https://www.postgresql.org/docs/8.0/release-8-0-2.html
--------------------------------------------------
New cache management algorithm 2Q replaces ARC (Tom)
This was done to avoid a pending US patent on ARC. The 2Q code might be a few percentage points slower than ARC for some work loads. A better cache management algorithm will appear in 8.1.
--------------------------------------------------

I think I'll try to contact the people listed in Clock-SI paper and the Microsoft patent to ask about this.

Thanks!

I'm going to have a late summer vacation next week, so this is my summer homework?

One alternative is to add only hooks into PostgreSQL core so that we can
implement the global transaction management outside. This idea was
discussed before as the title "eXtensible Transaction Manager API".

Yeah, I read that discussion. And I remember Robert Haas and Postgres Pro people said it's not good...

But it may be worth revisiting this idea if we cannot avoid the patent issue.

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#36

Alexey Kondratov

a.kondratov@postgrespro.ru

over 5 years ago

In reply to: Fujii Masao (#31)

On 2020-09-09 20:29, Fujii Masao wrote:

On 2020/09/09 2:00, Alexey Kondratov wrote:
According to the Sawada-san's v25 0002 the logic is pretty much the
same there:

+2. Pre-Commit phase (1st phase of two-phase commit)
+3. Commit locally
+Once we've prepared all of them, commit the transaction locally.
+4. Post-Commit Phase (2nd phase of two-phase commit)

Brief look at the code confirms this scheme. IIUC, AtEOXact_FdwXact /
FdwXactParticipantEndTransaction happens after
ProcArrayEndTransaction() in the CommitTransaction(). Thus, I don't
see many difference between these approach and CallXactCallbacks()
usage regarding this point.
IIUC the commit logic in Sawada-san's patch looks like

1. PreCommit_FdwXact()
PREPARE TRANSACTION command is issued

2. RecordTransactionCommit()
2-1. WAL-log the commit record
2-2. Update CLOG
2-3. Wait for sync rep
2-4. FdwXactWaitForResolution()
Wait until COMMIT PREPARED commands are issued to the
remote servers and completed.

3. ProcArrayEndTransaction()
4. AtEOXact_FdwXact(true)

So ISTM that the timing of when COMMIT PREPARED is issued
to the remote server is different between the patches.
Am I missing something?

No, you are right, sorry. At a first glance I thought that
AtEOXact_FdwXact is responsible for COMMIT PREPARED as well, but it is
only calling FdwXactParticipantEndTransaction in the abort case.

Regards
--
Alexey Kondratov

Postgres Professional https://www.postgrespro.com
Russian Postgres Company

#37

Amit Kapila

amit.kapila16@gmail.com

over 5 years ago

In reply to: Fujii Masao (#35)

On Thu, Sep 10, 2020 at 4:20 PM Fujii Masao <masao.fujii@oss.nttdata.com> wrote:

One alternative is to add only hooks into PostgreSQL core so that we can
implement the global transaction management outside. This idea was
discussed before as the title "eXtensible Transaction Manager API".

Yeah, I read that discussion. And I remember Robert Haas and Postgres Pro people said it's not good...

But it may be worth revisiting this idea if we cannot avoid the patent issue.

It is not very clear what exactly we can do about the point raised by
Tsunakawa-San related to patent in this technology as I haven't seen
that discussed during other development but maybe we can try to study
a bit. One more thing I would like to bring here is that it seems to
be there have been some concerns about this idea when originally
discussed [1]/messages/by-id/21BC916B-80A1-43BF-8650-3363CCDAE09C@postgrespro.ru. It is not very clear to me if all the concerns are
addressed or not. If one can summarize the concerns discussed and how
the latest patch is able to address those then it will be great.

Also, I am not sure but maybe global deadlock detection also needs to
be considered as that also seems to be related because it depends on
how we manage global transactions. We need to prevent deadlock among
transaction operations spanned across multiple nodes. Say a
transaction T-1 has updated row r-1 of tbl-1 on node-1 and tries to
update row r-1 of tbl-2 on node n-2. Similarly, a transaction T-2
tries to perform those two operations in reverse order. Now, this will
lead to the deadlock that spans across multiple nodes and our current
deadlock detector doesn't have that capability. Having some form of
global/distributed transaction id might help to resolve it but not
sure how it can be solved with this clock-si based algorithm.

As all these problems are related, that is why I am insisting on this
thread and other thread "Transactions involving multiple postgres
foreign servers" [2]/messages/by-id/CAA4eK1J86S=meivVsH+oy=TwUC+yr9jj2VtmmqMfYRmgs2JzUA@mail.gmail.com to have a high-level idea on how the distributed
transaction management will work before we decide on a particular
approach and commit one part of that patch.

[1]: /messages/by-id/21BC916B-80A1-43BF-8650-3363CCDAE09C@postgrespro.ru
[2]: /messages/by-id/CAA4eK1J86S=meivVsH+oy=TwUC+yr9jj2VtmmqMfYRmgs2JzUA@mail.gmail.com

--
With Regards,
Amit Kapila.

#38

Bruce Momjian

bruce@momjian.us

over 5 years ago

In reply to: Alexey Kondratov (#26)

On Tue, Sep 8, 2020 at 01:36:16PM +0300, Alexey Kondratov wrote:

Thank you for the link!

After a quick look on the Sawada-san's patch set I think that there are two
major differences:

1. There is a built-in foreign xacts resolver in the [1], which should be
much more convenient from the end-user perspective. It involves huge in-core
changes and additional complexity that is of course worth of.

However, it's still not clear for me that it is possible to resolve all
foreign prepared xacts on the Postgres' own side with a 100% guarantee.
Imagine a situation when the coordinator node is actually a HA cluster group
(primary + sync + async replica) and it failed just after PREPARE stage of
after local COMMIT. In that case all foreign xacts will be left in the
prepared state. After failover process complete synchronous replica will
become a new primary. Would it have all required info to properly resolve
orphan prepared xacts?

Probably, this situation is handled properly in the [1], but I've not yet
finished a thorough reading of the patch set, though it has a great doc!

On the other hand, previous 0003 and my proposed patch rely on either manual
resolution of hung prepared xacts or usage of external monitor/resolver.
This approach is much simpler from the in-core perspective, but doesn't look
as complete as [1] though.

Have we considered how someone would clean up foreign transactions if the
coordinating server dies? Could it be done manually? Would an external
resolver, rather than an internal one, make this easier?

--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EnterpriseDB https://enterprisedb.com

The usefulness of a cup is in its emptiness, Bruce Lee

#39

Alexey Kondratov

a.kondratov@postgrespro.ru

over 5 years ago

In reply to: Bruce Momjian (#38)

On 2020-09-18 00:54, Bruce Momjian wrote:

On Tue, Sep 8, 2020 at 01:36:16PM +0300, Alexey Kondratov wrote:

Thank you for the link!

After a quick look on the Sawada-san's patch set I think that there
are two
major differences:

1. There is a built-in foreign xacts resolver in the [1], which should
be
much more convenient from the end-user perspective. It involves huge
in-core
changes and additional complexity that is of course worth of.

However, it's still not clear for me that it is possible to resolve
all
foreign prepared xacts on the Postgres' own side with a 100%
guarantee.
Imagine a situation when the coordinator node is actually a HA cluster
group
(primary + sync + async replica) and it failed just after PREPARE
stage of
after local COMMIT. In that case all foreign xacts will be left in the
prepared state. After failover process complete synchronous replica
will
become a new primary. Would it have all required info to properly
resolve
orphan prepared xacts?

Probably, this situation is handled properly in the [1], but I've not
yet
finished a thorough reading of the patch set, though it has a great
doc!

On the other hand, previous 0003 and my proposed patch rely on either
manual
resolution of hung prepared xacts or usage of external
monitor/resolver.
This approach is much simpler from the in-core perspective, but
doesn't look
as complete as [1] though.

Have we considered how someone would clean up foreign transactions if
the
coordinating server dies? Could it be done manually? Would an
external
resolver, rather than an internal one, make this easier?

Both Sawada-san's patch [1]/messages/by-id/CA+fd4k4HOVqqC5QR4H984qvD0Ca9g=1oLYdrJT_18zP9t+UsJg@mail.gmail.com and in this thread (e.g. mine [2]/messages/by-id/3ef7877bfed0582019eab3d462a43275@postgrespro.ru) use 2PC
with a special gid format including a xid + server identification info.
Thus, one can select from pg_prepared_xacts, get xid and coordinator
info, then use txid_status() on the coordinator (or ex-coordinator) to
get transaction status and finally either commit or abort these stale
prepared xacts. Of course this could be wrapped into some user-level
support routines as it is done in the [1]/messages/by-id/CA+fd4k4HOVqqC5QR4H984qvD0Ca9g=1oLYdrJT_18zP9t+UsJg@mail.gmail.com.

As for the benefits of using an external resolver, I think that there
are some of them from the whole system perspective:

1) If one follows the logic above, then this resolver could be
stateless, it takes all the required info from the Postgres nodes
themselves.

2) Then you can easily put it into container, which make it easier do
deploy to all these 'cloud' stuff like kubernetes.

3) Also you can scale resolvers independently from Postgres nodes.

I do not think that either of these points is a game changer, but we use
a very simple external resolver altogether with [2]/messages/by-id/3ef7877bfed0582019eab3d462a43275@postgrespro.ru in our sharding
prototype and it works just fine so far.

[1]: /messages/by-id/CA+fd4k4HOVqqC5QR4H984qvD0Ca9g=1oLYdrJT_18zP9t+UsJg@mail.gmail.com
/messages/by-id/CA+fd4k4HOVqqC5QR4H984qvD0Ca9g=1oLYdrJT_18zP9t+UsJg@mail.gmail.com

[2]: /messages/by-id/3ef7877bfed0582019eab3d462a43275@postgrespro.ru
/messages/by-id/3ef7877bfed0582019eab3d462a43275@postgrespro.ru

--
Alexey Kondratov

Postgres Professional https://www.postgrespro.com
Russian Postgres Company

#40

tsunakawa.takay@fujitsu.com

over 5 years ago

In reply to: Andrey V. Lepikhov (#23)

RE: Global snapshots

Hi Andrey-san, all,

From: Andrey V. Lepikhov <a.lepikhov@postgrespro.ru>

On 7/27/20 11:22 AM, tsunakawa.takay@fujitsu.com wrote:

Could you take a look at this patent? I'm afraid this is the Clock-SI for MVCC.

Microsoft holds this until 2031. I couldn't find this with the keyword
"Clock-SI.""

US8356007B2 - Distributed transaction management for database systems

with multiversioning - Google Patents

https://patents.google.com/patent/US8356007

If it is, can we circumvent this patent?

I haven't seen this patent before. This should be carefully studied.

I contacted 6 people individually, 3 holders of the patent and different 3 authors of the Clock-SI paper. I got replies from two people. (It's a regret I couldn't get a reply from the main author of Clock-SI paper.)

[Reply from the patent holder Per-Ake Larson]
--------------------------------------------------
Thanks for your interest in my patent.

The answer to your question is: No, Clock-SI is not based on the patent - it was an entirely independent development. The two approaches are similar in the sense that there is no global clock, the commit time of a distributed transaction is the same in every partition where it modified data, and a transaction gets it snapshot timestamp from a local clock. The difference is whether a distributed transaction gets its commit timestamp before or after the prepare phase in 2PC.

Hope this helpful.

Best regards,
Per-Ake
--------------------------------------------------

[Reply from the Clock-SI author Willy Zwaenepoel]
--------------------------------------------------
Thank you for your kind words about our work.

I was unaware of this patent at the time I wrote the paper. The two came out more or less at the same time.

I am not a lawyer, so I cannot tell you if something based on Clock-SI would infringe on the Microsoft patent. The main distinction to me seems to be that Clock-SI is based on physical clocks, while the Microsoft patent talks about logical clocks, but again I am not a lawyer.

Best regards,

Willy.
--------------------------------------------------

Does this make sense from your viewpoint, and can we think that we can use Clock-SI without infrindging on the patent? According to the patent holder, the difference between Clock-SI and the patent seems to be fewer than the similarities.

Regards
Takayuki Tsunakawa

#41

Andrey Lepikhov

a.lepikhov@postgrespro.ru

over 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#40)

22.09.2020 03:47, tsunakawa.takay@fujitsu.com пишет:

Does this make sense from your viewpoint, and can we think that we can use Clock-SI without infrindging on the patent? According to the patent holder, the difference between Clock-SI and the patent seems to be fewer than the similarities.

Thank you for this work!
As I can see, main development difficulties placed in other areas: CSN,
resolver, global deadlocks, 2PC commit... I'm not lawyer too. But if we
get remarks from the patent holders, we can rewrite our Clock-SI
implementation.

--
regards,
Andrey Lepikhov
Postgres Professional

#42

tsunakawa.takay@fujitsu.com

over 5 years ago

In reply to: Andrey Lepikhov (#41)

RE: Global snapshots

From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>

Thank you for this work!
As I can see, main development difficulties placed in other areas: CSN, resolver,
global deadlocks, 2PC commit... I'm not lawyer too. But if we get remarks from
the patent holders, we can rewrite our Clock-SI implementation.

Yeah, I understand your feeling. I personally don't want like patents, and don't want to be disturbed by them. But the world is not friendly... We are not a lawyer, but we have to do our best to make sure PostgreSQL will be patent-free by checking the technologies as engineers.

Among the above items, CSN is the only concerning one. Other items are written in textbooks, well-known, and used in other DBMSs, so they should be free from patents. However, CSN is not (at least to me.) Have you checked if CSN is not related to some patent? Or is CSN or similar technology already widely used in famous software and we can regard it as patent-free?

And please wait. As below, the patent holder just says that Clock-SI is not based on the patent and an independent development. He doesn't say Clock-SI does not overlap with the patent or implementing Clock-SI does not infringe on the patent. Rather, he suggests that Clock-SI has many similarities and thus those may match the claims of the patent (unintentionally?) I felt this is a sign of risking infringement.

"The answer to your question is: No, Clock-SI is not based on the patent - it was an entirely independent development. The two approaches are similar in the sense that there is no global clock, the commit time of a distributed transaction is the same in every partition where it modified data, and a transaction gets it snapshot timestamp from a local clock. The difference is whether a distributed transaction gets its commit timestamp before or after the prepare phase in 2PC."

The timeline of events also worries me. It seems unnatural to consider that Clock-SI and the patent are independent.

2010/6 - 2010/9 One Clock-SI author worked for Microsoft Research as an research intern
2010/10 Microsoft filed the patent
2011/9 - 2011/12 The same Clock-SI author worked for Microsoft Research as an research intern
2013 The same author moved to EPFL and published the Clock-SI paper with another author who has worked for Microsoft Research since then.

So, could you give your opinion whether we can use Clock-SI without overlapping with the patent claims? I also will try to check and see, so that I can understand your technical analysis.

And I've just noticed that I got in touch with another author of Clock-SI via SNS, and sent an inquiry to him. I'll report again when I have a reply.

Regards
Takayuki Tsunakawa

#43

tsunakawa.takay@fujitsu.com

over 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#42)

RE: Global snapshots

Hi Andrey san, all,

From: tsunakawa.takay@fujitsu.com <tsunakawa.takay@fujitsu.com>

And please wait. As below, the patent holder just says that Clock-SI is not
based on the patent and an independent development. He doesn't say
Clock-SI does not overlap with the patent or implementing Clock-SI does not
infringe on the patent. Rather, he suggests that Clock-SI has many
similarities and thus those may match the claims of the patent
(unintentionally?) I felt this is a sign of risking infringement.

"The answer to your question is: No, Clock-SI is not based on the patent - it
was an entirely independent development. The two approaches are similar in
the sense that there is no global clock, the commit time of a distributed
transaction is the same in every partition where it modified data, and a
transaction gets it snapshot timestamp from a local clock. The difference is
whether a distributed transaction gets its commit timestamp before or after the
prepare phase in 2PC."

The timeline of events also worries me. It seems unnatural to consider that
Clock-SI and the patent are independent.

2010/6 - 2010/9 One Clock-SI author worked for Microsoft Research as
an research intern
2010/10 Microsoft filed the patent
2011/9 - 2011/12 The same Clock-SI author worked for Microsoft
Research as an research intern
2013 The same author moved to EPFL and published the Clock-SI paper
with another author who has worked for Microsoft Research since then.

So, could you give your opinion whether we can use Clock-SI without
overlapping with the patent claims? I also will try to check and see, so that I
can understand your technical analysis.

And I've just noticed that I got in touch with another author of Clock-SI via SNS,
and sent an inquiry to him. I'll report again when I have a reply.

I got a reply from the main author of the Clock-SI paper:

[Reply from the Clock-SI author Jiaqing Du]
--------------------------------------------------
Thanks for reaching out.

I actually did not know that Microsoft wrote a patent which is similar to the ideas in my paper. I worked there as an intern. My Clock-SI paper was done at my school (EPFL) after my internships at Microsoft. The paper was very loosely related to my internship project at Microsoft. In a sense, the internship project at Microsoft inspired me to work on Clock-SI after I finished the internship. As you see in the paper, my coauthor, who is my internship host, is also from Microsoft, but interestingly he is not on the patent :)

Cheers,
Jiaqing
--------------------------------------------------

Unfortunately, he also did not assert that Clock-SI does not infringe on the patent. Rather, worrying words are mixed: "similar to my ideas", "loosely related", "inspired".

Also, his internship host is the co-author of the Clock-SI paper. That person should be Sameh Elnikety, who has been working for Microsoft Research. I also asked him about the same question, but he has been silent for about 10 days.

When I had a quick look, the patent appeared to be broader than Clock-SI, and Clock-SI is a concrete application of the patent. This is just my guess, but Sameh Elnikety had known the patent and set an internship theme at Microsoft or the research subject at EPFL based on it, whether he was aware or not.

As of now, it seems that the Clock-SI needs to be evaluated against the patent claims by two or more persons -- one from someone who knows Clock-SI well and implemented it for Postgres (Andrey-san?), and someone else who shares little benefit with the former person and can see it objectively.

Regards
Takayuki Tsunakawa

#44

Fujii Masao

masao.fujii@oss.nttdata.com

about 5 years ago

In reply to: Amit Kapila (#37)

On 2020/09/17 15:56, Amit Kapila wrote:

On Thu, Sep 10, 2020 at 4:20 PM Fujii Masao <masao.fujii@oss.nttdata.com> wrote:

One alternative is to add only hooks into PostgreSQL core so that we can
implement the global transaction management outside. This idea was
discussed before as the title "eXtensible Transaction Manager API".

Yeah, I read that discussion. And I remember Robert Haas and Postgres Pro people said it's not good...

But it may be worth revisiting this idea if we cannot avoid the patent issue.

It is not very clear what exactly we can do about the point raised by
Tsunakawa-San related to patent in this technology as I haven't seen
that discussed during other development but maybe we can try to study
a bit. One more thing I would like to bring here is that it seems to
be there have been some concerns about this idea when originally
discussed [1]. It is not very clear to me if all the concerns are
addressed or not. If one can summarize the concerns discussed and how
the latest patch is able to address those then it will be great.

I have one concern about Clock-SI (sorry if this concern was already
discussed in the past). As far as I read the paper about Clock-SI, ISTM that
Tx2 that starts after Tx1's commit can fail to see the results by Tx1,
due to the clock skew. Please see the following example;

1. Tx1 starts at the server A.

2. Tx1 writes some records at the server A.

3. Tx1 gets the local clock 20, uses 20 as CommitTime, then completes
the commit at the server A.
This means that Tx1 is the local transaction, not distributed one.

4. Tx2 starts at the server B, i.e., the server B works as
the coordinator node for Tx2.

5. Tx2 gets the local clock 10 (i.e., it's delayed behind the server A
due to clock skew) and uses 10 as SnapshotTime at the server B.

6. Tx2 starts the remote transaction at the server A with SnapshotTime 10.

7. Tx2 doesn't need to wait due to clock skew because the imported
SnapshotTime 10 is smaller than the local clock at the server A.

8. Tx2 fails to see the records written by Tx1 at the server A because
Tx1's CommitTime 20 is larger than SnapshotTime 10.

So Tx1 was successfully committed before Tx2 starts. But, at the above example,
the subsequent transaction Tx2 fails to see the committed results.

The single PostgreSQL instance seems to guarantee that linearizability of
the transactions, but Clock-SI doesn't in the distributed env. Is this my
understanding right? Or am I missing something?

If my understanding is right, shouldn't we address that issue when using
Clock-SI? Or the patch has already addressed the issue?

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#45

Masahiko Sawada

masahiko.sawada@2ndquadrant.com

about 5 years ago

In reply to: Fujii Masao (#44)

On Thu, 15 Oct 2020 at 01:41, Fujii Masao <masao.fujii@oss.nttdata.com> wrote:

On 2020/09/17 15:56, Amit Kapila wrote:

On Thu, Sep 10, 2020 at 4:20 PM Fujii Masao <masao.fujii@oss.nttdata.com> wrote:

One alternative is to add only hooks into PostgreSQL core so that we can
implement the global transaction management outside. This idea was
discussed before as the title "eXtensible Transaction Manager API".

Yeah, I read that discussion. And I remember Robert Haas and Postgres Pro people said it's not good...

But it may be worth revisiting this idea if we cannot avoid the patent issue.

It is not very clear what exactly we can do about the point raised by
Tsunakawa-San related to patent in this technology as I haven't seen
that discussed during other development but maybe we can try to study
a bit. One more thing I would like to bring here is that it seems to
be there have been some concerns about this idea when originally
discussed [1]. It is not very clear to me if all the concerns are
addressed or not. If one can summarize the concerns discussed and how
the latest patch is able to address those then it will be great.

I have one concern about Clock-SI (sorry if this concern was already
discussed in the past). As far as I read the paper about Clock-SI, ISTM that
Tx2 that starts after Tx1's commit can fail to see the results by Tx1,
due to the clock skew. Please see the following example;

1. Tx1 starts at the server A.

2. Tx1 writes some records at the server A.

3. Tx1 gets the local clock 20, uses 20 as CommitTime, then completes
the commit at the server A.
This means that Tx1 is the local transaction, not distributed one.

4. Tx2 starts at the server B, i.e., the server B works as
the coordinator node for Tx2.

5. Tx2 gets the local clock 10 (i.e., it's delayed behind the server A
due to clock skew) and uses 10 as SnapshotTime at the server B.

6. Tx2 starts the remote transaction at the server A with SnapshotTime 10.

7. Tx2 doesn't need to wait due to clock skew because the imported
SnapshotTime 10 is smaller than the local clock at the server A.

8. Tx2 fails to see the records written by Tx1 at the server A because
Tx1's CommitTime 20 is larger than SnapshotTime 10.

So Tx1 was successfully committed before Tx2 starts. But, at the above example,
the subsequent transaction Tx2 fails to see the committed results.

The single PostgreSQL instance seems to guarantee that linearizability of
the transactions, but Clock-SI doesn't in the distributed env. Is this my
understanding right? Or am I missing something?

If my understanding is right, shouldn't we address that issue when using
Clock-SI? Or the patch has already addressed the issue?

As far as I read the paper, the above scenario can happen. I could
reproduce the above scenario with the patch. Moreover, a stale read
could happen even if Tx1 was initiated at server B (i.g., both
transactions started at the same server in sequence). In this case,
Tx1's commit timestamp would be 20 taken from server A's local clock
whereas Tx2's snapshot timestamp would be 10 same as the above case.
Therefore, in spite of both transactions were initiated at the same
server the linearizability is not provided.

Regards,

--
Masahiko Sawada http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#46

Fujii Masao

masao.fujii@oss.nttdata.com

about 5 years ago

In reply to: Masahiko Sawada (#45)

On 2020/10/23 11:58, Masahiko Sawada wrote:

On Thu, 15 Oct 2020 at 01:41, Fujii Masao <masao.fujii@oss.nttdata.com> wrote:

On 2020/09/17 15:56, Amit Kapila wrote:

On Thu, Sep 10, 2020 at 4:20 PM Fujii Masao <masao.fujii@oss.nttdata.com> wrote:

One alternative is to add only hooks into PostgreSQL core so that we can
implement the global transaction management outside. This idea was
discussed before as the title "eXtensible Transaction Manager API".

Yeah, I read that discussion. And I remember Robert Haas and Postgres Pro people said it's not good...

But it may be worth revisiting this idea if we cannot avoid the patent issue.

It is not very clear what exactly we can do about the point raised by
Tsunakawa-San related to patent in this technology as I haven't seen
that discussed during other development but maybe we can try to study
a bit. One more thing I would like to bring here is that it seems to
be there have been some concerns about this idea when originally
discussed [1]. It is not very clear to me if all the concerns are
addressed or not. If one can summarize the concerns discussed and how
the latest patch is able to address those then it will be great.

I have one concern about Clock-SI (sorry if this concern was already
discussed in the past). As far as I read the paper about Clock-SI, ISTM that
Tx2 that starts after Tx1's commit can fail to see the results by Tx1,
due to the clock skew. Please see the following example;

1. Tx1 starts at the server A.

2. Tx1 writes some records at the server A.

3. Tx1 gets the local clock 20, uses 20 as CommitTime, then completes
the commit at the server A.
This means that Tx1 is the local transaction, not distributed one.

4. Tx2 starts at the server B, i.e., the server B works as
the coordinator node for Tx2.

5. Tx2 gets the local clock 10 (i.e., it's delayed behind the server A
due to clock skew) and uses 10 as SnapshotTime at the server B.

6. Tx2 starts the remote transaction at the server A with SnapshotTime 10.

7. Tx2 doesn't need to wait due to clock skew because the imported
SnapshotTime 10 is smaller than the local clock at the server A.

8. Tx2 fails to see the records written by Tx1 at the server A because
Tx1's CommitTime 20 is larger than SnapshotTime 10.

So Tx1 was successfully committed before Tx2 starts. But, at the above example,
the subsequent transaction Tx2 fails to see the committed results.

The single PostgreSQL instance seems to guarantee that linearizability of
the transactions, but Clock-SI doesn't in the distributed env. Is this my
understanding right? Or am I missing something?

If my understanding is right, shouldn't we address that issue when using
Clock-SI? Or the patch has already addressed the issue?

As far as I read the paper, the above scenario can happen. I could
reproduce the above scenario with the patch. Moreover, a stale read
could happen even if Tx1 was initiated at server B (i.g., both
transactions started at the same server in sequence). In this case,
Tx1's commit timestamp would be 20 taken from server A's local clock
whereas Tx2's snapshot timestamp would be 10 same as the above case.
Therefore, in spite of both transactions were initiated at the same
server the linearizability is not provided.

Yeah, so if we need to guarantee the transaction linearizability even
in distributed env (probably this is yes. Right?), using only Clock-SI
is not enough. We would need to implement something more
in addition to Clock-SI or adopt the different approach other than Clock-SI.
Thought?

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#47

tsunakawa.takay@fujitsu.com

about 5 years ago

In reply to: Fujii Masao (#46)

RE: Global snapshots

Fujii-san, Sawada-san, all,

From: Fujii Masao <masao.fujii@oss.nttdata.com>

Yeah, so if we need to guarantee the transaction linearizability even
in distributed env (probably this is yes. Right?), using only Clock-SI
is not enough. We would need to implement something more
in addition to Clock-SI or adopt the different approach other than Clock-SI.
Thought?

Could you please try interpreting MVCO and see if we have any hope in this? This doesn't fit in my small brain. I'll catch up with understanding this when I have time.

MVCO - Technical report - IEEE RIDE-IMS 93 (PDF; revised version of DEC-TR 853)
https://sites.google.com/site/yoavraz2/MVCO-WDE.pdf

MVCO is a multiversion member of Commitment Ordering algorithms described below:

Commitment ordering (CO) - yoavraz2
https://sites.google.com/site/yoavraz2/the_principle_of_co

Commitment ordering - Wikipedia
https://en.wikipedia.org/wiki/Commitment_ordering

Related patents are as follows. The last one is MVCO.

US5504900A - Commitment ordering for guaranteeing serializability across distributed transactions
https://patents.google.com/patent/US5504900A/en?oq=US5504900

US5504899A - Guaranteeing global serializability by applying commitment ordering selectively to global transactions
https://patents.google.com/patent/US5504899A/en?oq=US5504899

US5701480A - Distributed multi-version commitment ordering protocols for guaranteeing serializability during transaction processing
https://patents.google.com/patent/US5701480A/en?oq=US5701480

Regards
Takayuki Tsunakawa

#48

tsunakawa.takay@fujitsu.com

about 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#47)

RE: Global snapshots

Hello,

Fujii-san and I discussed how to move the scale-out development forward. We are both worried that Clock-SI is (highly?) likely to infringe the said Microsoft's patent. So we agreed we are going to investigate the Clock-SI and the patent, and if we have to conclude that we cannot embrace Clock-SI, we will explore other possibilities.

IMO, it seems that Clock-SI overlaps with the patent and we can't use it. First, looking back how to interpret the patent document, patent "claims" are what we should pay our greatest attention. According to the following citation from the IP guide by Software Freedom Law Center (SFLC) [1]A Legal Issues Primer for Open Source and Free Software Projects https://www.softwarefreedom.org/resources/2008/foss-primer.pdf, software infringes a patent if it implements everything of any claim, not all claims.

--------------------------------------------------
4.2 Patent Infringement
To prove that you5 infringe a patent, the patent holder must show that you make, use, offer to sell, or sell the invention as it is defined in at least one claim of the patent.

For software to infringe a patent, the software essentially must implement everything recited in one of the patent�fs claims. It is crucial to recognize that infringement is based directly on the claims of the patent, and not on what is stated or described in other parts of the patent document.
--------------------------------------------------

And, Clock-SI implements at least claims 11 and 20 cited below. It doesn't matter whether Clock-SI uses a physical clock or logical one.

--------------------------------------------------
11. A method comprising:
receiving information relating to a distributed database transaction operating on data in data stores associated with respective participating nodes associated with the distributed database transaction;
requesting commit time votes from the respective participating nodes, the commit time votes reflecting local clock values of the respective participating nodes;
receiving the commit time votes from the respective participating nodes in response to the requesting;
computing a global commit timestamp for the distributed database transaction based at least in part on the commit time votes, the global commit timestamp reflecting a maximum value of the commit time votes received from the respective participating nodes; and
synchronizing commitment of the distributed database transaction at the respective participating nodes to the global commit timestamp,
wherein at least the computing is performed by a computing device.

20. A method for managing a distributed database transaction, the method comprising:
receiving information relating to the distributed database transaction from a transaction coordinator associated with the distributed database transaction;
determining a commit time vote for the distributed database transaction based at least in part on a local clock;
communicating the commit time vote for the distributed database transaction to the transaction coordinator;
receiving a global commit timestamp from the transaction coordinator;
synchronizing commitment of the distributed database transaction to the global commit timestamp;
receiving a remote request from a requesting database node corresponding to the distributed database transaction;
creating a local transaction corresponding to the distributed database transaction;
compiling a list of database nodes involved in generating a result of the local transaction and access types utilized by respective database nodes in the list of database nodes; and
returning the list of database nodes and the access types to the requesting database node in response to the remote request,
wherein at least the compiling is performed by a computing device.
--------------------------------------------------

My question is that the above claims appear to cover somewhat broad range. I wonder if other patents or unpatented technologies overlap with this kind of description.

Thoughts?

[1]: A Legal Issues Primer for Open Source and Free Software Projects https://www.softwarefreedom.org/resources/2008/foss-primer.pdf
A Legal Issues Primer for Open Source and Free Software Projects
https://www.softwarefreedom.org/resources/2008/foss-primer.pdf

[2]: US8356007B2 - Distributed transaction management for database systems with multiversioning - Google Patents https://patents.google.com/patent/US8356007
US8356007B2 - Distributed transaction management for database systems with multiversioning - Google Patents
https://patents.google.com/patent/US8356007

Regards
Takayuki Tsunakawa

#49

Fujii Masao

masao.fujii@oss.nttdata.com

about 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#48)

On 2021/01/01 12:14, tsunakawa.takay@fujitsu.com wrote:

Hello,

Fujii-san and I discussed how to move the scale-out development forward. We are both worried that Clock-SI is (highly?) likely to infringe the said Microsoft's patent. So we agreed we are going to investigate the Clock-SI and the patent, and if we have to conclude that we cannot embrace Clock-SI, we will explore other possibilities.

Yes.

IMO, it seems that Clock-SI overlaps with the patent and we can't use it. First, looking back how to interpret the patent document, patent "claims" are what we should pay our greatest attention. According to the following citation from the IP guide by Software Freedom Law Center (SFLC) [1], software infringes a patent if it implements everything of any claim, not all claims.

--------------------------------------------------
4.2 Patent Infringement
To prove that you5 infringe a patent, the patent holder must show that you make, use, offer to sell, or sell the invention as it is defined in at least one claim of the patent.

For software to infringe a patent, the software essentially must implement everything recited in one of the patent�fs claims. It is crucial to recognize that infringement is based directly on the claims of the patent, and not on what is stated or described in other parts of the patent document.
--------------------------------------------------

And, Clock-SI implements at least claims 11 and 20 cited below. It doesn't matter whether Clock-SI uses a physical clock or logical one.

Thanks for sharing the result of your investigation!

Regarding at least claim 11, I reached the same conclusion. As far as
I understand correctly, Clock-SI actually does the method described
at the claim 11 when determing the commit time and doing the commit
on each node.

I don't intend to offend Clock-SI and any activities based on that. OTOH,
I'm now wondering if it's worth considering another approach for global
transaction support, while I'm still interested in Clock-SI technically.

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION

#50

tsunakawa.takay@fujitsu.com

almost 5 years ago

In reply to: Fujii Masao (#49)

RE: Global snapshots

Hello, Andrey-san, all,

Based on the request at HighGo's sharding meeting, I'm re-sending the information on Commitment Ordering that could be used for global visibility. Their patents have already expired.

--------------------------------------------------
Have anyone examined the following Multiversion Commitment Ordering (MVCO)? Although I haven't understood this yet, it insists that no concurrency control information including timestamps needs to be exchanged among the cluster nodes. I'd appreciate it if someone could give an opinion.

--------------------------------------------------
* Or, maybe we can use the following Commitment ordering that doesn't require the timestamp or any other information to be transferred among the cluster nodes. However, this seems to have to track the order of read and write operations among concurrent transactions to ensure the correct commit order, so I'm not sure about the performance. The MVCO paper seems to present the information we need, but I haven't understood it well yet (it's difficult.) Could you anybody kindly interpret this?

Commitment ordering (CO) - yoavraz2
https://sites.google.com/site/yoavraz2/the_principle_of_co

--------------------------------------------------
Could you please try interpreting MVCO and see if we have any hope in this? This doesn't fit in my small brain. I'll catch up with understanding this when I have time.

MVCO - Technical report - IEEE RIDE-IMS 93 (PDF; revised version of DEC-TR 853)
https://sites.google.com/site/yoavraz2/MVCO-WDE.pdf

MVCO is a multiversion member of Commitment Ordering algorithms described below:

Commitment ordering (CO) - yoavraz2
https://sites.google.com/site/yoavraz2/the_principle_of_co

Commitment ordering - Wikipedia
https://en.wikipedia.org/wiki/Commitment_ordering

Related patents are as follows. The last one is MVCO.

US5504900A - Commitment ordering for guaranteeing serializability across distributed transactions
https://patents.google.com/patent/US5504900A/en?oq=US5504900

US5504899A - Guaranteeing global serializability by applying commitment ordering selectively to global transactions
https://patents.google.com/patent/US5504899A/en?oq=US5504899

US5701480A - Distributed multi-version commitment ordering protocols for guaranteeing serializability during transaction processing
https://patents.google.com/patent/US5701480A/en?oq=US5701480

Regards
Takayuki Tsunakawa

#51

Andrey V. Lepikhov

a.lepikhov@postgrespro.ru

almost 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#48)

On 1/1/21 8:14 AM, tsunakawa.takay@fujitsu.com wrote:

--------------------------------------------------
11. A method comprising:
receiving information relating to a distributed database transaction operating on data in data stores associated with respective participating nodes associated with the distributed database transaction;
requesting commit time votes from the respective participating nodes, the commit time votes reflecting local clock values of the respective participating nodes;
receiving the commit time votes from the respective participating nodes in response to the requesting;
computing a global commit timestamp for the distributed database transaction based at least in part on the commit time votes, the global commit timestamp reflecting a maximum value of the commit time votes received from the respective participating nodes; and
synchronizing commitment of the distributed database transaction at the respective participating nodes to the global commit timestamp,
wherein at least the computing is performed by a computing device.

Thank you for this analysis of the patent.
After researching in depth, I think this is the real problem.
My idea was that we are not using real clocks, we only use clock ticks
to measure time intervals. It can also be interpreted as a kind of clock.

That we can do:
1. Use global clocks at the start of transaction.
2. Use CSN-based snapshot as a machinery and create an extension to
allow user defined commit protocols.

--
regards,
Andrey Lepikhov
Postgres Professional

#52

tsunakawa.takay@fujitsu.com

almost 5 years ago

In reply to: Andrey V. Lepikhov (#51)

RE: Global snapshots

From: Andrey V. Lepikhov <a.lepikhov@postgrespro.ru>

After researching in depth, I think this is the real problem.
My idea was that we are not using real clocks, we only use clock ticks to
measure time intervals. It can also be interpreted as a kind of clock.

Yes, patent claims tend to be written to cover broad interpretation. That's too sad.

That we can do:
1. Use global clocks at the start of transaction.
2. Use CSN-based snapshot as a machinery and create an extension to allow
user defined commit protocols.

Is this your suggestion to circumvent the patent? Sorry, I'm afraid I can't understand it yet (I have to study more.) I hope others will comment on this.

Regards
Takayuki Tsunakawa

#53

Andrey V. Lepikhov

a.lepikhov@postgrespro.ru

almost 5 years ago

In reply to: tsunakawa.takay@fujitsu.com (#52)

4 attachment(s)

Current state of the patch set rebased on master, 5aed6a1fc2.

It is development version. Here some problems with visibility still
detected in two tests:
1. CSN Snapshot module - TAP test on time skew.
2. Clock SI implementation - TAP test on emulation of bank transaction.

--
regards,
Andrey Lepikhov
Postgres Professional

Attachments:

0001-CSN-Log.patchtext/x-patch; charset=UTF-8; name=0001-CSN-Log.patchDownload

From 875606471aaae9ee422593da08be40f5db4305dc Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Mon, 22 Mar 2021 15:56:06 +0500
Subject: [PATCH 1/4] CSN Log

---
 src/backend/access/rmgrdesc/Makefile     |   1 +
 src/backend/access/rmgrdesc/csnlogdesc.c |  95 ++++
 src/backend/access/rmgrdesc/xlogdesc.c   |   6 +-
 src/backend/access/transam/Makefile      |   1 +
 src/backend/access/transam/csn_log.c     | 685 +++++++++++++++++++++++
 src/backend/access/transam/rmgr.c        |   1 +
 src/backend/access/transam/varsup.c      |   2 +
 src/backend/access/transam/xlog.c        |  21 +-
 src/backend/commands/vacuum.c            |   3 +-
 src/backend/storage/ipc/ipci.c           |   3 +
 src/backend/storage/ipc/procarray.c      |   3 +
 src/backend/storage/lmgr/lwlock.c        |   2 +
 src/backend/storage/lmgr/lwlocknames.txt |   1 +
 src/backend/storage/sync/sync.c          |   5 +
 src/backend/utils/misc/guc.c             |  18 +
 src/backend/utils/probes.d               |   2 +
 src/bin/initdb/initdb.c                  |   3 +-
 src/bin/pg_controldata/pg_controldata.c  |   2 +
 src/bin/pg_upgrade/pg_upgrade.c          |   5 +
 src/bin/pg_upgrade/pg_upgrade.h          |   2 +
 src/bin/pg_waldump/rmgrdesc.c            |   1 +
 src/include/access/csn_log.h             |  83 +++
 src/include/access/rmgrlist.h            |   1 +
 src/include/access/xlog_internal.h       |   1 +
 src/include/catalog/pg_control.h         |   3 +
 src/include/storage/lwlock.h             |   1 +
 src/include/storage/proc.h               |   1 +
 src/include/storage/sync.h               |   1 +
 src/include/utils/snapshot.h             |  11 +
 src/test/regress/expected/sysviews.out   |   4 +-
 30 files changed, 962 insertions(+), 6 deletions(-)
 create mode 100644 src/backend/access/rmgrdesc/csnlogdesc.c
 create mode 100644 src/backend/access/transam/csn_log.c
 create mode 100644 src/include/access/csn_log.h

diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile
index f88d72fd86..15fc36f7b4 100644
--- a/src/backend/access/rmgrdesc/Makefile
+++ b/src/backend/access/rmgrdesc/Makefile
@@ -11,6 +11,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	brindesc.o \
 	clogdesc.o \
+	csnlogdesc.o \
 	committsdesc.o \
 	dbasedesc.o \
 	genericdesc.o \
diff --git a/src/backend/access/rmgrdesc/csnlogdesc.c b/src/backend/access/rmgrdesc/csnlogdesc.c
new file mode 100644
index 0000000000..0f52b2986d
--- /dev/null
+++ b/src/backend/access/rmgrdesc/csnlogdesc.c
@@ -0,0 +1,95 @@
+/*-------------------------------------------------------------------------
+ *
+ * clogdesc.c
+ *	  rmgr descriptor routines for access/transam/csn_log.c
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/rmgrdesc/csnlogdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_log.h"
+
+
+void
+csnlog_desc(StringInfo buf, XLogReaderState *record)
+{
+	char	   *rec = XLogRecGetData(record);
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	if (info == XLOG_CSN_ZEROPAGE)
+	{
+		int pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		appendStringInfo(buf, "pageno %d", pageno);
+	}
+	else if (info == XLOG_CSN_TRUNCATE)
+	{
+		int pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		appendStringInfo(buf, "pageno %d", pageno);
+	}
+	else if (info == XLOG_CSN_ASSIGNMENT)
+	{
+		CSN csn;
+
+		memcpy(&csn, XLogRecGetData(record), sizeof(CSN));
+		appendStringInfo(buf, "assign "INT64_FORMAT"", csn);
+	}
+	else if (info == XLOG_CSN_SETCSN)
+	{
+		xl_csn_set *xlrec = (xl_csn_set *) rec;
+		int			  nsubxids;
+
+		appendStringInfo(buf, "set "INT64_FORMAT" for: %u",
+						 xlrec->csn,
+						 xlrec->xtop);
+		nsubxids = ((XLogRecGetDataLen(record) - MinSizeOfCSNSet) /
+					sizeof(TransactionId));
+		if (nsubxids > 0)
+		{
+			int			i;
+			TransactionId *subxids;
+
+			subxids = palloc(sizeof(TransactionId) * nsubxids);
+			memcpy(subxids,
+				   XLogRecGetData(record) + MinSizeOfCSNSet,
+				   sizeof(TransactionId) * nsubxids);
+			for (i = 0; i < nsubxids; i++)
+				appendStringInfo(buf, ", %u", subxids[i]);
+			pfree(subxids);
+		}
+	}
+}
+
+const char *
+csnlog_identify(uint8 info)
+{
+	const char *id = NULL;
+
+	switch (info & ~XLR_INFO_MASK)
+	{
+		case XLOG_CSN_ASSIGNMENT:
+			id = "ASSIGNMENT";
+			break;
+		case XLOG_CSN_SETCSN:
+			id = "SETCSN";
+			break;
+		case XLOG_CSN_ZEROPAGE:
+			id = "ZEROPAGE";
+			break;
+		case XLOG_CSN_TRUNCATE:
+			id = "TRUNCATE";
+			break;
+	}
+
+	return id;
+}
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index e6090a9dad..aeba6db37d 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -113,7 +113,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		appendStringInfo(buf, "max_connections=%d max_worker_processes=%d "
 						 "max_wal_senders=%d max_prepared_xacts=%d "
 						 "max_locks_per_xact=%d wal_level=%s "
-						 "wal_log_hints=%s track_commit_timestamp=%s",
+						 "wal_log_hints=%s track_commit_timestamp=%s "
+						 "enable_csn_snapshot=%s",
 						 xlrec.MaxConnections,
 						 xlrec.max_worker_processes,
 						 xlrec.max_wal_senders,
@@ -121,7 +122,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 xlrec.max_locks_per_xact,
 						 wal_level_str,
 						 xlrec.wal_log_hints ? "on" : "off",
-						 xlrec.track_commit_timestamp ? "on" : "off");
+						 xlrec.track_commit_timestamp ? "on" : "off",
+						 xlrec.enable_csn_snapshot ? "on" : "off");
 	}
 	else if (info == XLOG_FPW_CHANGE)
 	{
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 595e02de72..d1196cba87 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -15,6 +15,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	clog.o \
 	commit_ts.o \
+	csn_log.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/csn_log.c b/src/backend/access/transam/csn_log.c
new file mode 100644
index 0000000000..7a6d63e953
--- /dev/null
+++ b/src/backend/access/transam/csn_log.c
@@ -0,0 +1,685 @@
+/*-----------------------------------------------------------------------------
+ *
+ * csn_log.c
+ *		Track commit sequence numbers of finished transactions
+ *
+ * This module provides SLRU to store CSN for each transaction.  This
+ * mapping need to be kept only for xid's greater then oldestXid, but
+ * that can require arbitrary large amounts of memory in case of long-lived
+ * transactions.  Because of same lifetime and persistancy requirements
+ * this module is quite similar to subtrans.c
+ *
+ * If we switch database from CSN-base snapshot to xid-base snapshot then,
+ * nothing wrong. But if we switch xid-base snapshot to CSN-base snapshot
+ * it should decide a new xid whwich begin csn-base check. It can not be
+ * oldestActiveXID because of prepared transaction.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csn_log.c
+ *
+ *-----------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_log.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xlog_internal.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "storage/spin.h"
+#include "storage/shmem.h"
+#include "utils/snapmgr.h"
+
+
+/*
+ * We use csnActive to judge if csn snapshot enabled instead of by
+ * enable_csn_snapshot, this design is similar to 'track_commit_timestamp'.
+ *
+ * Because in process of replication if master change 'enable_csn_snapshot'
+ * in a database restart, standby should apply wal record for GUC changed,
+ * then it's difficult to notice all backends about that. So they can get
+ * the message by 'csnActive' which in share buffer. It will not
+ * acquire a lock, so without performance issue.
+ *
+ */
+typedef struct CSNShared
+{
+	bool				csnActive;
+	CSN					last_csn_log_wal; /* for interval we log the assign csn to wal */
+	CSN					last_max_csn; /* Record the max csn till now */
+	TransactionId		xmin_for_csn; /*'xmin_for_csn' for when turn xid-snapshot to csn-snapshot*/
+	volatile slock_t	lock;
+} CSNShared;
+
+
+static CSNShared *csnShared;
+
+bool enable_csn_snapshot;
+bool enable_csn_wal;
+
+/*
+ * Defines for CSNLog page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * CSNLog page numbering also wraps around at
+ * 0xFFFFFFFF/CSN_LOG_XACTS_PER_PAGE, and CSNLog segment numbering at
+ * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateCSNLog (see CSNLogPagePrecedes).
+ */
+
+/* We store the commit CSN for each xid */
+#define CSN_LOG_XACTS_PER_PAGE (BLCKSZ / sizeof(CSN))
+
+#define TransactionIdToPage(xid)	((xid) / (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+
+/* Link to shared-memory data structures for CLOG control */
+static SlruCtlData CSNLogCtlData;
+#define CsnlogCtl (&CSNLogCtlData)
+
+
+static int	ZeroCSNLogPage(int pageno, bool write_xlog);
+static void ZeroTruncateCSNLogPage(int pageno, bool write_xlog);
+static bool CSNLogPagePrecedes(int page1, int page2);
+static void CSNLogSetPageStatus(TransactionId xid, int nsubxids,
+									  TransactionId *subxids,
+									  CSN csn, int pageno);
+static void CSNLogSetCSNInSlot(TransactionId xid, CSN csn, int slotno);
+
+static void WriteCSNXlogRec(TransactionId xid, int nsubxids,
+							TransactionId *subxids, CSN csn);
+static void WriteZeroCSNPageXlogRec(int pageno);
+static void WriteTruncateCSNXlogRec(int pageno);
+static void set_last_log_wal_csn(CSN csn);
+static CSN get_last_log_wal_csn(void);
+
+
+/*
+ * CSNLogSetCSN
+ *
+ * Record CSN of transaction and its subtransaction tree.
+ *
+ * xid is a single xid to set status for. This will typically be the top level
+ * transactionid for a top level commit or abort. It can also be a
+ * subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * csn is the commit sequence number of the transaction. It should be
+ * AbortedCSN for abort cases.
+ */
+void
+CSNLogSetCSN(TransactionId xid, int nsubxids, TransactionId *subxids, CSN csn,
+			 bool write_xlog)
+{
+	int pageno;
+	int i = 0;
+	int offset = 0;
+
+	Assert(TransactionIdIsValid(xid));
+
+	pageno = TransactionIdToPage(xid);		/* get page of parent */
+
+	if(write_xlog)
+		WriteCSNXlogRec(xid, nsubxids, subxids, csn);
+
+	for (;;)
+	{
+		int num_on_page = 0;
+
+		while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno)
+		{
+			num_on_page++;
+			i++;
+		}
+
+		CSNLogSetPageStatus(xid,
+							num_on_page, subxids + offset,
+							csn, pageno);
+		if (i >= nsubxids)
+			break;
+
+		offset = i;
+		pageno = TransactionIdToPage(subxids[offset]);
+		xid = InvalidTransactionId;
+	}
+}
+
+/*
+ * Record the final state of transaction entries in the csn log for
+ * all entries on a single page. Atomic only on this page.
+ *
+ * Otherwise API is same as TransactionIdSetTreeStatus()
+ */
+static void
+CSNLogSetPageStatus(TransactionId xid, int nsubxids,
+						   TransactionId *subxids,
+						   CSN csn, int pageno)
+{
+	int slotno;
+	int i;
+
+	LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+	slotno = SimpleLruReadPage(CsnlogCtl, pageno, true, xid);
+
+	/* Subtransactions first, if needed ... */
+	for (i = 0; i < nsubxids; i++)
+	{
+		Assert(CsnlogCtl->shared->page_number[slotno] ==
+											TransactionIdToPage(subxids[i]));
+		CSNLogSetCSNInSlot(subxids[i], csn, slotno);
+	}
+
+	/* ... then the main transaction */
+	if (TransactionIdIsValid(xid))
+		CSNLogSetCSNInSlot(xid, csn, slotno);
+
+	CsnlogCtl->shared->page_dirty[slotno] = true;
+
+	LWLockRelease(CSNLogControlLock);
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ */
+static void
+CSNLogSetCSNInSlot(TransactionId xid, CSN csn, int slotno)
+{
+	int entryno = TransactionIdToPgIndex(xid);
+	CSN *ptr;
+
+	Assert(LWLockHeldByMe(CSNLogControlLock));
+
+	ptr = (CSN *) (CsnlogCtl->shared->page_buffer[slotno] +
+														entryno * sizeof(CSN));
+
+	*ptr = csn;
+}
+
+/*
+ * Interrogate the state of a transaction in the log.
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; TransactionIdGetCSN() in csn_snapshot.c is the
+ * intended caller.
+ */
+CSN
+CSNLogGetCSNByXid(TransactionId xid)
+{
+	int pageno = TransactionIdToPage(xid);
+	int entryno = TransactionIdToPgIndex(xid);
+	int slotno;
+	CSN csn;
+
+	/* lock is acquired by SimpleLruReadPage_ReadOnly */
+	slotno = SimpleLruReadPage_ReadOnly(CsnlogCtl, pageno, xid);
+	csn = *(CSN *) (CsnlogCtl->shared->page_buffer[slotno] +
+														entryno * sizeof(CSN));
+
+	LWLockRelease(CSNLogControlLock);
+
+	return csn;
+}
+
+/*
+ * Number of shared CSNLog buffers.
+ */
+static Size
+CSNLogShmemBuffers(void)
+{
+	return Min(32, Max(4, NBuffers / 512));
+}
+
+/*
+ * Reserve shared memory for CsnlogCtl.
+ */
+Size
+CSNLogShmemSize(void)
+{
+	return SimpleLruShmemSize(CSNLogShmemBuffers(), 0);
+}
+
+/*
+ * Initialization of shared memory for CSNLog.
+ */
+void
+CSNLogShmemInit(void)
+{
+	bool found;
+
+	CsnlogCtl->PagePrecedes = CSNLogPagePrecedes;
+	SimpleLruInit(CsnlogCtl, "CSNLog Ctl", CSNLogShmemBuffers(), 0,
+				  CSNLogControlLock, "pg_csn", LWTRANCHE_CSN_LOG_BUFFERS,
+				  SYNC_HANDLER_CSN_LOG);
+
+	csnShared = ShmemInitStruct("CSNlog shared",
+								sizeof(CSNShared),
+								&found);
+	if (!found)
+	{
+		set_last_max_csn(InvalidCSN, true);
+		csnShared->last_csn_log_wal = InvalidCSN;
+		csnShared->xmin_for_csn = InvalidTransactionId;
+		csnShared->csnActive = false;
+		SpinLockInit(&csnShared->lock);
+	}
+}
+
+/*
+ * Initialize (or reinitialize) a page of CSNLog to zeroes.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroCSNLogPage(int pageno, bool write_xlog)
+{
+	int slotno;
+
+	Assert(LWLockHeldByMe(CSNLogControlLock));
+
+	slotno = SimpleLruZeroPage(CsnlogCtl, pageno);
+
+	if(write_xlog)
+		WriteZeroCSNPageXlogRec(pageno);
+
+	return slotno;
+}
+
+static void
+ZeroTruncateCSNLogPage(int pageno, bool write_xlog)
+{
+	if(write_xlog)
+		WriteTruncateCSNXlogRec(pageno);
+
+	SimpleLruTruncate(CsnlogCtl, pageno);
+}
+
+void
+ActivateCSNlog(void)
+{
+	int				startPage;
+	TransactionId	nextXid = InvalidTransactionId;
+
+	if (csnShared->csnActive)
+		return;
+
+	nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+	startPage = TransactionIdToPage(nextXid);
+
+	/* Create the current segment file, if necessary */
+	if (!SimpleLruDoesPhysicalPageExist(CsnlogCtl, startPage))
+	{
+		int			slotno;
+		LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+		slotno = ZeroCSNLogPage(startPage, false);
+		SimpleLruWritePage(CsnlogCtl, slotno);
+		LWLockRelease(CSNLogControlLock);
+	}
+	csnShared->csnActive = true;
+}
+
+bool
+get_csnlog_status(void)
+{
+	if(!csnShared)
+	{
+		/* Should not arrived */
+		elog(ERROR, "We do not have csnShared point");
+	}
+	return csnShared->csnActive;
+}
+
+void
+DeactivateCSNlog(void)
+{
+	csnShared->csnActive = false;
+	LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+	(void) SlruScanDirectory(CsnlogCtl, SlruScanDirCbDeleteAll, NULL);
+	LWLockRelease(CSNLogControlLock);
+}
+
+void
+StartupCSN(void)
+{
+	ActivateCSNlog();
+}
+
+void
+CompleteCSNInitialization(void)
+{
+	/*
+	 * If the feature is not enabled, turn it off for good.  This also removes
+	 * any leftover data.
+	 *
+	 * Conversely, we activate the module if the feature is enabled.  This is
+	 * necessary for primary and standby as the activation depends on the
+	 * control file contents at the beginning of recovery or when a
+	 * XLOG_PARAMETER_CHANGE is replayed.
+	 */
+	if (!get_csnlog_status())
+		DeactivateCSNlog();
+	else
+		ActivateCSNlog();
+}
+
+void
+CSNlogParameterChange(bool newvalue, bool oldvalue)
+{
+	if (newvalue)
+	{
+		if (!csnShared->csnActive)
+			ActivateCSNlog();
+	}
+	else if (csnShared->csnActive)
+		DeactivateCSNlog();
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointCSNLog(void)
+{
+	if (!get_csnlog_status())
+		return;
+
+	/*
+	 * Flush dirty CSNLog pages to disk.
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely to improve the odds that writing of dirty pages is done by
+	 * the checkpoint process and not by backends.
+	 */
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(true);
+	SimpleLruWriteAll(CsnlogCtl, true);
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(true);
+}
+
+/*
+ * Make sure that CSNLog has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty clog or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendCSNLog(TransactionId newestXact)
+{
+	int			pageno;
+
+	if (!get_csnlog_status())
+		return;
+
+	/*
+	 * No work except at first XID of a page.  But beware: just after
+	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
+	 */
+	if (TransactionIdToPgIndex(newestXact) != 0 &&
+		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
+		return;
+
+	pageno = TransactionIdToPage(newestXact);
+
+	LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+	/* Zero the page and make an XLOG entry about it */
+	ZeroCSNLogPage(pageno, !InRecovery);
+
+	LWLockRelease(CSNLogControlLock);
+}
+
+/*
+ * Remove all CSNLog segments before the one holding the passed
+ * transaction ID.
+ *
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest TransactionXmin of any running transaction.
+ */
+void
+TruncateCSNLog(TransactionId oldestXact)
+{
+	int			cutoffPage;
+
+	if (!get_csnlog_status())
+		return;
+
+	/*
+	 * The cutoff point is the start of the segment containing oldestXact. We
+	 * pass the *page* containing oldestXact to SimpleLruTruncate. We step
+	 * back one transaction to avoid passing a cutoff page that hasn't been
+	 * created yet in the rare case that oldestXact would be the first item on
+	 * a page and oldestXact == next XID.  In that case, if we didn't subtract
+	 * one, we'd trigger SimpleLruTruncate's wraparound detection.
+	 */
+	TransactionIdRetreat(oldestXact);
+	cutoffPage = TransactionIdToPage(oldestXact);
+	ZeroTruncateCSNLogPage(cutoffPage, true);
+}
+
+/*
+ * Decide which of two CSNLog page numbers is "older" for truncation
+ * purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */
+static bool
+CSNLogPagePrecedes(int page1, int page2)
+{
+	TransactionId xid1;
+	TransactionId xid2;
+
+	xid1 = ((TransactionId) page1) * CSN_LOG_XACTS_PER_PAGE;
+	xid1 += FirstNormalTransactionId;
+	xid2 = ((TransactionId) page2) * CSN_LOG_XACTS_PER_PAGE;
+	xid2 += FirstNormalTransactionId;
+
+	return TransactionIdPrecedes(xid1, xid2);
+}
+
+void
+WriteAssignCSNXlogRec(CSN csn)
+{
+	CSN log_csn = 0;
+
+	if (!enable_csn_wal || csn <= get_last_log_wal_csn())
+		return;
+
+	/*
+	 * We log the CSN 5s greater than generated, you can see comments on
+	 * CSN_ASSIGN_TIME_INTERVAL define.
+	 */
+	log_csn = CSNAddByNanosec(csn, CSN_ASSIGN_TIME_INTERVAL);
+	set_last_log_wal_csn(log_csn);
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&log_csn), sizeof(CSN));
+	XLogInsert(RM_CSNLOG_ID, XLOG_CSN_ASSIGNMENT);
+}
+
+static void
+WriteCSNXlogRec(TransactionId xid, int nsubxids,
+				TransactionId *subxids, CSN csn)
+{
+	xl_csn_set xlrec;
+
+	if (!enable_csn_wal)
+		return;
+
+	xlrec.xtop = xid;
+	xlrec.nsubxacts = nsubxids;
+	xlrec.csn = csn;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, MinSizeOfCSNSet);
+	XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId));
+	(void) XLogInsert(RM_CSNLOG_ID, XLOG_CSN_SETCSN);
+}
+
+/*
+ * Write a ZEROPAGE xlog record
+ */
+static void
+WriteZeroCSNPageXlogRec(int pageno)
+{
+	if (!enable_csn_wal)
+		return;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&pageno), sizeof(int));
+	(void) XLogInsert(RM_CSNLOG_ID, XLOG_CSN_ZEROPAGE);
+}
+
+/*
+ * Write a TRUNCATE xlog record
+ */
+static void
+WriteTruncateCSNXlogRec(int pageno)
+{
+	if (!enable_csn_wal)
+		return;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&pageno), sizeof(int));
+	(void) XLogInsert(RM_CSNLOG_ID, XLOG_CSN_TRUNCATE);
+}
+
+void
+csnlog_redo(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	/* Backup blocks are not used in csnlog records */
+	Assert(!XLogRecHasAnyBlockRefs(record));
+
+	if (info == XLOG_CSN_ASSIGNMENT)
+	{
+		CSN csn;
+
+		memcpy(&csn, XLogRecGetData(record), sizeof(CSN));
+		set_last_max_csn(csn, true);
+	}
+	else if (info == XLOG_CSN_SETCSN)
+	{
+		xl_csn_set *xlrec = (xl_csn_set *) XLogRecGetData(record);
+		CSNLogSetCSN(xlrec->xtop, xlrec->nsubxacts, xlrec->xsub, xlrec->csn, false);
+	}
+	else if (info == XLOG_CSN_ZEROPAGE)
+	{
+		int			pageno;
+		int			slotno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+		slotno = ZeroCSNLogPage(pageno, false);
+		SimpleLruWritePage(CsnlogCtl, slotno);
+		LWLockRelease(CSNLogControlLock);
+		Assert(!CsnlogCtl->shared->page_dirty[slotno]);
+
+	}
+	else if (info == XLOG_CSN_TRUNCATE)
+	{
+		int			pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		CsnlogCtl->shared->latest_page_number = pageno;
+		ZeroTruncateCSNLogPage(pageno, false);
+	}
+	else
+		elog(PANIC, "csnlog_redo: unknown op code %u", info);
+}
+
+static void
+set_last_log_wal_csn(CSN csn)
+{
+	SpinLockAcquire(&csnShared->lock);
+	csnShared->last_csn_log_wal = csn;
+	SpinLockRelease(&csnShared->lock);
+}
+
+static CSN
+get_last_log_wal_csn(void)
+{
+	CSN csn;
+
+	SpinLockAcquire(&csnShared->lock);
+	csn = csnShared->last_csn_log_wal;
+	SpinLockRelease(&csnShared->lock);
+	return csn;
+}
+
+CSN
+set_last_max_csn(CSN csn, bool force)
+{
+	SpinLockAcquire(&csnShared->lock);
+	if (csn <= csnShared->last_max_csn && !force)
+		csn = csnShared->last_max_csn + 1;
+
+	csnShared->last_max_csn = csn;
+
+	SpinLockRelease(&csnShared->lock);
+	return csn;
+}
+
+CSN
+get_last_max_csn(void)
+{
+	CSN csn;
+
+	SpinLockAcquire(&csnShared->lock);
+	csn = csnShared->last_max_csn;
+	SpinLockRelease(&csnShared->lock);
+	return csn;
+}
+
+CSN
+get_xmin_for_csn(void)
+{
+	CSN csn;
+
+	SpinLockAcquire(&csnShared->lock);
+	csn = csnShared->xmin_for_csn;
+	SpinLockRelease(&csnShared->lock);
+	return csn;
+}
+
+void
+prepare_csn_env(bool enable)
+{
+	if (enable)
+	{
+		TransactionId nextxid =
+						XidFromFullTransactionId(ShmemVariableCache->nextXid);
+		/* 'xmin_for_csn' for when turn xid-snapshot to csn-snapshot */
+		csnShared->xmin_for_csn = nextxid;
+		/* produce the csnlog segment we want now and seek to current page */
+		ActivateCSNlog();
+	}
+	else
+		/* Try to drop all csnlog seg */
+		DeactivateCSNlog();
+}
+
+/*
+ * Entrypoint for sync.c to sync csnlog files.
+ */
+int
+csnlogsyncfiletag(const FileTag *ftag, char *path)
+{
+	return SlruSyncFileTag(CsnlogCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 58091f6b52..b1e5ec350e 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -28,6 +28,7 @@
 #include "replication/origin.h"
 #include "storage/standby.h"
 #include "utils/relmapper.h"
+#include "access/csn_log.h"
 
 /* must be kept in sync with RmgrData definition in xlog_internal.h */
 #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 142da4aaff..5791a2e66e 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -15,6 +15,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -175,6 +176,7 @@ GetNewTransactionId(bool isSubXact)
 	 * Extend pg_subtrans and pg_commit_ts too.
 	 */
 	ExtendCLOG(xid);
+	ExtendCSNLog(xid);
 	ExtendCommitTs(xid);
 	ExtendSUBTRANS(xid);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6f8810e149..46f016dce7 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -24,6 +24,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/heaptoast.h"
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
@@ -4649,6 +4650,7 @@ InitControlFile(uint64 sysidentifier)
 	ControlFile->wal_level = wal_level;
 	ControlFile->wal_log_hints = wal_log_hints;
 	ControlFile->track_commit_timestamp = track_commit_timestamp;
+	ControlFile->enable_csn_snapshot = enable_csn_snapshot;
 	ControlFile->data_checksum_version = bootstrap_data_checksum_version;
 }
 
@@ -6910,6 +6912,7 @@ StartupXLOG(void)
 	SetCommitTsLimit(checkPoint.oldestCommitTsXid,
 					 checkPoint.newestCommitTsXid);
 	XLogCtl->ckptFullXid = checkPoint.nextXid;
+	set_last_max_csn(checkPoint.oldestUsedCSN, true);
 
 	/*
 	 * Initialize replication slots, before there's a chance to remove
@@ -6945,6 +6948,9 @@ StartupXLOG(void)
 	if (ControlFile->track_commit_timestamp)
 		StartupCommitTs();
 
+	if(ControlFile->enable_csn_snapshot)
+		StartupCSN();
+
 	/*
 	 * Recover knowledge about replay progress of known replication partners.
 	 */
@@ -8062,6 +8068,7 @@ StartupXLOG(void)
 	 * commit timestamp.
 	 */
 	CompleteCommitTsInitialization();
+	CompleteCSNInitialization();
 
 	/*
 	 * All done with end-of-recovery actions.
@@ -8981,6 +8988,8 @@ CreateCheckPoint(int flags)
 	 */
 	last_important_lsn = GetLastImportantRecPtr();
 
+	checkPoint.oldestUsedCSN = get_last_max_csn();
+
 	/*
 	 * We must block concurrent insertions while examining insert state to
 	 * determine the checkpoint REDO pointer.
@@ -9370,6 +9379,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
 	CheckPointCLOG();
+	CheckPointCSNLog();
 	CheckPointCommitTs();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
@@ -9922,7 +9932,8 @@ XLogReportParameters(void)
 		max_wal_senders != ControlFile->max_wal_senders ||
 		max_prepared_xacts != ControlFile->max_prepared_xacts ||
 		max_locks_per_xact != ControlFile->max_locks_per_xact ||
-		track_commit_timestamp != ControlFile->track_commit_timestamp)
+		track_commit_timestamp != ControlFile->track_commit_timestamp ||
+		enable_csn_snapshot != ControlFile->enable_csn_snapshot)
 	{
 		/*
 		 * The change in number of backend slots doesn't need to be WAL-logged
@@ -9944,6 +9955,7 @@ XLogReportParameters(void)
 			xlrec.wal_level = wal_level;
 			xlrec.wal_log_hints = wal_log_hints;
 			xlrec.track_commit_timestamp = track_commit_timestamp;
+			xlrec.enable_csn_snapshot = enable_csn_snapshot;
 
 			XLogBeginInsert();
 			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
@@ -9954,6 +9966,9 @@ XLogReportParameters(void)
 
 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
+		if (enable_csn_snapshot != ControlFile->enable_csn_snapshot)
+			prepare_csn_env(enable_csn_snapshot);
+
 		ControlFile->MaxConnections = MaxConnections;
 		ControlFile->max_worker_processes = max_worker_processes;
 		ControlFile->max_wal_senders = max_wal_senders;
@@ -9962,6 +9977,7 @@ XLogReportParameters(void)
 		ControlFile->wal_level = wal_level;
 		ControlFile->wal_log_hints = wal_log_hints;
 		ControlFile->track_commit_timestamp = track_commit_timestamp;
+		ControlFile->enable_csn_snapshot = enable_csn_snapshot;
 		UpdateControlFile();
 
 		LWLockRelease(ControlFileLock);
@@ -10393,6 +10409,9 @@ xlog_redo(XLogReaderState *record)
 		CommitTsParameterChange(xlrec.track_commit_timestamp,
 								ControlFile->track_commit_timestamp);
 		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
+		CSNlogParameterChange(xlrec.enable_csn_snapshot,
+							  ControlFile->enable_csn_snapshot);
+		ControlFile->enable_csn_snapshot = xlrec.enable_csn_snapshot;
 
 		UpdateControlFile();
 		LWLockRelease(ControlFileLock);
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index c064352e23..515f2b3eb9 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -53,7 +53,7 @@
 #include "utils/memutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
-
+#include "access/csn_log.h"
 
 /*
  * GUC parameters
@@ -1688,6 +1688,7 @@ vac_truncate_clog(TransactionId frozenXID,
 	 */
 	TruncateCLOG(frozenXID, oldestxid_datoid);
 	TruncateCommitTs(frozenXID);
+	TruncateCSNLog(frozenXID);
 	TruncateMultiXact(minMulti, minmulti_datoid);
 
 	/*
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 3e4ec53a97..e06bc13841 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -16,6 +16,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -127,6 +128,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, ProcGlobalShmemSize());
 		size = add_size(size, XLOGShmemSize());
 		size = add_size(size, CLOGShmemSize());
+		size = add_size(size, CSNLogShmemSize());
 		size = add_size(size, CommitTsShmemSize());
 		size = add_size(size, SUBTRANSShmemSize());
 		size = add_size(size, TwoPhaseShmemSize());
@@ -218,6 +220,7 @@ CreateSharedMemoryAndSemaphores(void)
 	 */
 	XLOGShmemInit();
 	CLOGShmemInit();
+	CSNLogShmemInit();
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 4fc6ffb917..4a200b0815 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -48,6 +48,7 @@
 #include <signal.h>
 
 #include "access/clog.h"
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -1170,6 +1171,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
 	{
 		ExtendSUBTRANS(latestObservedXid);
+		ExtendCSNLog(latestObservedXid);
 		TransactionIdAdvance(latestObservedXid);
 	}
 	TransactionIdRetreat(latestObservedXid);	/* = running->nextXid - 1 */
@@ -4320,6 +4322,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
 		while (TransactionIdPrecedes(next_expected_xid, xid))
 		{
 			TransactionIdAdvance(next_expected_xid);
+			ExtendCSNLog(next_expected_xid);
 			ExtendSUBTRANS(next_expected_xid);
 		}
 		Assert(next_expected_xid == xid);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 975d547f34..3489e51ef3 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -134,6 +134,8 @@ static const char *const BuiltinTrancheNames[] = {
 	"CommitTSBuffer",
 	/* LWTRANCHE_SUBTRANS_BUFFER: */
 	"SubtransBuffer",
+	/* LWTRANCHE_CSN_LOG_BUFFERS */
+	"CSNLogBuffer",
 	/* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
 	"MultiXactOffsetBuffer",
 	/* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c295..56196b2d0d 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,4 @@ XactTruncationLock					44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
+CSNLogControlLock				    48
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index 708215614d..cbb2fd510b 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -20,6 +20,7 @@
 
 #include "access/commit_ts.h"
 #include "access/clog.h"
+#include "access/csn_log.h"
 #include "access/multixact.h"
 #include "access/xlog.h"
 #include "access/xlogutils.h"
@@ -118,6 +119,10 @@ static const SyncOps syncsw[] = {
 	/* pg_multixact/members */
 	[SYNC_HANDLER_MULTIXACT_MEMBER] = {
 		.sync_syncfiletag = multixactmemberssyncfiletag
+	},
+	/* CSN_Log */
+	[SYNC_HANDLER_CSN_LOG] = {
+		.sync_syncfiletag = csnlogsyncfiletag
 	}
 };
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3b36a31a47..14063bcf9e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1170,6 +1170,24 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_csn_snapshot", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable csn-base snapshot."),
+			gettext_noop("Used to achieve REPEATEBLE READ isolation level for postgres_fdw transactions.")
+		},
+		&enable_csn_snapshot,
+		false,
+		NULL, NULL, NULL
+	},
+	{
+		{"enable_csn_wal", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable csn-wal record."),
+			gettext_noop("Used to enable csn-wal record")
+		},
+		&enable_csn_wal,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"ssl", PGC_SIGHUP, CONN_AUTH_SSL,
 			gettext_noop("Enables SSL connections."),
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d
index b0c50a3c7f..3fcd0f4ccf 100644
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -77,6 +77,8 @@ provider postgresql {
 	probe clog__checkpoint__done(bool);
 	probe subtrans__checkpoint__start(bool);
 	probe subtrans__checkpoint__done(bool);
+	probe csnlog__checkpoint__start(bool);
+	probe csnlog__checkpoint__done(bool);
 	probe multixact__checkpoint__start(bool);
 	probe multixact__checkpoint__done(bool);
 	probe twophase__checkpoint__start();
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 3c1cf78b4f..6b3b9ff504 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -223,7 +223,8 @@ static const char *const subdirs[] = {
 	"pg_xact",
 	"pg_logical",
 	"pg_logical/snapshots",
-	"pg_logical/mappings"
+	"pg_logical/mappings",
+	"pg_csn"
 };
 
 
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index f911f98d94..325e6a0e2b 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -300,6 +300,8 @@ main(int argc, char *argv[])
 		   ControlFile->max_locks_per_xact);
 	printf(_("track_commit_timestamp setting:       %s\n"),
 		   ControlFile->track_commit_timestamp ? _("on") : _("off"));
+	printf(_("enable_csn_snapshot setting:    	    %s\n"),
+		   ControlFile->enable_csn_snapshot ? 	 _("on") : _("off"));
 	printf(_("Maximum data alignment:               %u\n"),
 		   ControlFile->maxAlign);
 	/* we don't print floatFormat since can't say much useful about it */
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index e23b8ca88d..ed9112ed40 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -541,6 +541,11 @@ copy_xact_xlog_xid(void)
 		check_ok();
 	}
 
+	if(old_cluster.controldata.cat_ver > CSN_BASE_SNAPSHOT_ADD_VER)
+	{
+		copy_subdir_files("pg_csn", "pg_csn");
+	}
+
 	/* now reset the wal archives in the new cluster */
 	prep_status("Resetting WAL archives");
 	exec_prog(UTILITY_LOG_FILE, NULL, true, true,
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 919a7849fd..77f5a030ae 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -123,6 +123,8 @@ extern char *output_files[];
  */
 #define JSONB_FORMAT_CHANGE_CAT_VER 201409291
 
+#define	CSN_BASE_SNAPSHOT_ADD_VER	202002010
+
 
 /*
  * Each relation is represented by a relinfo structure.
diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c
index 852d8ca4b1..2d280ce940 100644
--- a/src/bin/pg_waldump/rmgrdesc.c
+++ b/src/bin/pg_waldump/rmgrdesc.c
@@ -11,6 +11,7 @@
 #include "access/brin_xlog.h"
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/generic_xlog.h"
 #include "access/ginxlog.h"
 #include "access/gistxlog.h"
diff --git a/src/include/access/csn_log.h b/src/include/access/csn_log.h
new file mode 100644
index 0000000000..01fb0daad4
--- /dev/null
+++ b/src/include/access/csn_log.h
@@ -0,0 +1,83 @@
+/*
+ * csn_log.h
+ *
+ * Commit-Sequence-Number log.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csn_log.h
+ */
+#ifndef CSNLOG_H
+#define CSNLOG_H
+
+#include "access/xlog.h"
+#include "storage/sync.h"
+#include "utils/snapshot.h"
+
+
+typedef struct xl_csn_set
+{
+	CSN				csn;
+	TransactionId	xtop;			/* XID's top-level XID */
+	int				nsubxacts;		/* number of subtransaction XIDs */
+	TransactionId	xsub[FLEXIBLE_ARRAY_MEMBER];	/* assigned subxids */
+} xl_csn_set;
+
+
+/* XLOG stuff */
+#define XLOG_CSN_ASSIGNMENT	0x00
+#define XLOG_CSN_SETCSN		0x10
+#define XLOG_CSN_ZEROPAGE	0x20
+#define XLOG_CSN_TRUNCATE	0x30
+
+/*
+ * We should log MAX generated CSN to wal, so that database will not generate
+ * a historical CSN after database restart. This may appear when system time
+ * turned back.
+ *
+ * However we can not log the MAX CSN every time it generated, if so it will
+ * cause too many wal expend, so we log it 5s more in the future.
+ *
+ * As a trade off, when this database restart, there will be 5s bad performance
+ * for time synchronization among postgres instances.
+ *
+ * It looks like we can redefine this as a configure parameter, and the user
+ * can decide which way they prefer.
+ *
+ */
+#define	CSN_ASSIGN_TIME_INTERVAL	5
+
+#define MinSizeOfCSNSet offsetof(xl_csn_set, xsub)
+#define	CSNAddByNanosec(csn,second) (csn + second * 1000000000L)
+
+
+extern void CSNLogSetCSN(TransactionId xid, int nsubxids,
+						 TransactionId *subxids, CSN csn, bool write_xlog);
+extern CSN CSNLogGetCSNByXid(TransactionId xid);
+
+extern Size CSNLogShmemSize(void);
+extern void CSNLogShmemInit(void);
+extern void ShutdownCSNLog(void);
+extern void CheckPointCSNLog(void);
+extern void ExtendCSNLog(TransactionId newestXact);
+extern void TruncateCSNLog(TransactionId oldestXact);
+extern int csnlogsyncfiletag(const FileTag *ftag, char *path);
+
+extern void csnlog_redo(XLogReaderState *record);
+extern void csnlog_desc(StringInfo buf, XLogReaderState *record);
+extern const char *csnlog_identify(uint8 info);
+extern void WriteAssignCSNXlogRec(CSN csn);
+extern CSN set_last_max_csn(CSN csn, bool force);
+extern CSN get_last_max_csn(void);
+extern CSN get_xmin_for_csn(void);
+extern void prepare_csn_env(bool enable_csn_snapshot);
+extern void CatchCSNLog(void);
+extern void ActivateCSNlog(void);
+extern void DeactivateCSNlog(void);
+extern void StartupCSN(void);
+extern void CompleteCSNInitialization(void);
+extern void CSNlogParameterChange(bool newvalue, bool oldvalue);
+extern bool get_csnlog_status(void);
+
+#endif   /* CSNLOG_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index f582cf535f..3cf0775176 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i
 PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
 PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
 PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
+PG_RMGR(RM_CSNLOG_ID, "CSN", csnlog_redo, csnlog_desc, csnlog_identify, NULL, NULL, NULL)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index b23e286406..d9690a23bb 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -240,6 +240,7 @@ typedef struct xl_parameter_change
 	int			wal_level;
 	bool		wal_log_hints;
 	bool		track_commit_timestamp;
+	bool		enable_csn_snapshot;
 } xl_parameter_change;
 
 /* logs restore point */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index e3f48158ce..194d100ed4 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -61,6 +61,8 @@ typedef struct CheckPoint
 	 * set to InvalidTransactionId.
 	 */
 	TransactionId oldestActiveXid;
+
+	uint64 oldestUsedCSN;
 } CheckPoint;
 
 /* XLOG info values for XLOG rmgr */
@@ -181,6 +183,7 @@ typedef struct ControlFileData
 	int			max_prepared_xacts;
 	int			max_locks_per_xact;
 	bool		track_commit_timestamp;
+	bool		enable_csn_snapshot;
 
 	/*
 	 * This data is used to check for hardware-architecture compatibility of
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index a8f052e484..65d1e49fb2 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -168,6 +168,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS,
 	LWTRANCHE_COMMITTS_BUFFER,
 	LWTRANCHE_SUBTRANS_BUFFER,
+	LWTRANCHE_CSN_LOG_BUFFERS,
 	LWTRANCHE_MULTIXACTOFFSET_BUFFER,
 	LWTRANCHE_MULTIXACTMEMBER_BUFFER,
 	LWTRANCHE_NOTIFY_BUFFER,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 2fd1ff09a7..f0e920f462 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -17,6 +17,7 @@
 #include "access/clog.h"
 #include "access/xlogdefs.h"
 #include "lib/ilist.h"
+#include "utils/snapshot.h"
 #include "storage/latch.h"
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index fbdf34f762..e49dd48fe8 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -39,6 +39,7 @@ typedef enum SyncRequestHandler
 	SYNC_HANDLER_COMMIT_TS,
 	SYNC_HANDLER_MULTIXACT_OFFSET,
 	SYNC_HANDLER_MULTIXACT_MEMBER,
+	SYNC_HANDLER_CSN_LOG,
 	SYNC_HANDLER_NONE
 } SyncRequestHandler;
 
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 6b60755c53..92575a207f 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -121,6 +121,11 @@ typedef enum SnapshotType
 typedef struct SnapshotData *Snapshot;
 
 #define InvalidSnapshot		((Snapshot) NULL)
+#define InvalidCSN			((CSN) 0)
+typedef uint64 CSN;
+
+extern bool enable_csn_snapshot;
+extern bool enable_csn_wal;
 
 /*
  * Struct representing all kind of possible snapshots.
@@ -208,6 +213,12 @@ typedef struct SnapshotData
 	TimestampTz whenTaken;		/* timestamp when snapshot was taken */
 	XLogRecPtr	lsn;			/* position in the WAL stream when taken */
 
+	/*
+	 * CSN for snapshot isolation support.
+	 * Will be used only if enable_csn_snapshot is enabled.
+	 */
+	CSN	csn;
+
 	/*
 	 * The transaction completion count at the time GetSnapshotData() built
 	 * this snapshot. Allows to avoid re-computing static snapshots when no
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a62bf5dc92..e21403028d 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -96,6 +96,8 @@ select name, setting from pg_settings where name like 'enable%';
               name              | setting 
 --------------------------------+---------
  enable_bitmapscan              | on
+ enable_csn_snapshot            | off
+ enable_csn_wal                 | on
  enable_gathermerge             | on
  enable_hashagg                 | on
  enable_hashjoin                | on
@@ -114,7 +116,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(19 rows)
+(21 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
-- 
2.25.1

0002-CSN-Snapshot.patchtext/x-patch; charset=UTF-8; name=0002-CSN-Snapshot.patchDownload

From 22d80716f06e1b6000f788b5b96da70c0d7aa581 Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Mon, 22 Mar 2021 16:36:38 +0500
Subject: [PATCH 2/4] CSN Snapshot

---
 doc/src/sgml/config.sgml                  |  50 +-
 src/backend/access/transam/Makefile       |   1 +
 src/backend/access/transam/csn_log.c      |  30 +-
 src/backend/access/transam/csn_snapshot.c | 724 ++++++++++++++++++++++
 src/backend/access/transam/twophase.c     | 158 +++++
 src/backend/access/transam/xact.c         |  29 +
 src/backend/access/transam/xlog.c         |  26 +-
 src/backend/storage/ipc/ipci.c            |   3 +
 src/backend/storage/ipc/procarray.c       |  75 +++
 src/backend/storage/lmgr/lwlocknames.txt  |   1 +
 src/backend/storage/lmgr/proc.c           |   5 +
 src/backend/utils/misc/guc.c              |  21 +
 src/backend/utils/time/snapmgr.c          | 146 ++++-
 src/bin/pg_waldump/csnlogdesc.c           |   1 +
 src/include/access/csn_log.h              |   3 +-
 src/include/access/csn_snapshot.h         |  65 ++
 src/include/catalog/pg_control.h          |   6 +
 src/include/catalog/pg_proc.dat           |  14 +
 src/include/datatype/timestamp.h          |   2 +
 src/include/fmgr.h                        |   1 +
 src/include/storage/proc.h                |  13 +
 src/include/storage/procarray.h           |   3 +
 src/include/utils/snapmgr.h               |   2 +
 src/include/utils/snapshot.h              |   3 +-
 24 files changed, 1367 insertions(+), 15 deletions(-)
 create mode 100644 src/backend/access/transam/csn_snapshot.c
 create mode 120000 src/bin/pg_waldump/csnlogdesc.c
 create mode 100644 src/include/access/csn_snapshot.h

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5679b40dd5..fd421ea84d 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9409,8 +9409,56 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
      </varlistentry>
 
      </variablelist>
-   </sect1>
 
+    <sect2 id="runtime-config-CSN-base-snapshot">
+     <title>CSN Based Snapshot</title>
+
+      <para>
+       By default, The snapshots in <productname>PostgreSQL</productname> uses the
+       XID (TransactionID) to identify the status of the transaction, the in-progress
+       transactions, and the future transactions for all its visibility calculations.
+      </para>
+
+      <para>
+       <productname>PostgreSQL</productname> also provides the CSN (commit-sequence-number)
+        based mechanism to identify the past-transactions and the ones that are yet to
+        be started/committed.
+      </para>
+
+     <variablelist>
+      <varlistentry id="guc-enable-csn-snapshot" xreflabel="enable_csn_snapshot">
+       <term><varname>enable_csn_snapshot</varname> (<type>boolean</type>)
+        <indexterm>
+         <primary><varname>enable_csn_snapshot</varname> configuration parameter</primary>
+        </indexterm>
+       </term>
+       <listitem>
+
+        <para>
+        Enable/disable the CSN based transaction visibility tracking for the snapshot.
+        </para>
+
+        <para>
+        <productname>PostgreSQL</productname> uses the clock timestamp as a CSN,
+        so enabling the CSN based snapshots can be useful for implementing the global
+        snapshots and global transaction visibility.
+        </para>
+
+        <para>
+         when enabled <productname>PostgreSQL</productname> creates
+         <filename>pg_csn</filename> directory under <envar>PGDATA</envar> to keep
+         the track of CSN and XID mappings.
+        </para>
+
+        <para>
+         The default value is off.
+        </para>
+       </listitem>
+      </varlistentry>
+
+     </variablelist>
+    </sect2>
+   </sect1>
    <sect1 id="runtime-config-compatible">
     <title>Version and Platform Compatibility</title>
 
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index d1196cba87..fc0321ee6b 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -16,6 +16,7 @@ OBJS = \
 	clog.o \
 	commit_ts.o \
 	csn_log.o \
+	csn_snapshot.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/csn_log.c b/src/backend/access/transam/csn_log.c
index 7a6d63e953..9511ecf48a 100644
--- a/src/backend/access/transam/csn_log.c
+++ b/src/backend/access/transam/csn_log.c
@@ -659,20 +659,38 @@ get_xmin_for_csn(void)
 }
 
 void
-prepare_csn_env(bool enable)
+prepare_csn_env(bool enable, bool same, TransactionId *xmin_for_csn_in_control)
 {
 	if (enable)
 	{
-		TransactionId nextxid =
+		if (same)
+		{
+			/*
+			 * Database startup with no enable_csn_snapshot change and value is true,
+			 * it can just transmit xmin_for_csn from pg_control to csnState->xmin_for_csn.
+			 */
+			csnShared->xmin_for_csn = *xmin_for_csn_in_control;
+		}
+		else
+		{
+			TransactionId nextxid =
 						XidFromFullTransactionId(ShmemVariableCache->nextXid);
-		/* 'xmin_for_csn' for when turn xid-snapshot to csn-snapshot */
-		csnShared->xmin_for_csn = nextxid;
-		/* produce the csnlog segment we want now and seek to current page */
-		ActivateCSNlog();
+
+			/* 'xmin_for_csn' for when turn xid-snapshot to csn-snapshot */
+			csnShared->xmin_for_csn = nextxid;
+			*xmin_for_csn_in_control = nextxid;
+
+			/* produce the csnlog segment we want now and seek to current page */
+			ActivateCSNlog();
+		}
 	}
 	else
+	{
 		/* Try to drop all csnlog seg */
 		DeactivateCSNlog();
+		/* Clear xmin_for_csn in pg_control because we are xid-base snaposhot now. */
+		*xmin_for_csn_in_control = InvalidTransactionId;
+	}
 }
 
 /*
diff --git a/src/backend/access/transam/csn_snapshot.c b/src/backend/access/transam/csn_snapshot.c
new file mode 100644
index 0000000000..33d769857f
--- /dev/null
+++ b/src/backend/access/transam/csn_snapshot.c
@@ -0,0 +1,724 @@
+/*-------------------------------------------------------------------------
+ *
+ * csn_snapshot.c
+ *		Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csn_snapshot.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <sys/time.h>
+
+#include "access/csn_log.h"
+#include "access/csn_snapshot.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/snapmgr.h"
+#include "miscadmin.h"
+
+/* Raise a warning if imported global_csn exceeds ours by this value. */
+#define SNAP_DESYNC_COMPLAIN (1*NSECS_PER_SEC) /* 1 second */
+
+TransactionId 	 xmin_for_csn = InvalidTransactionId;
+
+/*
+ * GUC to delay advance of oldestXid for this amount of time. Also determines
+ * the size CSNSnapshotXidMap circular buffer.
+ */
+int csn_snapshot_defer_time;
+
+/* DEBUG variable to simplify time skew modelling in tests. */
+int csn_time_shift;
+
+/*
+ * CSNSnapshotXidMap
+ *
+ * To be able to install csn snapshot that points to past we need to keep
+ * old versions of tuples and therefore delay advance of oldestXid.  Here we
+ * keep track of correspondence between snapshot's snapshot_csn and oldestXid
+ * that was set at the time when the snapshot was taken.  Much like the
+ * snapshot too old's OldSnapshotControlData does, but with finer granularity
+ * to seconds.
+ *
+ * Different strategies can be employed to hold oldestXid (e.g. we can track
+ * oldest csn-based snapshot among cluster nodes and map it oldestXid
+ * on each node).
+ *
+ * On each snapshot acquisition CSNSnapshotMapXmin() is called and stores
+ * correspondence between current snapshot_csn and oldestXmin in a sparse way:
+ * snapshot_csn is rounded to seconds (and here we use the fact that snapshot_csn
+ * is just a timestamp) and oldestXmin is stored in the circular buffer where
+ * rounded snapshot_csn acts as an offset from current circular buffer head.
+ * Size of the circular buffer is controlled by csn_snapshot_defer_time GUC.
+ *
+ * When csn snapshot arrives we check that its
+ * snapshot_csn is still in our map, otherwise we'll error out with "snapshot too
+ * old" message.  If snapshot_csn is successfully mapped to oldestXid we move
+ * backend's pgxact->xmin to proc->originalXmin and fill pgxact->xmin to
+ * mapped oldestXid.  That way GetOldestXmin() can take into account backends
+ * with imported csn snapshot and old tuple versions will be preserved.
+ *
+ * Also while calculating oldestXmin for our map in presence of imported
+ * csn snapshots we should use proc->originalXmin instead of pgxact->xmin
+ * that was set during import.  Otherwise, we can create a feedback loop:
+ * xmin's of imported csn snapshots were calculated using our map and new
+ * entries in map going to be calculated based on that xmin's, and there is
+ * a risk to stuck forever with one non-increasing oldestXmin.  All other
+ * callers of GetOldestXmin() are using pgxact->xmin so the old tuple versions
+ * are preserved.
+ */
+typedef struct CSNSnapshotXidMap
+{
+	int				 head;				/* offset of current freshest value */
+	int				 size;				/* total size of circular buffer */
+	CSN_atomic		 last_csn_seconds;	/* last rounded csn that changed
+										 * xmin_by_second[] */
+	TransactionId   *xmin_by_second;	/* circular buffer of oldestXmin's */
+}
+CSNSnapshotXidMap;
+
+static CSNSnapshotXidMap *csnXidMap;
+
+/* Estimate shared memory space needed */
+Size
+CSNSnapshotShmemSize(void)
+{
+	Size	size = 0;
+
+	if (csn_snapshot_defer_time > 0)
+	{
+		size += sizeof(CSNSnapshotXidMap);
+		size += csn_snapshot_defer_time * sizeof(TransactionId);
+		size = MAXALIGN(size);
+	}
+
+	return size;
+}
+
+/* Init shared memory structures */
+void
+CSNSnapshotShmemInit(void)
+{
+	bool found;
+
+	if (csn_snapshot_defer_time > 0)
+	{
+		csnXidMap = ShmemInitStruct("gsXidMap",
+								   sizeof(CSNSnapshotXidMap),
+								   &found);
+		if (!found)
+		{
+			int i;
+
+			pg_atomic_init_u64(&csnXidMap->last_csn_seconds,
+							   (uint64) get_last_max_csn());
+			csnXidMap->head = 0;
+			csnXidMap->size = csn_snapshot_defer_time;
+			csnXidMap->xmin_by_second =
+							ShmemAlloc(sizeof(TransactionId) * csnXidMap->size);
+
+			for (i = 0; i < csnXidMap->size; i++)
+				csnXidMap->xmin_by_second[i] = InvalidTransactionId;
+		}
+	}
+}
+
+/*
+ * GenerateCSN
+ *
+ * Generate CSN which is actually a local time. Also we are forcing
+ * this time to be always increasing. For portability reasons gettimeofday()
+ * and milliseconds resolution are used, but always increasing time helps to
+ * handle multiple transactions in one ms.
+ */
+CSN
+GenerateCSN(CSN assign)
+{
+	struct timeval  current_time;
+	CSN	csn;
+
+	Assert(get_csnlog_status() || csn_snapshot_defer_time > 0);
+
+	if(assign != InvalidCSN && assign < get_last_max_csn())
+	{
+		/*
+		 * While assign not 0, we just want to make sure we log the max csn
+		 * in last_max_csn. So we not care about the return value.
+		 *
+		 * We can return an Invalid value.
+		 */
+		return InvalidCSN;
+	}
+
+	/*
+	 * TODO: create some macro that add small random shift to current time.
+	 */
+	gettimeofday(&current_time, NULL);
+	csn = (CSN) (((uint64) (current_time.tv_sec + csn_time_shift) *
+		(uint64) 1000000000) + (uint64) (current_time).tv_usec * (uint64) 1000);
+
+	if (assign != InvalidCSN && csn < assign)
+		csn = assign;
+
+	csn = set_last_max_csn(csn, false);
+	WriteAssignCSNXlogRec(csn);
+	return csn;
+}
+
+/*
+ * CSNSnapshotPrepareCurrent
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+CSN
+CSNSnapshotPrepareCurrent(void)
+{
+	TransactionId xid = GetCurrentTransactionIdIfAny();
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (TransactionIdIsValid(xid))
+	{
+		TransactionId *subxids;
+		int nsubxids = xactGetCommittedChildren(&subxids);
+		CSNLogSetCSN(xid, nsubxids, subxids, InDoubtCSN, true);
+	}
+
+	/* Nothing to write if we don't have xid */
+
+	return GenerateCSN(InvalidCSN);
+}
+
+
+/*
+ * CSNSnapshotAssignCurrent
+ *
+ * Assign CSN to the currently active transaction. CSN is supposedly
+ * maximal among of values returned by CSNSnapshotPrepareCurrent and
+ * pg_csn_snapshot_prepare.
+ */
+void
+CSNSnapshotAssignCurrent(CSN csn)
+{
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (!CSNIsNormal(csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_csn_snapshot_assign expects normal csn")));
+
+	Assert(csn != InvalidCSN);
+	/* We do not care the Generate result, we just want to make sure max
+	 * csnState->last_max_csn value.
+	 */
+	GenerateCSN(csn);
+
+	/* Set csn and defuse ProcArrayEndTransaction from assigning one */
+	pg_atomic_write_u64(&MyProc->assignedCSN, csn);
+}
+
+/*
+ * CSNSnapshotSync
+ *
+ * Due to time desynchronization on different nodes we can receive snapshot_csn
+ * which is greater than snapshot_csn on this node. To preserve proper isolation
+ * this node needs to wait when such snapshot_csn comes on local clock.
+ *
+ * This should happend relatively rare if nodes have running NTP/PTP/etc.
+ * Complain if wait time is more than SNAP_SYNC_COMPLAIN.
+ */
+void
+CSNSnapshotSync(CSN remote_csn)
+{
+	CSN	local_csn;
+	CSN	delta;
+
+	Assert(enable_csn_snapshot);
+
+	for(;;)
+	{
+		if (get_last_max_csn() > remote_csn)
+			/* Everything is fine */
+			return;
+		else if ((local_csn = GenerateCSN(InvalidCSN)) >= remote_csn)
+			/*
+			 * Everything is fine too, but last_max_csn wasn't updated for
+			 * some time.
+			 */
+			return;
+
+		/* Okay we need to sleep now */
+		delta = remote_csn - local_csn;
+		if (delta > SNAP_DESYNC_COMPLAIN)
+			ereport(WARNING,
+				(errmsg("remote global snapshot exceeds ours by more than a second"),
+				 errhint("Consider running NTPd on servers participating in global transaction")));
+
+		/* TODO: report this sleeptime somewhere? */
+		pg_usleep((long) (delta/NSECS_PER_USEC));
+
+		/*
+		 * Loop that checks to ensure that we actually slept for specified
+		 * amount of time.
+		 */
+	}
+
+	Assert(false); /* Should not happend */
+	return;
+}
+
+/*
+ * TransactionIdGetCSN
+ *
+ * Get CSN for specified TransactionId taking care about special xids,
+ * xids beyond TransactionXmin and InDoubt states.
+ */
+CSN
+TransactionIdGetCSN(TransactionId xid)
+{
+	CSN csn;
+
+	Assert(get_csnlog_status());
+
+	/* Handle permanent TransactionId's for which we don't have mapping */
+	if (!TransactionIdIsNormal(xid))
+	{
+		if (xid == InvalidTransactionId)
+			return AbortedCSN;
+		if (xid == FrozenTransactionId || xid == BootstrapTransactionId)
+			return FrozenCSN;
+		Assert(false); /* Should not happend */
+	}
+
+	/*
+	 * If we just switch a xid-snapsot to a csn_snapshot, we should handle a start
+	 * xid for csn base check. Just in case we have prepared transaction which
+	 * hold the TransactionXmin but without CSN.
+	 */
+	if (xmin_for_csn == InvalidTransactionId)
+	{
+		csn = get_xmin_for_csn();
+
+		/*
+		 * If the instance started with enabled csn snapshots, use csn of its
+		 * start point. If csn snapshots were enabled in this backend only, use
+		 * xmin of this transaction.
+		 */
+		xmin_for_csn = (csn != InvalidTransactionId) ? csn : TransactionXmin;
+	}
+
+	/*
+	 * For the xid with 'xid >= TransactionXmin and xid < xmin_for_csn',
+	 * it defined as unclear csn which follow xid-snapshot result.
+	 */
+	if(!TransactionIdPrecedes(xid, TransactionXmin) &&
+							TransactionIdPrecedes(xid, xmin_for_csn))
+		return UnclearCSN;
+
+	/*
+	 * For xids which less then TransactionXmin CSNLog can be already
+	 * trimmed but we know that such transaction is definetly not concurrently
+	 * running according to any snapshot including timetravel ones. Callers
+	 * should check TransactionDidCommit after.
+	 */
+	if (TransactionIdPrecedes(xid, xmin_for_csn))
+		return FrozenCSN;
+
+	/* Read CSN from SLRU */
+	csn = CSNLogGetCSNByXid(xid);
+
+	/*
+	 * If we faced InDoubt state then transaction is being committed and we
+	 * should wait until CSN will be assigned so that visibility check
+	 * could decide whether tuple is in snapshot. See also comments in
+	 * CSNSnapshotPrecommit().
+	 */
+	if (CSNIsInDoubt(csn))
+	{
+		XactLockTableWait(xid, NULL, NULL, XLTW_None);
+		csn = CSNLogGetCSNByXid(xid);
+		Assert(CSNIsNormal(csn) || CSNIsAborted(csn));
+	}
+
+	Assert(CSNIsNormal(csn) || CSNIsInProgress(csn) || CSNIsAborted(csn));
+
+	return csn;
+}
+
+/*
+ * CSNSnapshotStartup
+ *
+ * Set csnXidMap entries to oldestActiveXID during startup.
+ */
+void
+CSNSnapshotStartup(TransactionId oldestActiveXID)
+{
+	/*
+	 * Run only if we have initialized shared memory and csnXidMap
+	 * is enabled.
+	 */
+	if (IsNormalProcessingMode() &&
+		enable_csn_snapshot && csn_snapshot_defer_time > 0)
+	{
+		int i;
+
+		Assert(TransactionIdIsValid(oldestActiveXID));
+		for (i = 0; i < csnXidMap->size; i++)
+			csnXidMap->xmin_by_second[i] = oldestActiveXID;
+
+		ProcArraySetCSNSnapshotXmin(oldestActiveXID);
+	}
+}
+
+/*
+ * CSNSnapshotMapXmin
+ *
+ * Maintain circular buffer of oldestXmins for several seconds in past. This
+ * buffer allows to shift oldestXmin in the past when backend is importing
+ * CSN snapshot. Otherwise old versions of tuples that were needed for
+ * this transaction can be recycled by other processes (vacuum, HOT, etc).
+ *
+ * Locking here is not trivial. Called upon each snapshot creation after
+ * ProcArrayLock is released. Such usage creates several race conditions. It
+ * is possible that backend who got csn called CSNSnapshotMapXmin()
+ * only after other backends managed to get snapshot and complete
+ * CSNSnapshotMapXmin() call, or even committed. This is safe because
+ *
+ *		* We already hold our xmin in MyProc, so our snapshot will not be
+ *		  harmed even though ProcArrayLock is released.
+ *
+ *		* snapshot_csn is always pessimistically rounded up to the next
+ *		  second.
+ *
+ *		* For performance reasons, xmin value for particular second is filled
+ *		  only once. Because of that instead of writing to buffer just our
+ *		  xmin (which is enough for our snapshot), we bump oldestXmin there --
+ *		  it mitigates the possibility of damaging someone else's snapshot by
+ *		  writing to the buffer too advanced value in case of slowness of
+ *		  another backend who generated csn earlier, but didn't manage to
+ *		  insert it before us.
+ *
+ *		* if CSNSnapshotMapXmin() founds a gap in several seconds between
+ *		  current call and latest completed call then it should fill that gap
+ *		  with latest known values instead of new one. Otherwise it is
+ *		  possible (however highly unlikely) that this gap also happend
+ *		  between taking snapshot and call to CSNSnapshotMapXmin() for some
+ *		  backend. And we are at risk to fill circullar buffer with
+ *		  oldestXmin's that are bigger then they actually were.
+ */
+void
+CSNSnapshotMapXmin(CSN snapshot_csn)
+{
+	int offset, gap, i;
+	CSN csn_seconds;
+	CSN last_csn_seconds;
+	volatile TransactionId oldest_deferred_xmin;
+	TransactionId current_oldest_xmin;
+	TransactionId previous_oldest_xmin;
+
+	/* Callers should check config values */
+	Assert(csn_snapshot_defer_time > 0);
+	Assert(csnXidMap != NULL);
+	/*
+	 * Round up snapshot_csn to the next second -- pessimistically and safely.
+	 */
+	csn_seconds = (snapshot_csn / NSECS_PER_SEC + 1);
+
+	/*
+	 * Fast-path check. Avoid taking exclusive CSNSnapshotXidMapLock lock
+	 * if oldestXid was already written to xmin_by_second[] for this rounded
+	 * snapshot_csn.
+	 */
+	if (pg_atomic_read_u64(&csnXidMap->last_csn_seconds) >= csn_seconds)
+		return;
+
+	/* Ok, we have new entry (or entries) */
+	LWLockAcquire(CSNSnapshotXidMapLock, LW_EXCLUSIVE);
+
+	/* Re-check last_csn_seconds under lock */
+	last_csn_seconds = pg_atomic_read_u64(&csnXidMap->last_csn_seconds);
+	if (last_csn_seconds >= csn_seconds)
+	{
+		LWLockRelease(CSNSnapshotXidMapLock);
+		return;
+	}
+	pg_atomic_write_u64(&csnXidMap->last_csn_seconds, csn_seconds);
+
+	/*
+	 * Count oldest_xmin.
+	 *
+	 * It was possible to calculate oldest_xmin during corresponding snapshot
+	 * creation, but GetSnapshotData() intentionally reads only PgXact, but not
+	 * PgProc. And we need info about originalXmin (see comment to csnXidMap)
+	 * which is stored in PgProc because of threats in comments around PgXact
+	 * about extending it with new fields. So just calculate oldest_xmin again,
+	 * that anyway happens quite rarely.
+	 */
+	current_oldest_xmin = GetOldestNonRemovableTransactionId(NULL);
+	Assert(TransactionIdIsNormal(current_oldest_xmin));
+
+	previous_oldest_xmin = csnXidMap->xmin_by_second[csnXidMap->head];
+	Assert(TransactionIdIsNormal(previous_oldest_xmin) || !enable_csn_snapshot);
+
+	gap = csn_seconds - last_csn_seconds;
+	offset = csn_seconds % csnXidMap->size;
+
+	/* Sanity check before we update head and gap */
+	Assert( gap >= 1 );
+	Assert( (csnXidMap->head + gap) % csnXidMap->size == offset );
+
+	gap = gap > csnXidMap->size ? csnXidMap->size : gap;
+	csnXidMap->head = offset;
+
+	/* Fill new entry with current_oldest_xmin */
+	csnXidMap->xmin_by_second[offset] = current_oldest_xmin;
+
+	/*
+	 * If we have gap then fill it with previous_oldest_xmin for reasons
+	 * outlined in comment above this function.
+	 */
+	for (i = 1; i < gap; i++)
+	{
+		offset = (offset + csnXidMap->size - 1) % csnXidMap->size;
+		csnXidMap->xmin_by_second[offset] = previous_oldest_xmin;
+	}
+
+	oldest_deferred_xmin =
+		csnXidMap->xmin_by_second[ (csnXidMap->head + 1) % csnXidMap->size ];
+
+	LWLockRelease(CSNSnapshotXidMapLock);
+
+	/*
+	 * Advance procArray->csn_snapshot_xmin after we released
+	 * CSNSnapshotXidMapLock. Since we gather not xmin but oldestXmin, it
+	 * never goes backwards regardless of how slow we can do that.
+	 */
+	Assert(TransactionIdFollowsOrEquals(oldest_deferred_xmin,
+										ProcArrayGetCSNSnapshotXmin()));
+	ProcArraySetCSNSnapshotXmin(oldest_deferred_xmin);
+}
+
+
+/*
+ * CSNSnapshotToXmin
+ *
+ * Get oldestXmin that took place when snapshot_csn was taken.
+ */
+TransactionId
+CSNSnapshotToXmin(CSN snapshot_csn)
+{
+	TransactionId xmin;
+	CSN csn_seconds;
+	volatile CSN last_csn_seconds;
+
+	/* Callers should check config values */
+	Assert(csn_snapshot_defer_time > 0);
+	Assert(csnXidMap != NULL);
+
+	/* Round down to get conservative estimates */
+	csn_seconds = (snapshot_csn / NSECS_PER_SEC);
+
+	LWLockAcquire(CSNSnapshotXidMapLock, LW_SHARED);
+	last_csn_seconds = pg_atomic_read_u64(&csnXidMap->last_csn_seconds);
+	if (csn_seconds > last_csn_seconds)
+	{
+		/* we don't have entry for this snapshot_csn yet, return latest known */
+		xmin = csnXidMap->xmin_by_second[csnXidMap->head];
+	}
+	else if (last_csn_seconds - csn_seconds < csnXidMap->size)
+	{
+		/* we are good, retrieve value from our map */
+		Assert(last_csn_seconds % csnXidMap->size == csnXidMap->head);
+		xmin = csnXidMap->xmin_by_second[csn_seconds % csnXidMap->size];
+	}
+	else
+	{
+		/* requested snapshot_csn is too old, let caller know */
+		xmin = InvalidTransactionId;
+	}
+	LWLockRelease(CSNSnapshotXidMapLock);
+
+	return xmin;
+}
+
+/*
+ * XidInvisibleInCSNSnapshot
+ *
+ * Version of XidInMVCCSnapshot for transactions. For non-imported
+ * csn snapshots this should give same results as XidInLocalMVCCSnapshot
+ * (except that aborts will be shown as invisible without going to clog) and to
+ * ensure such behaviour XidInMVCCSnapshot is coated with asserts that checks
+ * identicalness of XidInvisibleInCSNSnapshot/XidInLocalMVCCSnapshot in
+ * case of ordinary snapshot.
+ */
+bool
+XidInvisibleInCSNSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	CSN csn;
+
+	Assert(get_csnlog_status());
+
+	csn = TransactionIdGetCSN(xid);
+
+	if (CSNIsNormal(csn))
+		return (csn < snapshot->csn) ? false : true;
+	else if (CSNIsFrozen(csn))
+		/* It is bootstrap or frozen transaction */
+		return false;
+	else if (CSNIsUnclear(csn))
+		/*
+		 * Some xid can not figure out csn because of snapshot switch,
+		 * and we can follow xid-base result.
+		 */
+		return true;
+	else
+	{
+		/* It is aborted or in-progress */
+		Assert(CSNIsAborted(csn) || CSNIsInProgress(csn));
+		if (CSNIsAborted(csn))
+			Assert(TransactionIdDidAbort(xid));
+		return true;
+	}
+}
+
+
+/*****************************************************************************
+ * Functions to handle transactions commit.
+ *
+ * For local transactions CSNSnapshotPrecommit sets InDoubt state before
+ * ProcArrayEndTransaction is called and transaction data potetntially becomes
+ * visible to other backends. ProcArrayEndTransaction (or ProcArrayRemove in
+ * twophase case) then acquires csn under ProcArray lock and stores it
+ * in proc->assignedCSN. It's important that csn for commit is
+ * generated under ProcArray lock, otherwise snapshots won't
+ * be equivalent. Consequent call to CSNSnapshotCommit will write
+ * proc->assignedCSN to CSNLog.
+ *
+ *
+ * CSNSnapshotAbort is slightly different comparing to commit because abort
+ * can skip InDoubt phase and can be called for transaction subtree.
+ *****************************************************************************/
+
+
+/*
+ * CSNSnapshotAbort
+ *
+ * Abort transaction in CsnLog. We can skip InDoubt state for aborts
+ * since no concurrent transactions allowed to see aborted data anyway.
+ */
+void
+CSNSnapshotAbort(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	if (!get_csnlog_status())
+		return;
+
+	CSNLogSetCSN(xid, nsubxids, subxids, AbortedCSN, true);
+
+	/*
+	 * Clean assignedCSN anyway, as it was possibly set in
+	 * XidSnapshotAssignCsnCurrent.
+	 */
+	pg_atomic_write_u64(&proc->assignedCSN, InProgressCSN);
+}
+
+/*
+ * CSNSnapshotPrecommit
+ *
+ * Set InDoubt status for local transaction that we are going to commit.
+ * This step is needed to achieve consistency between local snapshots and
+ * csn-based snapshots. We don't hold ProcArray lock while writing
+ * csn for transaction in SLRU but instead we set InDoubt status before
+ * transaction is deleted from ProcArray so the readers who will read csn
+ * in the gap between ProcArray removal and CSN assignment can wait
+ * until CSN is finally assigned. See also TransactionIdGetCSN().
+ *
+ * This should be called only from parallel group leader before backend is
+ * deleted from ProcArray.
+ */
+void
+CSNSnapshotPrecommit(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	CSN oldassignedCSN = InProgressCSN;
+	bool in_progress;
+
+	if (!get_csnlog_status())
+		return;
+
+	/* Set InDoubt status if it is local transaction */
+	in_progress = pg_atomic_compare_exchange_u64(&proc->assignedCSN,
+												 &oldassignedCSN,
+												 InDoubtCSN);
+	if (in_progress)
+	{
+		Assert(CSNIsInProgress(oldassignedCSN));
+		CSNLogSetCSN(xid, nsubxids,
+						   subxids, InDoubtCSN, true);
+	}
+	else
+	{
+		/* Otherwise we should have valid CSN by this time */
+		Assert(CSNIsNormal(oldassignedCSN));
+		Assert(CSNIsInDoubt(CSNLogGetCSNByXid(xid)));
+	}
+}
+
+/*
+ * CSNSnapshotCommit
+ *
+ * Write CSN that were acquired earlier to CsnLog. Should be
+ * preceded by CSNSnapshotPrecommit() so readers can wait until we finally
+ * finished writing to SLRU.
+ *
+ * Should be called after ProcArrayEndTransaction, but before releasing
+ * transaction locks, so that TransactionIdGetCSN can wait on this
+ * lock for CSN.
+ */
+void
+CSNSnapshotCommit(PGPROC *proc, TransactionId xid,
+				  int nsubxids, TransactionId *subxids)
+{
+	volatile CSN assignedCSN;
+
+	if (!get_csnlog_status())
+		return;
+
+	if (!TransactionIdIsValid(xid))
+	{
+		assignedCSN = pg_atomic_read_u64(&proc->assignedCSN);
+		Assert(CSNIsInProgress(assignedCSN));
+		return;
+	}
+
+	/* Finally write resulting CSN in SLRU */
+	assignedCSN = pg_atomic_read_u64(&proc->assignedCSN);
+	Assert(CSNIsNormal(assignedCSN));
+	CSNLogSetCSN(xid, nsubxids, subxids, assignedCSN, true);
+
+	/* Reset for next transaction */
+	pg_atomic_write_u64(&proc->assignedCSN, InProgressCSN);
+}
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 89335b64a2..6b91953193 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,8 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
+#include "access/csn_snapshot.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
@@ -1483,8 +1485,34 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 									   hdr->nabortrels, abortrels,
 									   gid);
 
+	/*
+	 * CSNSnapshot callbacks that should be called right before we are
+	 * going to become visible. Details in comments to this functions.
+	 */
+	if (isCommit)
+		CSNSnapshotPrecommit(proc, xid, hdr->nsubxacts, children);
+	else
+		CSNSnapshotAbort(proc, xid, hdr->nsubxacts, children);
+
+
 	ProcArrayRemove(proc, latestXid);
 
+	/*
+	 * Stamp our transaction with CSN in CSNLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks, since TransactionIdGetCSN relies on
+	 * XactLockTableWait to await csn.
+	 */
+	if (isCommit)
+	{
+		CSNSnapshotCommit(proc, xid, hdr->nsubxacts, children);
+	}
+	else
+	{
+		Assert(CSNIsInProgress(
+				   pg_atomic_read_u64(&proc->assignedCSN)));
+	}
+
 	/*
 	 * In case we fail while running the callbacks, mark the gxact invalid so
 	 * no one else will try to commit/rollback, and so it will be recycled if
@@ -2458,3 +2486,133 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
 		RemoveTwoPhaseFile(xid, giveWarning);
 	RemoveGXact(gxact);
 }
+
+/*
+ * CSNSnapshotPrepareTwophase
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+static CSN
+CSNSnapshotPrepareTwophase(const char *gid)
+{
+	GlobalTransaction gxact;
+	PGPROC	   *proc;
+	char	   *buf;
+	TransactionId xid;
+	xl_xact_parsed_prepare parsed;
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+	xid = proc->xid;
+
+	if (gxact->ondisk)
+		buf = ReadTwoPhaseFile(xid, true);
+	else
+		XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
+
+	ParsePrepareRecord(0, (xl_xact_prepare *)buf, &parsed);
+
+	CSNLogSetCSN(xid, parsed.nsubxacts,
+					parsed.subxacts, InDoubtCSN, true);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+
+	pfree(buf);
+
+	return GenerateCSN(InvalidCSN);
+}
+
+/*
+ * CSNSnapshotAssignTwoPhase
+ *
+ * Asign CSN for currently active transaction. CSN is supposedly
+ * maximal among of values returned by CSNSnapshotPrepareCurrent and
+ * pg_csn_snapshot_prepare.
+ *
+ * This function is a counterpart of CSNSnapshotAssignCurrent() for
+ * twophase transactions.
+ */
+static void
+CSNSnapshotAssignTwoPhase(const char *gid, CSN csn)
+{
+	GlobalTransaction	gxact;
+	PGPROC				*proc;
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (!CSNIsNormal(csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_csn_snapshot_assign expects normal csn")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+	Assert(csn != InvalidCSN);
+	/* We do not care the Generate result, we just want to make sure max
+	 * csnState->last_max_csn value.
+	 */
+	GenerateCSN(csn);
+	/* Set csn and defuse ProcArrayRemove from assigning one. */
+	pg_atomic_write_u64(&proc->assignedCSN, csn);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+}
+
+/*
+ * SQL interface to CSNSnapshotPrepareTwophase()
+ *
+ * TODO: Rewrite this as PREPARE TRANSACTION 'gid' RETURNING SNAPSHOT
+ */
+Datum
+pg_csn_snapshot_prepare(PG_FUNCTION_ARGS)
+{
+	const char 	*gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	CSN	csn;
+
+	csn = CSNSnapshotPrepareTwophase(gid);
+
+	PG_RETURN_INT64(csn);
+}
+
+/*
+ * SQL interface to CSNSnapshotAssignTwoPhase()
+ *
+ * TODO: Rewrite this as COMMIT PREPARED 'gid' SNAPSHOT 'csn'
+ */
+Datum
+pg_csn_snapshot_assign(PG_FUNCTION_ARGS)
+{
+	const char *gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	CSN	csn = PG_GETARG_INT64(1);
+
+	CSNSnapshotAssignTwoPhase(gid, csn);
+	PG_RETURN_VOID();
+}
\ No newline at end of file
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 6395a9b240..deabedbe37 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_snapshot.h"
 #include "access/multixact.h"
 #include "access/parallel.h"
 #include "access/subtrans.h"
@@ -1486,6 +1487,14 @@ RecordTransactionCommit(void)
 
 	/* Reset XactLastRecEnd until the next transaction writes something */
 	XactLastRecEnd = 0;
+
+	/*
+	 * Mark our transaction as InDoubt in CsnLog and get ready for
+	 * commit.
+	 */
+	if (markXidCommitted)
+		CSNSnapshotPrecommit(MyProc, xid, nchildren, children);
+
 cleanup:
 	/* Clean up local data */
 	if (rels)
@@ -1747,6 +1756,11 @@ RecordTransactionAbort(bool isSubXact)
 	 */
 	TransactionIdAbortTree(xid, nchildren, children);
 
+	/*
+	 * Mark our transaction as Aborted in CsnLog.
+	 */
+	CSNSnapshotAbort(MyProc, xid, nchildren, children);
+
 	END_CRIT_SECTION();
 
 	/* Compute latestXid while we have the child XIDs handy */
@@ -2237,6 +2251,21 @@ CommitTransaction(void)
 	 */
 	ProcArrayEndTransaction(MyProc, latestXid);
 
+	/*
+	 * Stamp our transaction with CSN in CsnLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks.
+	 */
+	if (!is_parallel_worker)
+	{
+		TransactionId xid = GetTopTransactionIdIfAny();
+		TransactionId *subxids;
+		int nsubxids;
+
+		nsubxids = xactGetCommittedChildren(&subxids);
+		CSNSnapshotCommit(MyProc, xid, nsubxids, subxids);
+	}
+
 	/*
 	 * This is all post-commit cleanup.  Note that if an error is raised here,
 	 * it's too late to abort the transaction.  This should be just
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 46f016dce7..200c387084 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4652,6 +4652,7 @@ InitControlFile(uint64 sysidentifier)
 	ControlFile->track_commit_timestamp = track_commit_timestamp;
 	ControlFile->enable_csn_snapshot = enable_csn_snapshot;
 	ControlFile->data_checksum_version = bootstrap_data_checksum_version;
+	ControlFile->xmin_for_csn = InvalidTransactionId;
 }
 
 static void
@@ -7211,6 +7212,7 @@ StartupXLOG(void)
 			 * maintained during recovery and need not be started yet.
 			 */
 			StartupSUBTRANS(oldestActiveXID);
+			CSNSnapshotStartup(oldestActiveXID);
 
 			/*
 			 * If we're beginning at a shutdown checkpoint, we know that
@@ -8030,7 +8032,10 @@ StartupXLOG(void)
 	 * timestamps are started below, if necessary.)
 	 */
 	if (standbyState == STANDBY_DISABLED)
+	{
 		StartupSUBTRANS(oldestActiveXID);
+		CSNSnapshotStartup(oldestActiveXID);
+	}
 
 	/*
 	 * Perform end of recovery actions for any SLRUs that need it.
@@ -9925,6 +9930,8 @@ XLogRestorePoint(const char *rpName)
 static void
 XLogReportParameters(void)
 {
+	TransactionId xmin_for_csn = ControlFile->xmin_for_csn;
+
 	if (wal_level != ControlFile->wal_level ||
 		wal_log_hints != ControlFile->wal_log_hints ||
 		MaxConnections != ControlFile->MaxConnections ||
@@ -9964,11 +9971,12 @@ XLogReportParameters(void)
 			XLogFlush(recptr);
 		}
 
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
-		if (enable_csn_snapshot != ControlFile->enable_csn_snapshot)
-			prepare_csn_env(enable_csn_snapshot);
+		prepare_csn_env(enable_csn_snapshot,
+						enable_csn_snapshot == ControlFile->enable_csn_snapshot,
+						&xmin_for_csn);
 
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		ControlFile->xmin_for_csn = xmin_for_csn;
 		ControlFile->MaxConnections = MaxConnections;
 		ControlFile->max_worker_processes = max_worker_processes;
 		ControlFile->max_wal_senders = max_wal_senders;
@@ -9982,6 +9990,16 @@ XLogReportParameters(void)
 
 		LWLockRelease(ControlFileLock);
 	}
+	else
+	{
+		/*
+		 * When no GUC change, but for xmin_for_csn it should transmit the xmin_for_csn
+		 * from pg_control to csnState->xmin_for_csn. Or it will cause issue when prepare
+		 * transaction exists and with 'xid-snapshot start  ->  csn-snapshot start  ->
+		 * csn-snapshot start' sequence.
+		 */
+		prepare_csn_env(enable_csn_snapshot, true, &xmin_for_csn);
+	}
 }
 
 /*
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index e06bc13841..bbc5f1adf7 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -17,6 +17,7 @@
 #include "access/clog.h"
 #include "access/commit_ts.h"
 #include "access/csn_log.h"
+#include "access/csn_snapshot.h"
 #include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -152,6 +153,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
 		size = add_size(size, AsyncShmemSize());
+		size = add_size(size, CSNSnapshotShmemSize());
 #ifdef EXEC_BACKEND
 		size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -272,6 +274,7 @@ CreateSharedMemoryAndSemaphores(void)
 	BTreeShmemInit();
 	SyncScanShmemInit();
 	AsyncShmemInit();
+	CSNSnapshotShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 4a200b0815..62138fddcc 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -49,6 +49,7 @@
 
 #include "access/clog.h"
 #include "access/csn_log.h"
+#include "access/csn_snapshot.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -98,6 +99,9 @@ typedef struct ProcArrayStruct
 	/* oldest catalog xmin of any replication slot */
 	TransactionId replication_slot_catalog_xmin;
 
+	/* xmin of oldest active csn snapshot */
+	TransactionId csn_snapshot_xmin;
+
 	/* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */
 	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
 } ProcArrayStruct;
@@ -419,6 +423,7 @@ CreateSharedProcArray(void)
 		procArray->lastOverflowedXid = InvalidTransactionId;
 		procArray->replication_slot_xmin = InvalidTransactionId;
 		procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+		procArray->csn_snapshot_xmin = InvalidTransactionId;
 		ShmemVariableCache->xactCompletionCount = 1;
 	}
 
@@ -547,6 +552,14 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 		/* Advance global latestCompletedXid while holding the lock */
 		MaintainLatestCompletedXid(latestXid);
 
+		/*
+		 * Assign xid csn while holding ProcArrayLock for non-distributed
+		 * COMMIT PREPARED. After lock is released consequent
+		 * CSNSnapshotCommit() will write this value to CsnLog.
+		 */
+		if (CSNIsInDoubt(pg_atomic_read_u64(&proc->assignedCSN)))
+			pg_atomic_write_u64(&proc->assignedCSN, GenerateCSN(InvalidCSN));
+
 		/* Same with xactCompletionCount  */
 		ShmemVariableCache->xactCompletionCount++;
 
@@ -656,6 +669,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 
 		proc->lxid = InvalidLocalTransactionId;
 		proc->xmin = InvalidTransactionId;
+		proc->originalXmin = InvalidTransactionId;
 		proc->delayChkpt = false;	/* be sure this is cleared in abort */
 		proc->recoveryConflictPending = false;
 
@@ -695,6 +709,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	proc->xid = InvalidTransactionId;
 	proc->lxid = InvalidLocalTransactionId;
 	proc->xmin = InvalidTransactionId;
+	proc->originalXmin = InvalidTransactionId;
 	proc->delayChkpt = false;	/* be sure this is cleared in abort */
 	proc->recoveryConflictPending = false;
 
@@ -720,6 +735,16 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	/* Also advance global latestCompletedXid while holding the lock */
 	MaintainLatestCompletedXid(latestXid);
 
+	/*
+	 * Assign xid csn while holding ProcArrayLock for
+	 * COMMIT.
+	 *
+	 * TODO: in case of group commit we can generate one CSNSnapshot for
+	 * whole group to save time on timestamp aquisition.
+	 */
+	if (CSNIsInDoubt(pg_atomic_read_u64(&proc->assignedCSN)))
+		pg_atomic_write_u64(&proc->assignedCSN, GenerateCSN(InvalidCSN));
+
 	/* Same with xactCompletionCount  */
 	ShmemVariableCache->xactCompletionCount++;
 }
@@ -878,6 +903,7 @@ ProcArrayClearTransaction(PGPROC *proc)
 
 	proc->lxid = InvalidLocalTransactionId;
 	proc->xmin = InvalidTransactionId;
+	proc->originalXmin = InvalidTransactionId;
 	proc->recoveryConflictPending = false;
 
 	Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK));
@@ -1667,6 +1693,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	TransactionId kaxmin;
 	bool		in_recovery = RecoveryInProgress();
 	TransactionId *other_xids = ProcGlobal->xids;
+	TransactionId csn_snapshot_xmin = InvalidTransactionId;
 
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
@@ -1715,6 +1742,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	 */
 	h->slot_xmin = procArray->replication_slot_xmin;
 	h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	csn_snapshot_xmin = ProcArrayGetCSNSnapshotXmin();
 
 	for (int index = 0; index < arrayP->numProcs; index++)
 	{
@@ -1723,6 +1751,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		int8		statusFlags = ProcGlobal->statusFlags[index];
 		TransactionId xid;
 		TransactionId xmin;
+		TransactionId original_xmin = UINT32_ACCESS_ONCE(proc->originalXmin);
 
 		/* Fetch xid just once - see GetNewTransactionId */
 		xid = UINT32_ACCESS_ONCE(other_xids[index]);
@@ -1735,6 +1764,9 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		 * (yet) an Xid; conversely, if it has an Xid, that could determine
 		 * some not-yet-set Xmin.
 		 */
+		if (TransactionIdIsValid(original_xmin))
+			xmin = original_xmin;
+
 		xmin = TransactionIdOlder(xmin, xid);
 
 		/* if neither is set, this proc doesn't influence the horizon */
@@ -1854,6 +1886,10 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		/* defer doesn't apply to temp relations */
 	}
 
+	if (TransactionIdIsValid(csn_snapshot_xmin) &&
+		TransactionIdOlder(csn_snapshot_xmin, h->shared_oldest_nonremovable))
+		h->shared_oldest_nonremovable = csn_snapshot_xmin;
+
 	/*
 	 * Check whether there are replication slots requiring an older xmin.
 	 */
@@ -2064,6 +2100,8 @@ GetSnapshotDataReuse(Snapshot snapshot)
 	if (curXactCompletionCount != snapshot->snapXactCompletionCount)
 		return false;
 
+	if (get_csnlog_status())
+		return false;
 	/*
 	 * If the current xactCompletionCount is still the same as it was at the
 	 * time the snapshot was built, we can be sure that rebuilding the
@@ -2143,6 +2181,7 @@ GetSnapshotData(Snapshot snapshot)
 	size_t		count = 0;
 	int			subcount = 0;
 	bool		suboverflowed = false;
+	CSN	csn = FrozenCSN;
 	FullTransactionId latest_completed;
 	TransactionId oldestxid;
 	int			mypgxactoff;
@@ -2151,6 +2190,7 @@ GetSnapshotData(Snapshot snapshot)
 
 	TransactionId replication_slot_xmin = InvalidTransactionId;
 	TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+	TransactionId csn_snapshot_xmin = InvalidTransactionId;
 
 	Assert(snapshot != NULL);
 
@@ -2371,10 +2411,18 @@ GetSnapshotData(Snapshot snapshot)
 	 */
 	replication_slot_xmin = procArray->replication_slot_xmin;
 	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	csn_snapshot_xmin = ProcArrayGetCSNSnapshotXmin();
 
 	if (!TransactionIdIsValid(MyProc->xmin))
 		MyProc->xmin = TransactionXmin = xmin;
 
+	/*
+	 * Take CSN under ProcArrayLock so the snapshot stays
+	 * synchronized.
+	 */
+	if (!snapshot->takenDuringRecovery && get_csnlog_status())
+		csn = GenerateCSN(InvalidCSN);
+
 	LWLockRelease(ProcArrayLock);
 
 	/* maintain state for GlobalVis* */
@@ -2396,6 +2444,9 @@ GetSnapshotData(Snapshot snapshot)
 		def_vis_xid_data =
 			TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age);
 
+		if (TransactionIdIsValid(csn_snapshot_xmin))
+			def_vis_xid_data = TransactionIdOlder(csn_snapshot_xmin, def_vis_xid_data);
+
 		/* Check whether there's a replication slot requiring an older xmin. */
 		def_vis_xid_data =
 			TransactionIdOlder(def_vis_xid_data, replication_slot_xmin);
@@ -2480,6 +2531,11 @@ GetSnapshotData(Snapshot snapshot)
 	snapshot->active_count = 0;
 	snapshot->regd_count = 0;
 	snapshot->copied = false;
+	snapshot->imported_csn = false;
+	snapshot->csn = csn;
+
+	if (csn_snapshot_defer_time > 0 && IsUnderPostmaster)
+		CSNSnapshotMapXmin(snapshot->csn);
 
 	GetSnapshotDataInitOldSnapshot(snapshot);
 
@@ -3832,6 +3888,25 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 	LWLockRelease(ProcArrayLock);
 }
 
+/*
+ * ProcArraySetCSNSnapshotXmin
+ */
+void
+ProcArraySetCSNSnapshotXmin(TransactionId xmin)
+{
+	/* We rely on atomic fetch/store of xid */
+	procArray->csn_snapshot_xmin = xmin;
+}
+
+/*
+ * ProcArrayGetCSNSnapshotXmin
+ */
+TransactionId
+ProcArrayGetCSNSnapshotXmin(void)
+{
+	return procArray->csn_snapshot_xmin;
+}
+
 /*
  * XidCacheRemoveRunningXids
  *
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 56196b2d0d..ac0e6a33fe 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -54,3 +54,4 @@ XactTruncationLock					44
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
 CSNLogControlLock				    48
+CSNSnapshotXidMapLock               49
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 897045ee27..6e66801001 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -37,6 +37,7 @@
 
 #include "access/transam.h"
 #include "access/twophase.h"
+#include "access/csn_snapshot.h"
 #include "access/xact.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -440,6 +441,9 @@ InitProcess(void)
 	MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
 	Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
 
+	MyProc->originalXmin = InvalidTransactionId;
+	pg_atomic_init_u64(&MyProc->assignedCSN, InProgressCSN);
+
 	/*
 	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
 	 * on it.  That allows us to repoint the process latch, which so far
@@ -582,6 +586,7 @@ InitAuxiliaryProcess(void)
 	MyProc->lwWaitMode = 0;
 	MyProc->waitLock = NULL;
 	MyProc->waitProcLock = NULL;
+	MyProc->originalXmin = InvalidTransactionId;
 	pg_atomic_write_u64(&MyProc->waitStart, 0);
 #ifdef USE_ASSERT_CHECKING
 	{
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 14063bcf9e..cc01d72d64 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -30,6 +30,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_snapshot.h"
 #include "access/gin.h"
 #include "access/rmgr.h"
 #include "access/tableam.h"
@@ -3140,6 +3141,26 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"csn_snapshot_defer_time", PGC_POSTMASTER, PRESET_OPTIONS,
+			gettext_noop("Minimal age of records which allowed to be vacuumed, in seconds."),
+			NULL
+		},
+		&csn_snapshot_defer_time,
+		5, 0, INT_MAX,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"csn_time_shift", PGC_USERSET, RESOURCES_MEM,
+			gettext_noop("Do the time shift in the CSN generator."),
+			gettext_noop("Used for debug purposes.")
+		},
+		&csn_time_shift,
+		0, INT_MIN, INT_MAX,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"block_size", PGC_INTERNAL, PRESET_OPTIONS,
 			gettext_noop("Shows the size of a disk block."),
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 95704265b6..2d4c5f53af 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -48,6 +48,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -173,6 +174,7 @@ static TimestampTz AlignTimestampToMinuteBoundary(TimestampTz ts);
 static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
 static void SnapshotResetXmin(void);
+static bool XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot);
 
 /*
  * Snapshot fields to be serialized.
@@ -191,6 +193,8 @@ typedef struct SerializedSnapshotData
 	CommandId	curcid;
 	TimestampTz whenTaken;
 	XLogRecPtr	lsn;
+	CSN			csn;
+	bool		imported_csn;
 } SerializedSnapshotData;
 
 Size
@@ -291,6 +295,7 @@ GetTransactionSnapshot(void)
 				CurrentSnapshot = GetSerializableTransactionSnapshot(&CurrentSnapshotData);
 			else
 				CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
+
 			/* Make a saved copy */
 			CurrentSnapshot = CopySnapshot(CurrentSnapshot);
 			FirstXactSnapshot = CurrentSnapshot;
@@ -2115,6 +2120,8 @@ SerializeSnapshot(Snapshot snapshot, char *start_address)
 	serialized_snapshot.curcid = snapshot->curcid;
 	serialized_snapshot.whenTaken = snapshot->whenTaken;
 	serialized_snapshot.lsn = snapshot->lsn;
+	serialized_snapshot.csn = snapshot->csn;
+	serialized_snapshot.imported_csn = snapshot->imported_csn;
 
 	/*
 	 * Ignore the SubXID array if it has overflowed, unless the snapshot was
@@ -2190,6 +2197,8 @@ RestoreSnapshot(char *start_address)
 	snapshot->whenTaken = serialized_snapshot.whenTaken;
 	snapshot->lsn = serialized_snapshot.lsn;
 	snapshot->snapXactCompletionCount = 0;
+	snapshot->csn = serialized_snapshot.csn;
+	snapshot->imported_csn = serialized_snapshot.imported_csn;
 
 	/* Copy XIDs, if present. */
 	if (serialized_snapshot.xcnt > 0)
@@ -2230,6 +2239,44 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
 
 /*
  * XidInMVCCSnapshot
+ *
+ * Check whether this xid is in snapshot. When enable_csn_snapshot is
+ * switched off just call XidInLocalMVCCSnapshot().
+ */
+bool
+XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	bool in_snapshot;
+
+	if (snapshot->imported_csn)
+	{
+		Assert(enable_csn_snapshot);
+		/* No point to using snapshot info except CSN */
+		return XidInvisibleInCSNSnapshot(xid, snapshot);
+	}
+
+	in_snapshot = XidInLocalMVCCSnapshot(xid, snapshot);
+
+	if (!get_csnlog_status())
+	{
+		Assert(CSNIsFrozen(snapshot->csn));
+		return in_snapshot;
+	}
+
+	if (in_snapshot)
+	{
+		/*
+		 * This xid may be already in unknown state and in that case
+		 * we must wait and recheck.
+		 */
+		return XidInvisibleInCSNSnapshot(xid, snapshot);
+	}
+	else
+		return false;
+}
+
+/*
+ * XidInLocalMVCCSnapshot
  *		Is the given XID still-in-progress according to the snapshot?
  *
  * Note: GetSnapshotData never stores either top xid or subxids of our own
@@ -2238,8 +2285,8 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
  * TransactionIdIsCurrentTransactionId first, except when it's known the
  * XID could not be ours anyway.
  */
-bool
-XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+static bool
+XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
 	uint32		i;
 
@@ -2349,3 +2396,98 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 
 	return false;
 }
+
+/*
+ * ExportCSNSnapshot
+ *
+ * Export csn so that caller can expand this transaction to other
+ * nodes.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+CSN
+ExportCSNSnapshot()
+{
+	if (!get_csnlog_status())
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not export csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "enable_csn_snapshot")));
+
+	return CurrentSnapshot->csn;
+}
+
+/* SQL accessor to ExportCSNSnapshot() */
+Datum
+pg_csn_snapshot_export(PG_FUNCTION_ARGS)
+{
+	CSN	export_csn = ExportCSNSnapshot();
+	PG_RETURN_UINT64(export_csn);
+}
+
+/*
+ * ImportCSNSnapshot
+ *
+ * Import csn and retract this backends xmin to the value that was
+ * actual when we had such csn.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+void
+ImportCSNSnapshot(CSN csn)
+{
+	volatile TransactionId xmin;
+
+	if (!get_csnlog_status())
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "enable_csn_snapshot")));
+
+	if (csn_snapshot_defer_time <= 0)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is positive.",
+					 "csn_snapshot_defer_time")));
+
+	/*
+	 * Call CSNSnapshotToXmin under ProcArrayLock to avoid situation that
+	 * resulting xmin will be evicted from map before we will set it into our
+	 * backend's xmin.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	xmin = CSNSnapshotToXmin(csn);
+
+	if (!TransactionIdIsValid(xmin))
+	{
+		LWLockRelease(ProcArrayLock);
+		elog(ERROR, "CSNSnapshotToXmin: csn snapshot too old: %lu", csn);
+	}
+
+	MyProc->originalXmin = MyProc->xmin;
+	MyProc->xmin = TransactionXmin = xmin;
+	LWLockRelease(ProcArrayLock);
+
+	CurrentSnapshot->xmin = xmin; /* defuse SnapshotResetXmin() */
+	CurrentSnapshot->csn = csn;
+	CurrentSnapshot->imported_csn = true;
+
+	CSNSnapshotSync(csn);
+}
+
+/* SQL accessor to ImportCSNSnapshot() */
+Datum
+pg_csn_snapshot_import(PG_FUNCTION_ARGS)
+{
+	CSN	csn = PG_GETARG_UINT64(0);
+
+	ImportCSNSnapshot(csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/bin/pg_waldump/csnlogdesc.c b/src/bin/pg_waldump/csnlogdesc.c
new file mode 120000
index 0000000000..dcde44b3ee
--- /dev/null
+++ b/src/bin/pg_waldump/csnlogdesc.c
@@ -0,0 +1 @@
+../../../src/backend/access/rmgrdesc/csnlogdesc.c
\ No newline at end of file
diff --git a/src/include/access/csn_log.h b/src/include/access/csn_log.h
index 01fb0daad4..9857724dca 100644
--- a/src/include/access/csn_log.h
+++ b/src/include/access/csn_log.h
@@ -71,7 +71,8 @@ extern void WriteAssignCSNXlogRec(CSN csn);
 extern CSN set_last_max_csn(CSN csn, bool force);
 extern CSN get_last_max_csn(void);
 extern CSN get_xmin_for_csn(void);
-extern void prepare_csn_env(bool enable_csn_snapshot);
+extern void prepare_csn_env(bool enable_csn_snapshot, bool same,
+							TransactionId *xmin_for_csn_in_control);
 extern void CatchCSNLog(void);
 extern void ActivateCSNlog(void);
 extern void DeactivateCSNlog(void);
diff --git a/src/include/access/csn_snapshot.h b/src/include/access/csn_snapshot.h
new file mode 100644
index 0000000000..3a739e6744
--- /dev/null
+++ b/src/include/access/csn_snapshot.h
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * csn_snapshot.h
+ *	  Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csn_snapshot.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CSN_SNAPSHOT_H
+#define CSN_SNAPSHOT_H
+
+#include "port/atomics.h"
+#include "storage/lock.h"
+#include "utils/snapshot.h"
+#include "utils/guc.h"
+
+/*
+ * snapshot.h is used in frontend code so atomic variant of CSN type
+ * is defined here.
+ */
+typedef pg_atomic_uint64 CSN_atomic;
+
+#define InProgressCSN	 	UINT64CONST(0x0)
+#define AbortedCSN	 		UINT64CONST(0x1)
+#define FrozenCSN		 	UINT64CONST(0x2)
+#define InDoubtCSN	 		UINT64CONST(0x3)
+#define UnclearCSN	 		UINT64CONST(0x4)
+#define FirstNormalCSN 		UINT64CONST(0x5)
+
+#define CSNIsInProgress(csn)	((csn) == InProgressCSN)
+#define CSNIsAborted(csn)		((csn) == AbortedCSN)
+#define CSNIsFrozen(csn)		((csn) == FrozenCSN)
+#define CSNIsInDoubt(csn)		((csn) == InDoubtCSN)
+#define CSNIsUnclear(csn)		((csn) == UnclearCSN)
+#define CSNIsNormal(csn)		((csn) >= FirstNormalCSN)
+
+
+extern int csn_snapshot_defer_time;
+extern int csn_time_shift;
+
+
+extern Size CSNSnapshotShmemSize(void);
+extern void CSNSnapshotShmemInit(void);
+
+extern void CSNSnapshotStartup(TransactionId oldestActiveXID);
+extern void CSNSnapshotMapXmin(CSN snapshot_csn);
+extern TransactionId CSNSnapshotToXmin(CSN snapshot_csn);
+extern CSN GenerateCSN(CSN assign);
+extern bool XidInvisibleInCSNSnapshot(TransactionId xid, Snapshot snapshot);
+extern CSN TransactionIdGetCSN(TransactionId xid);
+extern void CSNSnapshotAbort(PGPROC *proc, TransactionId xid, int nsubxids,
+								TransactionId *subxids);
+extern void CSNSnapshotPrecommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void CSNSnapshotCommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void CSNSnapshotAssignCurrent(CSN snapshot_csn);
+extern CSN CSNSnapshotPrepareCurrent(void);
+extern void CSNSnapshotSync(CSN remote_csn);
+
+#endif							/* CSN_SNAPSHOT_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 194d100ed4..fbd63f50d9 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -185,6 +185,12 @@ typedef struct ControlFileData
 	bool		track_commit_timestamp;
 	bool		enable_csn_snapshot;
 
+	/*
+	 * Used to record a xmin when database startup with a snapshot-switch to csn snapshot,
+	 * and will hold the value until it switch to xid-snapshot.
+	 */
+	TransactionId xmin_for_csn;
+
 	/*
 	 * This data is used to check for hardware-architecture compatibility of
 	 * the database and the backend executable.  We need not check endianness
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index b9f4afba05..8585464c5b 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11411,4 +11411,18 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+# csn shnapshot handling
+{ oid => '10001', descr => 'export csn snapshot',
+  proname => 'pg_csn_snapshot_export', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_csn_snapshot_export' },
+{ oid => '10002', descr => 'import csn snapshot',
+  proname => 'pg_csn_snapshot_import', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'int8', prosrc => 'pg_csn_snapshot_import' },
+{ oid => '10003', descr => 'prepare distributed transaction for commit, get csn',
+  proname => 'pg_csn_snapshot_prepare', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => 'text', prosrc => 'pg_csn_snapshot_prepare' },
+{ oid => '10004', descr => 'assign csn to distributed transaction',
+  proname => 'pg_csn_snapshot_assign', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'text int8', prosrc => 'pg_csn_snapshot_assign' },
+
 ]
diff --git a/src/include/datatype/timestamp.h b/src/include/datatype/timestamp.h
index 99873497a6..d8a681aaa4 100644
--- a/src/include/datatype/timestamp.h
+++ b/src/include/datatype/timestamp.h
@@ -92,6 +92,8 @@ typedef struct
 #define USECS_PER_HOUR	INT64CONST(3600000000)
 #define USECS_PER_MINUTE INT64CONST(60000000)
 #define USECS_PER_SEC	INT64CONST(1000000)
+#define NSECS_PER_SEC	INT64CONST(1000000000)
+#define NSECS_PER_USEC	INT64CONST(1000)
 
 /*
  * We allow numeric timezone offsets up to 15:59:59 either way from Greenwich.
diff --git a/src/include/fmgr.h b/src/include/fmgr.h
index ab7b85c86e..f08999740b 100644
--- a/src/include/fmgr.h
+++ b/src/include/fmgr.h
@@ -281,6 +281,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum);
 #define PG_GETARG_FLOAT4(n)  DatumGetFloat4(PG_GETARG_DATUM(n))
 #define PG_GETARG_FLOAT8(n)  DatumGetFloat8(PG_GETARG_DATUM(n))
 #define PG_GETARG_INT64(n)	 DatumGetInt64(PG_GETARG_DATUM(n))
+#define PG_GETARG_UINT64(n)	 DatumGetUInt64(PG_GETARG_DATUM(n))
 /* use this if you want the raw, possibly-toasted input datum: */
 #define PG_GETARG_RAW_VARLENA_P(n)	((struct varlena *) PG_GETARG_POINTER(n))
 /* use this if you want the input datum de-toasted: */
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index f0e920f462..6edb492638 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -15,6 +15,7 @@
 #define _PROC_H_
 
 #include "access/clog.h"
+#include "access/csn_snapshot.h"
 #include "access/xlogdefs.h"
 #include "lib/ilist.h"
 #include "utils/snapshot.h"
@@ -252,6 +253,18 @@ struct PGPROC
 	PGPROC	   *lockGroupLeader;	/* lock group leader, if I'm a member */
 	dlist_head	lockGroupMembers;	/* list of members, if I'm a leader */
 	dlist_node	lockGroupLink;	/* my member link, if I'm a member */
+
+	/*
+	 * assignedCSN holds CSN for this transaction.  It is generated
+	 * under a ProcArray lock and later is writter to a CSNLog.  This
+	 * variable defined as atomic only for case of group commit, in all other
+	 * scenarios only backend responsible for this proc entry is working with
+	 * this variable.
+	 */
+	CSN_atomic assignedCSN;
+
+	/* Original xmin of this backend before csn snapshot was imported */
+	TransactionId originalXmin;
 };
 
 /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index b01fa52139..755a0743b6 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -94,4 +94,7 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
 extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 											TransactionId *catalog_xmin);
 
+extern void ProcArraySetCSNSnapshotXmin(TransactionId xmin);
+extern TransactionId ProcArrayGetCSNSnapshotXmin(void);
+
 #endif							/* PROCARRAY_H */
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index f66ac58188..c339eb7384 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -129,6 +129,8 @@ extern void AtSubCommit_Snapshot(int level);
 extern void AtSubAbort_Snapshot(int level);
 extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin);
 
+extern CSN ExportCSNSnapshot(void);
+extern void ImportCSNSnapshot(CSN csn);
 extern void ImportSnapshot(const char *idstr);
 extern bool XactHasExportedSnapshots(void);
 extern void DeleteAllExportedSnapshotFiles(void);
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 92575a207f..98cff1d872 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -217,7 +217,8 @@ typedef struct SnapshotData
 	 * CSN for snapshot isolation support.
 	 * Will be used only if enable_csn_snapshot is enabled.
 	 */
-	CSN	csn;
+	CSN			csn;
+	bool		imported_csn;
 
 	/*
 	 * The transaction completion count at the time GetSnapshotData() built
-- 
2.25.1

0003-CSN-Snapshot-Tests.patchtext/x-patch; charset=UTF-8; name=0003-CSN-Snapshot-Tests.patchDownload

From fa93fd78d70742d254d1972cc23d05006c19d4af Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Fri, 29 Jan 2021 10:37:43 +0500
Subject: [PATCH 3/4] CSN Snapshot Tests

---
 src/test/modules/Makefile                     |   1 +
 src/test/modules/csnsnapshot/Makefile         |  22 ++
 .../modules/csnsnapshot/csn_snapshot.conf     |   1 +
 src/test/modules/csnsnapshot/t/001_base.pl    | 119 ++++++++++
 src/test/modules/csnsnapshot/t/002_standby.pl |  66 ++++++
 .../modules/csnsnapshot/t/003_time_skew.pl    | 211 ++++++++++++++++++
 .../csnsnapshot/t/004_read_committed.pl       |  97 ++++++++
 7 files changed, 517 insertions(+)
 create mode 100644 src/test/modules/csnsnapshot/Makefile
 create mode 100644 src/test/modules/csnsnapshot/csn_snapshot.conf
 create mode 100644 src/test/modules/csnsnapshot/t/001_base.pl
 create mode 100644 src/test/modules/csnsnapshot/t/002_standby.pl
 create mode 100644 src/test/modules/csnsnapshot/t/003_time_skew.pl
 create mode 100644 src/test/modules/csnsnapshot/t/004_read_committed.pl

diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 93e7829c67..353c57eb51 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
 SUBDIRS = \
 		  brin \
 		  commit_ts \
+		  csnsnapshot \
 		  delay_execution \
 		  dummy_index_am \
 		  dummy_seclabel \
diff --git a/src/test/modules/csnsnapshot/Makefile b/src/test/modules/csnsnapshot/Makefile
new file mode 100644
index 0000000000..e7c1195d0f
--- /dev/null
+++ b/src/test/modules/csnsnapshot/Makefile
@@ -0,0 +1,22 @@
+# src/test/modules/csnsnapshot/Makefile
+
+REGRESS_OPTS = --temp-config=$(top_srcdir)/src/test/modules/csnsnapshot/csn_snapshot.conf
+NO_INSTALLCHECK = 1
+
+TAP_TESTS = 1
+EXTRA_INSTALL=contrib/postgres_fdw
+
+ifndef CSN_ALL
+PROVE_TESTS=t/001_base.pl t/002_standby.pl t/003_time_skew.pl
+endif
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/csnsnapshot
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/csnsnapshot/csn_snapshot.conf b/src/test/modules/csnsnapshot/csn_snapshot.conf
new file mode 100644
index 0000000000..e9d3c35756
--- /dev/null
+++ b/src/test/modules/csnsnapshot/csn_snapshot.conf
@@ -0,0 +1 @@
+track_commit_timestamp = on
diff --git a/src/test/modules/csnsnapshot/t/001_base.pl b/src/test/modules/csnsnapshot/t/001_base.pl
new file mode 100644
index 0000000000..a89b1beeea
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/001_base.pl
@@ -0,0 +1,119 @@
+# Single-node test: value can be set, and is still present after recovery
+
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 6;
+use PostgresNode;
+
+my $node = get_new_node('csntest');
+$node->init;
+$node->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 10
+					});
+
+my $result1;
+my $result2;
+$node->start;
+
+# Check CSN increased monotonically after restart
+$result1 = $node->safe_psql('postgres', "SHOW csn_time_shift");
+note("csn_time_shift: $result1");
+$result1 = $node->safe_psql('postgres', 'SELECT pg_csn_snapshot_export()');
+note("Snapshot CSN: $result1");
+$node->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = -100");
+$node->restart;
+$result2 = $node->safe_psql('postgres', 'SHOW csn_time_shift');
+note("csn_time_shift: $result2");
+$result2 = $node->safe_psql('postgres', 'SELECT pg_csn_snapshot_export()');
+note("Snapshot CSN after restart: $result2");
+is($result1 < $result2, 1, 'CSN monotonically increases');
+
+$node->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 0");
+$node->restart;
+
+# Create a table
+$node->safe_psql('postgres', 'create table t1(i int, j int)');
+
+# insert test record
+$node->safe_psql('postgres', 'insert into t1 values(1,1)');
+# export csn snapshot
+my $test_snapshot = $node->safe_psql('postgres', 'select pg_csn_snapshot_export()');
+# insert test record
+$node->safe_psql('postgres', 'insert into t1 values(2,1)');
+
+my $count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '2', 'Get right number in normal query');
+my $count2 = $node->safe_psql('postgres', "
+			begin transaction isolation level repeatable read;
+			select pg_csn_snapshot_import($test_snapshot);
+			select count(*) from t1;
+			commit;"
+			);
+
+is($count2, '
+1', 'Get right number in csn import query');
+
+#prepare transaction test
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(3,1);
+						insert into t1 values(3,2);
+						prepare	transaction 'pt3';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(4,1);
+						insert into t1 values(4,2);
+						prepare	transaction 'pt4';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(5,1);
+						insert into t1 values(5,2);
+						prepare	transaction 'pt5';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(6,1);
+						insert into t1 values(6,2);
+						prepare	transaction 'pt6';
+						");
+$node->safe_psql('postgres', "commit prepared 'pt4';");
+
+# restart with enable_csn_snapshot off
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = off");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(7,1);
+						insert into t1 values(7,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt3';");
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '8', 'Get right number in normal query');
+
+
+# restart with enable_csn_snapshot on
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = on");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(8,1);
+						insert into t1 values(8,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt5';");
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '12', 'Get right number in normal query');
+
+# restart with enable_csn_snapshot off
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = on");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(9,1);
+						insert into t1 values(9,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt6';");
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '16', 'Get right number in normal query');
diff --git a/src/test/modules/csnsnapshot/t/002_standby.pl b/src/test/modules/csnsnapshot/t/002_standby.pl
new file mode 100644
index 0000000000..b7c4ea93b2
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/002_standby.pl
@@ -0,0 +1,66 @@
+# Test simple scenario involving a standby
+
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 6;
+use PostgresNode;
+
+my $bkplabel = 'backup';
+my $master   = get_new_node('master');
+$master->init(allows_streaming => 1);
+
+$master->append_conf(
+	'postgresql.conf', qq{
+	enable_csn_snapshot = on
+	max_wal_senders = 5
+	});
+$master->start;
+$master->backup($bkplabel);
+
+my $standby = get_new_node('standby');
+$standby->init_from_backup($master, $bkplabel, has_streaming => 1);
+$standby->start;
+
+$master->safe_psql('postgres', "create table t1(i int, j int)");
+
+my $guc_on_master = $master->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_master, 'on', "GUC on master");
+
+my $guc_on_standby = $standby->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_standby, 'on', "GUC on standby");
+
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = off');
+$master->restart;
+
+$guc_on_master = $master->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_master, 'off', "GUC off master");
+
+$guc_on_standby = $standby->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_standby, 'on', "GUC on standby");
+
+# We consume a large number of transaction,for skip page
+for my $i (1 .. 4096) #4096
+{
+	$master->safe_psql('postgres', "insert into t1 values(1,$i)");
+}
+$master->safe_psql('postgres', "select pg_sleep(2)");
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = on');
+$master->restart;
+
+my $count_standby = $standby->safe_psql('postgres', 'select count(*) from t1');
+is($count_standby, '4096', "Ok for siwtch xid-base > csn-base"); #4096
+
+# We consume a large number of transaction,for skip page
+for my $i (1 .. 4096) #4096
+{
+	$master->safe_psql('postgres', "insert into t1 values(1,$i)");
+}
+$master->safe_psql('postgres', "select pg_sleep(2)");
+
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = off');
+$master->restart;
+
+$count_standby = $standby->safe_psql('postgres', 'select count(*) from t1');
+is($count_standby, '8192', "Ok for siwtch csn-base > xid-base"); #8192
\ No newline at end of file
diff --git a/src/test/modules/csnsnapshot/t/003_time_skew.pl b/src/test/modules/csnsnapshot/t/003_time_skew.pl
new file mode 100644
index 0000000000..b01ad1a7c0
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/003_time_skew.pl
@@ -0,0 +1,211 @@
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 13;
+use PostgresNode;
+
+my $node1 = get_new_node('csn1');
+$node1->init;
+$node1->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 10
+					csn_time_shift = 0
+					shared_preload_libraries = 'postgres_fdw'
+					postgres_fdw.use_csn_snapshots = true
+					});
+$node1->start;
+my $node2 = get_new_node('csn2');
+$node2->init;
+$node2->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 10
+					csn_time_shift = 0
+					shared_preload_libraries = 'postgres_fdw'
+					postgres_fdw.use_csn_snapshots = true
+					});
+$node2->start;
+
+$node1->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node2->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+	CREATE TABLE summary(value int, ntrans int);
+	INSERT INTO summary (value, ntrans) VALUES (0, 0);
+");
+$node2->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node1->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+	CREATE FOREIGN TABLE summary(value int, ntrans int) SERVER remote;
+");
+
+$node1->safe_psql('postgres', "
+	CREATE TABLE t (id int, payload int) PARTITION BY HASH(id);
+	CREATE TABLE t_0 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 0);
+	CREATE FOREIGN TABLE t_1 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 1) SERVER remote;
+");
+$node2->safe_psql('postgres', "
+	CREATE TABLE t (id serial, payload int) PARTITION BY HASH(id);
+	CREATE TABLE t_1 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 1);
+	CREATE FOREIGN TABLE t_0 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 0) SERVER remote;
+");
+
+$node1->safe_psql('postgres', "INSERT INTO t(id, payload) (SELECT gs.*, 1 FROM generate_series(1,100) AS gs)");
+$node2->safe_psql('postgres', "INSERT INTO t(id, payload) (SELECT gs.*, 2 FROM generate_series(101,200) AS gs)");
+my $count1 = $node1->safe_psql('postgres', "SELECT SUM(payload) FROM t");
+my $count2 = $node2->safe_psql('postgres', "SELECT SUM(payload) FROM t");
+is( (($count1 == 300) and ($count1 == $count2)), 1, 'Correct insert');
+
+# ##############################################################################
+#
+# Basic test. Check REPEATABLE READ anomaly.
+# ntrans is needed to control that some transactions were committed.
+#
+# ##############################################################################
+
+my $q1 = File::Temp->new();
+append_to_file($q1, q{
+	START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+	UPDATE summary SET value = value + (SELECT SUM(payload) FROM t);
+	UPDATE summary SET value = value - (SELECT SUM(payload) FROM t);
+	UPDATE summary SET ntrans = ntrans + 1;
+	COMMIT;
+});
+my $q2 = File::Temp->new();
+append_to_file($q2, q{
+	BEGIN;
+	\set pl random(-100, 100)
+	\set id random(1, 200)
+	UPDATE t SET payload = :pl WHERE id = :id;
+	COMMIT;
+});
+
+my $seconds = 5;
+my $pgb_handle1 = $node1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $q1, 'postgres' );
+my $pgb_handle2 = $node2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $q2, 'postgres' );
+$node1->pgbench_await($pgb_handle1);
+$node2->pgbench_await($pgb_handle2);
+
+$count1 = $node1->safe_psql('postgres', "SELECT SUM(value) FROM summary");
+$count2 = $node2->safe_psql('postgres', "SELECT SUM(value) FROM summary");
+my $ntrans = $node2->safe_psql('postgres', "SELECT SUM(ntrans) FROM summary");
+note("$count1, $count2, $ntrans");
+is( ( ($ntrans > 0) and ($count1 == 0) and ($count1 == $count2)), 1, 'Correct update');
+
+# ##############################################################################
+#
+# Test on 'snapshot too old'
+#
+# ##############################################################################
+$node1->safe_psql('postgres', "UPDATE summary SET ntrans = 0;");
+$node1->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 20");
+$node1->restart();
+
+# READ COMMITTED transactions ignores the time skew.
+$node2->psql('postgres', "UPDATE summary SET ntrans = 1");
+$ntrans = $node1->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("$ntrans");
+is( $ntrans, 1, 'Read committed behavior if snapshot turn sour');
+
+# But REPEATABLE READ transactions isn't
+my $err = '';
+$node1->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 40");
+$node1->restart();
+$node2->psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = 2; COMMIT;", stderr => \$err);
+$ntrans = $node1->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("$ntrans");
+is( (($ntrans == 1) and (index($err, 'csn snapshot too old') != -1)), 1, 'Read committed can\'t update if snapshot turn sour');
+
+# ##############################################################################
+#
+# Test on issue #1:
+# 'xact confirmed as committed, so any following xact must see its effects'.
+#
+# ##############################################################################
+$node1->safe_psql('postgres', "delete from t");
+$node1->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 45");
+$node2->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 40");
+$node1->restart();
+$node2->restart();
+
+my $st_sec; my $end_sec;
+my $time_diff;
+
+$node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; INSERT INTO t VALUES(1,1), (3,1); COMMIT;");
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT count(*) FROM t; COMMIT;");
+is( $ntrans, 2, 'Slow node can see mix node data change');
+$ntrans = $node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT count(*) FROM t; COMMIT;");
+is( $ntrans, 2, 'Fast node can see mix node data change');
+
+$node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; INSERT INTO t VALUES(1,1); COMMIT;");
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT count(*) FROM t; COMMIT;");
+is( $ntrans, 3, 'CURRENTLY FAILED:Data change to fast node on slow node, and slow node can see data change');
+
+# READ COMMITED mode ignores the time skew.
+$node1->safe_psql('postgres', "UPDATE summary SET ntrans = 1");
+$ntrans = $node2->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("ntrans: $ntrans\n");
+is( $ntrans, 1, 'See committed values in the READ COMMITTED mode');
+
+# Access from the future
+$node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = ntrans + 1; COMMIT;");
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT ntrans FROM summary; COMMIT;");
+note("ntrans: $ntrans\n");
+is( $ntrans, 1, 'Do not see values, committed in the future at the REPEATABLE READ mode');
+
+# But...
+$node1->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 50");
+$node2->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 55");
+$node1->restart();
+$node2->restart();
+
+# Check READ COMMITED mode
+$node2->safe_psql('postgres', "UPDATE summary SET ntrans = 2");
+$ntrans = $node1->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("ntrans: $ntrans\n");
+is( $ntrans, 2, 'See committed values in the READ COMMITTED mode, step 2');
+
+# Node from the future will wait for a time before UPDATE table.
+($st_sec) = localtime();
+$node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = 3; COMMIT;");
+($end_sec) = localtime(); $time_diff = $end_sec - $st_sec;
+$ntrans = $node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT ntrans FROM summary; COMMIT;");
+note("ntrans: $ntrans, Test time: $time_diff seconds");
+is( ($ntrans == 3), 1, 'The test execution time correlates with the time offset.');
+
+# Node from the future will wait for a time before SELECT from a table.
+$node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = 4; COMMIT;");
+($st_sec) = localtime();
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT ntrans FROM summary; COMMIT;");
+($end_sec) = localtime(); $time_diff = $end_sec - $st_sec;
+note("ntrans: $ntrans, Test time: $time_diff seconds ($end_sec, $st_sec)");
+is( ($ntrans == 4), 1, 'See values, committed in the past. The test execution time correlates with the time offset.');
+
+$node1->safe_psql('postgres', "UPDATE summary SET ntrans = 0, value = 0");
+$q1 = File::Temp->new();
+append_to_file($q1, q{
+	UPDATE summary SET value = value + 1, ntrans = ntrans + 1;
+});
+$q2 = File::Temp->new();
+append_to_file($q2, q{
+	START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+	UPDATE summary SET value = value + (SELECT SUM(ntrans) FROM summary);
+	UPDATE summary SET value = value - (SELECT SUM(ntrans) FROM summary);
+	COMMIT;
+});
+$seconds = 3;
+$pgb_handle1 = $node1->pgbench_async(-n, -c => 1, -T => $seconds, -f => $q1, 'postgres' );
+$pgb_handle2 = $node2->pgbench_async(-n, -c => 1, -T => $seconds, -f => $q2, 'postgres' );
+$node1->pgbench_await($pgb_handle1);
+$node2->pgbench_await($pgb_handle2);
+
+$count1 = $node1->safe_psql('postgres', "SELECT SUM(value) FROM summary");
+$count2 = $node1->safe_psql('postgres', "SELECT SUM(ntrans) FROM summary");
+note("$count1, $count2");
+is( ( ($count1 > 0) and ($count1 == $count2)), 1, 'Skew test');
+
+$node1->stop();
+$node2->stop();
diff --git a/src/test/modules/csnsnapshot/t/004_read_committed.pl b/src/test/modules/csnsnapshot/t/004_read_committed.pl
new file mode 100644
index 0000000000..0918212bd8
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/004_read_committed.pl
@@ -0,0 +1,97 @@
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 2;
+use PostgresNode;
+
+my $node1 = get_new_node('csn1');
+$node1->init;
+$node1->append_conf('postgresql.conf', qq{
+					max_prepared_transactions = 20
+					shared_preload_libraries = 'postgres_fdw'
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					postgres_fdw.use_csn_snapshots = true
+					csn_time_shift = 0
+					});
+$node1->start;
+my $node2 = get_new_node('csn2');
+$node2->init;
+$node2->append_conf('postgresql.conf', qq{
+					max_prepared_transactions = 20
+					shared_preload_libraries = 'postgres_fdw'
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					postgres_fdw.use_csn_snapshots = true
+					csn_time_shift = 0
+					});
+$node2->start;
+
+# Create foreign servers
+$node1->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node2->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+");
+$node2->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node1->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+");
+
+# Create sharded table
+$node1->safe_psql('postgres', "
+	CREATE TABLE dept1(name TEXT);
+	CREATE FOREIGN TABLE dept2 (name TEXT) SERVER remote;
+");
+$node2->safe_psql('postgres', "
+	CREATE TABLE dept2(name TEXT);
+	CREATE FOREIGN TABLE dept1 (name TEXT) SERVER remote;
+	CREATE TABLE results(success_tx int);
+	INSERT INTO results (success_tx) VALUES (0);
+");
+
+# Fill the table
+$node1->safe_psql('postgres', "INSERT INTO dept1 (name) VALUES ('Jonathan')");
+$node1->safe_psql('postgres', "INSERT INTO dept2 (name) VALUES ('Hoshi')");
+$node2->safe_psql('postgres', "INSERT INTO dept1 (name) VALUES ('Leonard')");
+my $count1 = $node1->safe_psql('postgres', "SELECT count(*) FROM ((SELECT * FROM dept1) UNION (SELECT * FROM dept2)) AS a");
+my $count2 = $node2->safe_psql('postgres', "SELECT count(*) FROM ((SELECT * FROM dept1) UNION (SELECT * FROM dept2)) AS a");
+note("$count1, $count2");
+is( (($count1 == 3) and ($count1 == $count2)), 1, 'Correct insert');
+
+# Queries
+my $q1 = File::Temp->new();
+append_to_file($q1, q{
+	BEGIN;
+	SELECT count(*) AS cnt FROM dept1; \gset
+	\if :cnt > 0
+		INSERT INTO dept2 (SELECT * FROM dept1);
+		DELETE FROM dept1;
+	\else
+		INSERT INTO dept1 (SELECT * FROM dept2);
+		DELETE FROM dept2;
+	\endif
+
+	COMMIT;
+});
+my $q2 = File::Temp->new();
+append_to_file($q2, q{
+	SELECT count(*) AS cnt FROM ((SELECT * FROM dept1) UNION (SELECT * FROM dept2)) AS a; \gset
+	\if :cnt = 3
+		UPDATE results SET success_tx = success_tx + 1;
+	\endif
+});
+my $transactions = 1000;
+my $pgb_handle1 = $node1->pgbench_async(-n, -c => 1, -t => $transactions, -f => $q1, 'postgres' );
+my $pgb_handle2 = $node2->pgbench_async(-n, -c => 20, -t => $transactions, -f => $q2, 'postgres' );
+$node1->pgbench_await($pgb_handle1);
+$node2->pgbench_await($pgb_handle2);
+
+$count2 = $node2->safe_psql('postgres', "SELECT success_tx FROM results");
+note("$count2");
+is( $count2, 20*$transactions, 'Correct READ COMMITTED updates');
+
+$node1->stop();
+$node2->stop();
-- 
2.25.1

0004-Clock-SI-implementation.patchtext/x-patch; charset=UTF-8; name=0004-Clock-SI-implementation.patchDownload

From 52425146d9484b1b19197a5b362ce9dfd2d69b23 Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Mon, 22 Mar 2021 16:37:39 +0500
Subject: [PATCH 4/4] Clock SI implementation

---
 contrib/postgres_fdw/Makefile                 |   9 +
 contrib/postgres_fdw/connection.c             | 323 ++++++++++++++++--
 contrib/postgres_fdw/postgres_fdw.c           |  12 +
 contrib/postgres_fdw/postgres_fdw.h           |   2 +
 .../postgres_fdw/t/001_bank_coordinator.pl    | 264 ++++++++++++++
 .../postgres_fdw/t/002_bank_participant.pl    | 240 +++++++++++++
 src/backend/access/transam/xact.c             |   7 +
 src/include/catalog/pg_proc.dat               |   3 +
 src/test/perl/PostgresNode.pm                 |  35 ++
 9 files changed, 871 insertions(+), 24 deletions(-)
 create mode 100644 contrib/postgres_fdw/t/001_bank_coordinator.pl
 create mode 100644 contrib/postgres_fdw/t/002_bank_participant.pl

diff --git a/contrib/postgres_fdw/Makefile b/contrib/postgres_fdw/Makefile
index c1b0cad453..539b7c4c55 100644
--- a/contrib/postgres_fdw/Makefile
+++ b/contrib/postgres_fdw/Makefile
@@ -29,3 +29,12 @@ top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/contrib/contrib-global.mk
 endif
+
+# Global makefile will do temp-install for 'check'. Since REGRESS is defined,
+# PGXS (included from contrib-global.mk or directly) will care to add
+# postgres_fdw to it as EXTRA_INSTALL and build pg_regress. It will also
+# actually run pg_regress, so the only thing left is tap tests.
+check: tapcheck
+
+tapcheck: temp-install
+	$(prove_check)
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index ee0b4acf0b..0fadef60ac 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -12,8 +12,10 @@
  */
 #include "postgres.h"
 
+#include "access/csn_snapshot.h"
 #include "access/htup_details.h"
 #include "access/xact.h"
+#include "access/xlog.h" /* GetSystemIdentifier() */
 #include "catalog/pg_user_mapping.h"
 #include "commands/defrem.h"
 #include "funcapi.h"
@@ -28,6 +30,8 @@
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
+#include "utils/snapmgr.h"
+#include "utils/snapshot.h"
 #include "utils/syscache.h"
 
 /*
@@ -69,6 +73,21 @@ typedef struct ConnCacheEntry
  */
 static HTAB *ConnectionHash = NULL;
 
+/*
+ * FdwTransactionState
+ *
+ * Holds number of open remote transactions and shared state
+ * needed for all connection entries.
+ */
+typedef struct FdwTransactionState
+{
+	char		*gid;
+	int			nparticipants;
+	CSN			csn;
+	bool		two_phase_commit;
+} FdwTransactionState;
+static FdwTransactionState *fdwTransState;
+
 /* for assigning cursor numbers and prepared statement numbers */
 static unsigned int cursor_number = 0;
 static unsigned int prep_stmt_number = 0;
@@ -76,6 +95,9 @@ static unsigned int prep_stmt_number = 0;
 /* tracks whether any work is needed in callback functions */
 static bool xact_got_connection = false;
 
+/* counter of prepared tx made by this backend */
+static int two_phase_xact_count = 0;
+
 /*
  * SQL functions
  */
@@ -92,6 +114,7 @@ static void configure_remote_session(PGconn *conn);
 static void do_sql_command(PGconn *conn, const char *sql);
 static void begin_remote_xact(ConnCacheEntry *entry);
 static void pgfdw_xact_callback(XactEvent event, void *arg);
+static void deallocate_prepared_stmts(ConnCacheEntry *entry);
 static void pgfdw_subxact_callback(SubXactEvent event,
 								   SubTransactionId mySubid,
 								   SubTransactionId parentSubid,
@@ -148,6 +171,15 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 									  pgfdw_inval_callback, (Datum) 0);
 	}
 
+	/* allocate FdwTransactionState */
+	if (fdwTransState == NULL)
+	{
+		MemoryContext oldcxt;
+		oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+		fdwTransState = palloc0(sizeof(FdwTransactionState));
+		MemoryContextSwitchTo(oldcxt);
+	}
+
 	/* Set flag that we did GetConnection during the current transaction */
 	xact_got_connection = true;
 
@@ -531,7 +563,8 @@ configure_remote_session(PGconn *conn)
 }
 
 /*
- * Convenience subroutine to issue a non-data-returning SQL command to remote
+ * Convenience subroutine to issue a non-data-returning SQL command or
+ * statement to remote node.
  */
 static void
 do_sql_command(PGconn *conn, const char *sql)
@@ -541,7 +574,8 @@ do_sql_command(PGconn *conn, const char *sql)
 	if (!PQsendQuery(conn, sql))
 		pgfdw_report_error(ERROR, NULL, conn, false, sql);
 	res = pgfdw_get_result(conn, sql);
-	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	if (PQresultStatus(res) != PGRES_COMMAND_OK &&
+			PQresultStatus(res) != PGRES_TUPLES_OK)
 		pgfdw_report_error(ERROR, res, conn, true, sql);
 	PQclear(res);
 }
@@ -569,6 +603,10 @@ begin_remote_xact(ConnCacheEntry *entry)
 		elog(DEBUG3, "starting remote transaction on connection %p",
 			 entry->conn);
 
+		if (UseCSNSnapshots && (/*!IsolationUsesXactSnapshot() ||*/
+								   IsolationIsSerializable()))
+			elog(ERROR, "Global snapshots support only REPEATABLE READ");
+
 		if (IsolationIsSerializable())
 			sql = "START TRANSACTION ISOLATION LEVEL SERIALIZABLE";
 		else
@@ -577,6 +615,23 @@ begin_remote_xact(ConnCacheEntry *entry)
 		do_sql_command(entry->conn, sql);
 		entry->xact_depth = 1;
 		entry->changing_xact_state = false;
+
+		if (UseCSNSnapshots && IsolationUsesXactSnapshot())
+		{
+			char import_sql[128];
+
+			/* Export our snapshot */
+			if (fdwTransState->csn == 0)
+				fdwTransState->csn = ExportCSNSnapshot();
+
+			snprintf(import_sql, sizeof(import_sql),
+				"SELECT pg_csn_snapshot_import("UINT64_FORMAT")",
+				fdwTransState->csn);
+
+			do_sql_command(entry->conn, import_sql);
+		}
+
+		fdwTransState->nparticipants += 1;
 	}
 
 	/*
@@ -784,6 +839,98 @@ pgfdw_report_error(int elevel, PGresult *res, PGconn *conn,
 	PG_END_TRY();
 }
 
+/* Callback typedef for BroadcastStmt */
+typedef bool (*BroadcastCmdResHandler) (PGresult *result, void *arg);
+
+/*
+ * Broadcast sql in parallel to all ConnectionHash entries
+ */
+static bool
+BroadcastStmt(char const * sql, unsigned expectedStatus,
+				BroadcastCmdResHandler handler, void *arg)
+{
+	HASH_SEQ_STATUS scan;
+	ConnCacheEntry *entry;
+	bool		allOk = true;
+
+	/* Broadcast sql */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		pgfdw_reject_incomplete_xact_state_change(entry);
+
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			if (!PQsendQuery(entry->conn, sql))
+			{
+				PGresult   *res = PQgetResult(entry->conn);
+
+				elog(WARNING, "Failed to send command %s", sql);
+				pgfdw_report_error(WARNING, res, entry->conn, true, sql);
+				PQclear(res);
+			}
+		}
+	}
+
+	/* Collect responses */
+	hash_seq_init(&scan, ConnectionHash);
+	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
+	{
+		if (entry->xact_depth > 0 && entry->conn != NULL)
+		{
+			PGresult *result = PQgetResult(entry->conn);
+
+			if (PQresultStatus(result) != expectedStatus ||
+				(handler && !handler(result, arg)))
+			{
+				elog(WARNING,
+					 "Failed command %s: status=%d, expected status=%d",
+					 sql, PQresultStatus(result), expectedStatus);
+				pgfdw_report_error(ERROR, result, entry->conn, true, sql);
+				allOk = false;
+			}
+			PQclear(result);
+			PQgetResult(entry->conn);	/* consume NULL result */
+		}
+	}
+
+	return allOk;
+}
+
+/* Wrapper for broadcasting commands */
+static bool
+BroadcastCmd(char const *sql)
+{
+	return BroadcastStmt(sql, PGRES_COMMAND_OK, NULL, NULL);
+}
+
+/* Wrapper for broadcasting statements */
+static bool
+BroadcastFunc(char const *sql)
+{
+	return BroadcastStmt(sql, PGRES_TUPLES_OK, NULL, NULL);
+}
+
+/* Callback for selecting maximal csn */
+static bool
+MaxCsnCB(PGresult *result, void *arg)
+{
+	char		   *resp;
+	CSN	   *max_csn = (CSN *) arg;
+	CSN		csn = 0;
+
+	resp = PQgetvalue(result, 0, 0);
+
+	if (resp == NULL || (*resp) == '\0' ||
+			sscanf(resp, UINT64_FORMAT, &csn) != 1)
+		return false;
+
+	if (*max_csn < csn)
+		*max_csn = csn;
+
+	return true;
+}
+
 /*
  * pgfdw_xact_callback --- cleanup at main-transaction end.
  *
@@ -801,6 +948,104 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	if (!xact_got_connection)
 		return;
 
+	/* Handle possible two-phase commit */
+	if (event == XACT_EVENT_PARALLEL_PRE_COMMIT || event == XACT_EVENT_PRE_COMMIT)
+	{
+		bool include_local_tx = false;
+
+		/* Should we take into account this node? */
+		if (TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+		{
+			include_local_tx = true;
+			fdwTransState->nparticipants += 1;
+		}
+
+		/* Switch to 2PC mode there were more than one participant */
+		if (UseCSNSnapshots && fdwTransState->nparticipants > 1)
+			fdwTransState->two_phase_commit = true;
+
+		/*
+		 * Data change to fast node on slow node, and slow node can see data change.
+		 * In order to implement above we should change code here.
+		 */
+		if (fdwTransState->two_phase_commit)
+		{
+			CSN max_csn = InProgressCSN;
+			CSN my_csn = InProgressCSN;
+			bool res;
+			char *sql;
+
+			fdwTransState->gid = psprintf("pgfdw:%lld:%llu:%d:%u:%d:%d",
+										  (long long) GetCurrentTimestamp(),
+										  (long long) GetSystemIdentifier(),
+										  MyProcPid,
+										  GetCurrentTransactionIdIfAny(),
+										  ++two_phase_xact_count,
+										  fdwTransState->nparticipants);
+
+			/* Broadcast PREPARE */
+			sql = psprintf("PREPARE TRANSACTION '%s'", fdwTransState->gid);
+			res = BroadcastCmd(sql);
+
+			if (IsolationUsesXactSnapshot())
+			{
+				if (!res)
+					goto error;
+
+				/* Broadcast pg_csn_snapshot_prepare() */
+				if (include_local_tx)
+					my_csn = CSNSnapshotPrepareCurrent();
+
+				sql = psprintf("SELECT pg_csn_snapshot_prepare('%s')",
+															fdwTransState->gid);
+				res = BroadcastStmt(sql, PGRES_TUPLES_OK, MaxCsnCB, &max_csn);
+				if (!res)
+					goto error;
+
+				/* select maximal global csn */
+				if (include_local_tx && my_csn > max_csn)
+					max_csn = my_csn;
+
+				/*
+				 * We should always notice local node to update the csn, so local can
+				 * see the change next transaction.
+				 */
+				if (include_local_tx)
+					CSNSnapshotAssignCurrent(max_csn);
+				else
+					/*
+					 * Read-only transactions haven't assigned xid csn. We only
+					 * increase the last csn value.
+					 */
+					GenerateCSN(max_csn);
+
+				sql = psprintf("SELECT pg_csn_snapshot_assign('%s',"UINT64_FORMAT")",
+								fdwTransState->gid, max_csn);
+				res = BroadcastFunc(sql);
+			}
+
+error:
+			if (!res)
+			{
+				sql = psprintf("ABORT PREPARED '%s'", fdwTransState->gid);
+				BroadcastCmd(sql);
+				elog(ERROR, "Failed to PREPARE transaction on remote node");
+			}
+
+			/*
+			 * Do not fall down. Consequent COMMIT event will clean thing up.
+			 */
+			return;
+		}
+	}
+
+	/* COMMIT open transaction of we were doing 2PC */
+	if (fdwTransState->two_phase_commit &&
+		(event == XACT_EVENT_PARALLEL_COMMIT || event == XACT_EVENT_COMMIT))
+	{
+		BroadcastCmd(psprintf("COMMIT PREPARED '%s'", fdwTransState->gid));
+	}
+
 	/*
 	 * Scan all connection cache entries to find open remote transactions, and
 	 * close them.
@@ -808,8 +1053,6 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 	hash_seq_init(&scan, ConnectionHash);
 	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
 	{
-		PGresult   *res;
-
 		/* Ignore cache entry if no open connection right now */
 		if (entry->conn == NULL)
 			continue;
@@ -826,6 +1069,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 			{
 				case XACT_EVENT_PARALLEL_PRE_COMMIT:
 				case XACT_EVENT_PRE_COMMIT:
+					Assert(!fdwTransState->two_phase_commit);
 
 					/*
 					 * If abort cleanup previously failed for this connection,
@@ -838,28 +1082,22 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					do_sql_command(entry->conn, "COMMIT TRANSACTION");
 					entry->changing_xact_state = false;
 
-					/*
-					 * If there were any errors in subtransactions, and we
-					 * made prepared statements, do a DEALLOCATE ALL to make
-					 * sure we get rid of all prepared statements. This is
-					 * annoying and not terribly bulletproof, but it's
-					 * probably not worth trying harder.
-					 *
-					 * DEALLOCATE ALL only exists in 8.3 and later, so this
-					 * constrains how old a server postgres_fdw can
-					 * communicate with.  We intentionally ignore errors in
-					 * the DEALLOCATE, so that we can hobble along to some
-					 * extent with older servers (leaking prepared statements
-					 * as we go; but we don't really support update operations
-					 * pre-8.3 anyway).
-					 */
-					if (entry->have_prep_stmt && entry->have_error)
+					if (UseCSNSnapshots)
 					{
-						res = PQexec(entry->conn, "DEALLOCATE ALL");
-						PQclear(res);
+						CSN			csn = InvalidCSN;
+						PGresult	*res;
+
+						res = pgfdw_exec_query(entry->conn, "SELECT pg_current_csn()");
+						if (PQresultStatus(res) == PGRES_TUPLES_OK)
+						{
+							sscanf(PQgetvalue(res, 0, 0), "%lu", &csn);
+
+							if (csn != InvalidCSN)
+								GenerateCSN(csn);
+						}
 					}
-					entry->have_prep_stmt = false;
-					entry->have_error = false;
+
+					deallocate_prepared_stmts(entry);
 					break;
 				case XACT_EVENT_PRE_PREPARE:
 
@@ -878,6 +1116,11 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 					break;
 				case XACT_EVENT_PARALLEL_COMMIT:
 				case XACT_EVENT_COMMIT:
+					if (fdwTransState->two_phase_commit)
+						deallocate_prepared_stmts(entry);
+					else /* Pre-commit should have closed the open transaction */
+						elog(ERROR, "missed cleaning up connection during pre-commit");
+					break;
 				case XACT_EVENT_PREPARE:
 					/* Pre-commit should have closed the open transaction */
 					elog(ERROR, "missed cleaning up connection during pre-commit");
@@ -975,6 +1218,38 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 
 	/* Also reset cursor numbering for next transaction */
 	cursor_number = 0;
+
+	/* Reset fdwTransState */
+	memset(fdwTransState, '\0', sizeof(FdwTransactionState));
+}
+
+/*
+ * If there were any errors in subtransactions, and we
+ * made prepared statements, do a DEALLOCATE ALL to make
+ * sure we get rid of all prepared statements. This is
+ * annoying and not terribly bulletproof, but it's
+ * probably not worth trying harder.
+ *
+ * DEALLOCATE ALL only exists in 8.3 and later, so this
+ * constrains how old a server postgres_fdw can
+ * communicate with.  We intentionally ignore errors in
+ * the DEALLOCATE, so that we can hobble along to some
+ * extent with older servers (leaking prepared statements
+ * as we go; but we don't really support update operations
+ * pre-8.3 anyway).
+ */
+static void
+deallocate_prepared_stmts(ConnCacheEntry *entry)
+{
+	PGresult   *res;
+
+	if (entry->have_prep_stmt && entry->have_error)
+	{
+		res = PQexec(entry->conn, "DEALLOCATE ALL");
+		PQclear(res);
+	}
+	entry->have_prep_stmt = false;
+	entry->have_error = false;
 }
 
 /*
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 35b48575c5..f59f78a770 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -312,6 +312,9 @@ typedef struct
 	List	   *already_used;	/* expressions already dealt with */
 } ec_member_foreign_arg;
 
+bool		UseCSNSnapshots;
+void		_PG_init(void);
+
 /*
  * SQL functions
  */
@@ -6813,3 +6816,12 @@ get_batch_size_option(Relation rel)
 
 	return batch_size;
 }
+
+void
+_PG_init(void)
+{
+	DefineCustomBoolVariable("postgres_fdw.use_csn_snapshots",
+							 "Use global snapshots for FDW transactions", NULL,
+							 &UseCSNSnapshots, false, PGC_USERSET, 0, NULL,
+							 NULL, NULL);
+}
\ No newline at end of file
diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h
index 1f67b4d9fd..87fb9bf8c3 100644
--- a/contrib/postgres_fdw/postgres_fdw.h
+++ b/contrib/postgres_fdw/postgres_fdw.h
@@ -211,4 +211,6 @@ extern const char *get_jointype_name(JoinType jointype);
 extern bool is_builtin(Oid objectId);
 extern bool is_shippable(Oid objectId, Oid classId, PgFdwRelationInfo *fpinfo);
 
+extern bool UseCSNSnapshots;
+
 #endif							/* POSTGRES_FDW_H */
diff --git a/contrib/postgres_fdw/t/001_bank_coordinator.pl b/contrib/postgres_fdw/t/001_bank_coordinator.pl
new file mode 100644
index 0000000000..a32637f581
--- /dev/null
+++ b/contrib/postgres_fdw/t/001_bank_coordinator.pl
@@ -0,0 +1,264 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+my $master = get_new_node("master");
+$master->init;
+$master->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	log_checkpoints = true
+	postgres_fdw.use_csn_snapshots = on
+	enable_csn_snapshot = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$master->start;
+
+my $shard1 = get_new_node("shard1");
+$shard1->init;
+$shard1->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	csn_snapshot_defer_time = 15
+	enable_csn_snapshot = on
+));
+$shard1->start;
+
+my $shard2 = get_new_node("shard2");
+$shard2->init;
+$shard2->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	csn_snapshot_defer_time = 15
+	enable_csn_snapshot = on
+));
+$shard2->start;
+
+###############################################################################
+# Prepare nodes
+###############################################################################
+
+$master->safe_psql('postgres', qq[
+	CREATE EXTENSION postgres_fdw;
+	CREATE TABLE accounts(id integer primary key, amount integer);
+	CREATE TABLE global_transactions(tx_time timestamp);
+]);
+
+foreach my $node ($shard1, $shard2)
+{
+	my $port = $node->port;
+	my $host = $node->host;
+
+	$node->safe_psql('postgres',
+			"CREATE TABLE accounts(id integer primary key, amount integer)");
+
+	$master->safe_psql('postgres', qq[
+		CREATE SERVER shard_$port FOREIGN DATA WRAPPER postgres_fdw options(dbname 'postgres', host '$host', port '$port');
+		CREATE FOREIGN TABLE accounts_fdw_$port() inherits (accounts) server shard_$port options(table_name 'accounts');
+		CREATE USER MAPPING for CURRENT_USER SERVER shard_$port;
+	])
+}
+
+$shard1->safe_psql('postgres', qq[
+	insert into accounts select 2*id-1, 0 from generate_series(1, 10010) as id;
+	CREATE TABLE local_transactions(tx_time timestamp);
+]);
+
+$shard2->safe_psql('postgres', qq[
+	insert into accounts select 2*id, 0 from generate_series(1, 10010) as id;
+	CREATE TABLE local_transactions(tx_time timestamp);
+]);
+
+diag("master: @{[$master->connstr('postgres')]}");
+diag("shard1: @{[$shard1->connstr('postgres')]}");
+diag("shard2: @{[$shard2->connstr('postgres')]}");
+
+###############################################################################
+# pgbench scripts
+###############################################################################
+
+my $bank = File::Temp->new();
+append_to_file($bank, q{
+	\set id random(1, 20000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = :id RETURNING *)
+		INSERT into global_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (:id + 1);
+	COMMIT;
+});
+
+my $bank1 = File::Temp->new();
+append_to_file($bank1, q{
+	\set id random(1, 10000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = (2*:id + 1) RETURNING *)
+		INSERT into local_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (2*:id + 3);
+	COMMIT;
+});
+
+my $bank2 = File::Temp->new();
+append_to_file($bank2, q{
+	\set id random(1, 10000)
+
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = 2*:id RETURNING *)
+		INSERT into local_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (2*:id + 2);
+	COMMIT;
+});
+
+###############################################################################
+# Helpers
+###############################################################################
+
+sub count_and_delete_rows
+{
+	my ($node, $table) = @_;
+	my $count;
+
+	$count = $node->safe_psql('postgres',"select count(*) from $table");
+	$node->safe_psql('postgres',"delete from $table");
+	diag($node->name, ": completed $count transactions");
+	return $count;
+}
+
+###############################################################################
+# Concurrent global transactions
+###############################################################################
+
+my ($err, $rc);
+my $started;
+my $seconds = 30;
+my $selects;
+my $total = '0';
+my $oldtotal = '0';
+my $isolation_errors = 0;
+
+
+my $pgb_handle;
+
+$pgb_handle = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+while (time() - $started < $seconds)
+{
+	$total = $master->safe_psql('postgres', "select sum(amount) from accounts");
+	if ( ($total ne $oldtotal) and ($total ne '') )
+	{
+		$isolation_errors++;
+		$oldtotal = $total;
+		diag("Isolation error. Total = $total");
+	}
+	if ($total ne '') { $selects++; }
+}
+
+$master->pgbench_await($pgb_handle);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction');
+
+###############################################################################
+# Concurrent global and local transactions
+###############################################################################
+
+my ($pgb_handle1, $pgb_handle2, $pgb_handle3);
+
+# global txses
+$pgb_handle1 = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+# concurrent local
+$pgb_handle2 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank1, 'postgres' );
+$pgb_handle3 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank2, 'postgres' );
+
+$started = time();
+$selects = 0;
+$oldtotal = 0;
+while (time() - $started < $seconds)
+{
+	$total = $master->safe_psql('postgres', "select sum(amount) from accounts");
+	if ( ($total ne $oldtotal) and ($total ne '') )
+	{
+		$isolation_errors++;
+		$oldtotal = $total;
+		diag("Isolation error. Total = $total");
+	}
+	if ($total ne '') { $selects++; }
+}
+
+diag("selects = $selects");
+$master->pgbench_await($pgb_handle1);
+$shard1->pgbench_await($pgb_handle2);
+$shard2->pgbench_await($pgb_handle3);
+
+diag("completed $selects selects");
+die "" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard1, 'local_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'local_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global and local transactions');
+
+
+###############################################################################
+# Snapshot stability
+###############################################################################
+
+my ($hashes, $hash1, $hash2);
+my $stability_errors = 0;
+
+# global txses
+$pgb_handle1 = $master->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+# concurrent local
+$pgb_handle2 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank1, 'postgres' );
+$pgb_handle3 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank2, 'postgres' );
+
+$selects = 0;
+$started = time();
+while (time() - $started < $seconds)
+{
+	foreach my $node ($master, $shard1, $shard2)
+	{
+		($hash1, $_, $hash2) = split "\n", $node->safe_psql('postgres', qq[
+			begin isolation level repeatable read;
+			select md5(array_agg((t.*)::text)::text) from (select * from accounts order by id) as t;
+			select pg_sleep(3);
+			select md5(array_agg((t.*)::text)::text) from (select * from accounts order by id) as t;
+			commit;
+		]);
+
+		if ($hash1 ne $hash2)
+		{
+			diag("oops");
+			$stability_errors++;
+		}
+		elsif ($hash1 eq '' or $hash2 eq '')
+		{
+			die;
+		}
+		else
+		{
+			$selects++;
+		}
+	}
+}
+
+$master->pgbench_await($pgb_handle1);
+$shard1->pgbench_await($pgb_handle2);
+$shard2->pgbench_await($pgb_handle3);
+
+die "" unless ( $selects > 0 &&
+	count_and_delete_rows($master, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard1, 'local_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'local_transactions') > 0);
+
+is($stability_errors, 0, 'snapshot is stable during concurrent global and local transactions');
+
+$master->stop;
+$shard1->stop;
+$shard2->stop;
diff --git a/contrib/postgres_fdw/t/002_bank_participant.pl b/contrib/postgres_fdw/t/002_bank_participant.pl
new file mode 100644
index 0000000000..e5a5a721c6
--- /dev/null
+++ b/contrib/postgres_fdw/t/002_bank_participant.pl
@@ -0,0 +1,240 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+my $shard1 = get_new_node("shard1");
+$shard1->init;
+$shard1->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	postgres_fdw.use_csn_snapshots = on
+	csn_snapshot_defer_time = 15
+	enable_csn_snapshot = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$shard1->start;
+
+my $shard2 = get_new_node("shard2");
+$shard2->init;
+$shard2->append_conf('postgresql.conf', qq(
+	max_prepared_transactions = 30
+	postgres_fdw.use_csn_snapshots = on
+	csn_snapshot_defer_time = 15
+	enable_csn_snapshot = on
+	default_transaction_isolation = 'REPEATABLE READ'
+));
+$shard2->start;
+
+###############################################################################
+# Prepare nodes
+###############################################################################
+
+my @shards = ($shard1, $shard2);
+
+foreach my $node (@shards)
+{
+	$node->safe_psql('postgres', qq[
+		CREATE EXTENSION postgres_fdw;
+		CREATE TABLE accounts(id integer primary key, amount integer);
+		CREATE TABLE accounts_local() inherits(accounts);
+		CREATE TABLE global_transactions(tx_time timestamp);
+		CREATE TABLE local_transactions(tx_time timestamp);
+	]);
+
+	foreach my $neighbor (@shards)
+	{
+		next if ($neighbor eq $node);
+
+		my $port = $neighbor->port;
+		my $host = $neighbor->host;
+
+		$node->safe_psql('postgres', qq[
+			CREATE SERVER shard_$port FOREIGN DATA WRAPPER postgres_fdw
+					options(dbname 'postgres', host '$host', port '$port');
+			CREATE FOREIGN TABLE accounts_fdw_$port() inherits (accounts)
+					server shard_$port options(table_name 'accounts_local');
+			CREATE USER MAPPING for CURRENT_USER SERVER shard_$port;
+		]);
+	}
+}
+
+$shard1->psql('postgres', "insert into accounts_local select 2*id-1, 0 from generate_series(1, 10010) as id;");
+$shard2->psql('postgres', "insert into accounts_local select 2*id,   0 from generate_series(1, 10010) as id;");
+
+###############################################################################
+# pgbench scripts
+###############################################################################
+
+my $bank = File::Temp->new();
+append_to_file($bank, q{
+	\set id random(1, 20000)
+	BEGIN;
+	WITH upd AS (UPDATE accounts SET amount = amount - 1 WHERE id = :id RETURNING *)
+		INSERT into global_transactions SELECT now() FROM upd;
+	UPDATE accounts SET amount = amount + 1 WHERE id = (:id + 1);
+	COMMIT;
+});
+
+###############################################################################
+# Helpers
+###############################################################################
+
+sub count_and_delete_rows
+{
+	my ($node, $table) = @_;
+	my $count;
+
+	$count = $node->safe_psql('postgres',"select count(*) from $table");
+	$node->safe_psql('postgres',"delete from $table");
+	diag($node->name, ": completed $count transactions");
+	return $count;
+}
+
+###############################################################################
+# Concurrent global transactions
+###############################################################################
+
+my ($err, $rc);
+my $started;
+my $seconds = 30;
+my $selects;
+my $total = '0';
+my $oldtotal = '0';
+my $isolation_errors = 0;
+my $i;
+
+
+my ($pgb_handle1, $pgb_handle2);
+
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction');
+
+###############################################################################
+# And do the same after soft restart
+###############################################################################
+
+$shard1->restart;
+$shard2->restart;
+$shard1->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard1 to became online";
+$shard2->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard2 to became online";
+
+$seconds = 15;
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction after restart');
+
+###############################################################################
+# And do the same after hard restart
+###############################################################################
+
+$shard1->teardown_node;
+$shard2->teardown_node;
+$shard1->start;
+$shard2->start;
+$shard1->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard1 to became online";
+$shard2->poll_query_until('postgres', "select 't'")
+	or die "Timed out waiting for shard2 to became online";
+
+
+$seconds = 15;
+$pgb_handle1 = $shard1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+$pgb_handle2 = $shard2->pgbench_async(-n, -c => 5, -T => $seconds, -f => $bank, 'postgres' );
+
+$started = time();
+$selects = 0;
+$i = 0;
+
+while (time() - $started < $seconds)
+{
+	my $shard = $shard1;
+	foreach my $shard (@shards)
+	{
+		$total = $shard->safe_psql('postgres', "select sum(amount) from accounts");
+		if ( ($total ne $oldtotal) and ($total ne '') )
+		{
+			$isolation_errors++;
+			$oldtotal = $total;
+			diag("$i: Isolation error. Total = $total");
+		}
+		if ($total ne '') { $selects++; }
+	}
+	$i++;
+}
+
+$shard1->pgbench_await($pgb_handle1);
+$shard2->pgbench_await($pgb_handle2);
+
+# sanity check
+diag("completed $selects selects");
+die "no actual transactions happend" unless ( $selects > 0 &&
+	count_and_delete_rows($shard1, 'global_transactions') > 0 &&
+	count_and_delete_rows($shard2, 'global_transactions') > 0);
+
+is($isolation_errors, 0, 'isolation between concurrent global transaction after hard restart');
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index deabedbe37..d375eacc8f 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2103,6 +2103,13 @@ StartTransaction(void)
 	ShowTransactionState("StartTransaction");
 }
 
+Datum
+pg_current_csn(PG_FUNCTION_ARGS)
+{
+	CSN	csn = GenerateCSN(InvalidCSN);
+
+	PG_RETURN_INT64(csn);
+}
 
 /*
  *	CommitTransaction
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 8585464c5b..ed776e5166 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11424,5 +11424,8 @@
 { oid => '10004', descr => 'assign csn to distributed transaction',
   proname => 'pg_csn_snapshot_assign', provolatile => 'v', proparallel => 'u',
   prorettype => 'void', proargtypes => 'text int8', prosrc => 'pg_csn_snapshot_assign' },
+{ oid => '10005', descr => 'get current CSN',
+  proname => 'pg_current_csn', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_current_csn' },
 
 ]
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 97e05993be..ea251be0b7 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -2243,6 +2243,41 @@ sub pg_recvlogical_upto
 	}
 }
 
+sub pgbench()
+{
+	my ($self, $node, @args) = @_;
+	my $pgbench_handle = $self->pgbench_async($node, @args);
+	$self->pgbench_await($pgbench_handle);
+}
+
+sub pgbench_async()
+{
+	my ($self, @args) = @_;
+
+	my ($in, $out, $err, $rc);
+	$in = '';
+	$out = '';
+
+	my @pgbench_command = (
+		'pgbench',
+		-h => $self->host,
+		-p => $self->port,
+		@args
+	);
+	my $handle = IPC::Run::start(\@pgbench_command, $in, $out);
+	return $handle;
+}
+
+sub pgbench_await()
+{
+	my ($self, $pgbench_handle) = @_;
+
+	# During run some pgbench threads can exit (for example due to
+	# serialization error). That will set non-zero returning code.
+	# So don't check return code here and leave it to a caller.
+	my $rc = IPC::Run::finish($pgbench_handle);
+}
+
 =pod
 
 =back
-- 
2.25.1

#54

tsunakawa.takay@fujitsu.com

almost 5 years ago

In reply to: Andrey V. Lepikhov (#53)

RE: Global snapshots

From: Andrey V. Lepikhov <a.lepikhov@postgrespro.ru>

Current state of the patch set rebased on master, 5aed6a1fc2.

It is development version. Here some problems with visibility still detected in
two tests:
1. CSN Snapshot module - TAP test on time skew.
2. Clock SI implementation - TAP test on emulation of bank transaction.

I'm sorry to be late to respond. Thank you for the update.

As discussed at the HighGo meeting, what do you think we should do about this patch set, now that we agreed that Clock-SI is covered by Microsoft's patent? I'd appreciate it if you could share some idea to change part of the algorithm and circumvent the patent.

Otherwise, why don't we discuss alternatives, such as the Commitment Ordering?

I have a hunch that YugabyteDB's method seems promising, which I wrote in the following wiki. Of course, we should make efforts to see if it's patented before diving deeper into the design or implementation.

Scaleout Design - PostgreSQL wiki
https://wiki.postgresql.org/wiki/Scaleout_Design

Regards
Takayuki Tsunakawa

#55

Andrey V. Lepikhov

a.lepikhov@postgrespro.ru

about 4 years ago

In reply to: tsunakawa.takay@fujitsu.com (#54)

1 attachment(s)

Next version of CSN implementation in snapshots to achieve a proper
snapshot isolation in the case of a cross-instance distributed transaction.

--
regards,
Andrey Lepikhov
Postgres Professional

Attachments:

0001-Add-Commit-Sequence-Number-CSN-machinery-into-MVCC.patchtext/x-patch; charset=UTF-8; name=0001-Add-Commit-Sequence-Number-CSN-machinery-into-MVCC.patchDownload

From bbb7dd1d7621c091f11e697d3d894fe7a36918a6 Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Wed, 17 Nov 2021 11:13:37 +0500
Subject: [PATCH] Add Commit Sequence Number (CSN) machinery into MVCC
 implementation for a timestamp-based resolving of visibility conflicts.

It allows to achieve proper snapshot isolation semantics in the case
of distributed transactions involving more than one Postgres instance.

Authors: K.Knizhnik, S.Kelvich, A.Sher, A.Lepikhov, M.Usama.

Discussion:
(2020/05/21 -)
https://www.postgresql.org/message-id/flat/CA%2Bfd4k6HE8xLGEvqWzABEg8kkju5MxU%2Bif7bf-md0_2pjzXp9Q%40mail.gmail.com#ed1359340871688bed2e643921f73365
(2018/05/01 - 2019/04/21)
https://www.postgresql.org/message-id/flat/21BC916B-80A1-43BF-8650-3363CCDAE09C%40postgrespro.ru
---
 doc/src/sgml/config.sgml                      |  50 +-
 src/backend/access/rmgrdesc/Makefile          |   1 +
 src/backend/access/rmgrdesc/csnlogdesc.c      |  95 +++
 src/backend/access/rmgrdesc/xlogdesc.c        |   6 +-
 src/backend/access/transam/Makefile           |   2 +
 src/backend/access/transam/csn_log.c          | 748 ++++++++++++++++++
 src/backend/access/transam/csn_snapshot.c     | 687 ++++++++++++++++
 src/backend/access/transam/rmgr.c             |   1 +
 src/backend/access/transam/twophase.c         | 154 ++++
 src/backend/access/transam/varsup.c           |   2 +
 src/backend/access/transam/xact.c             |  32 +
 src/backend/access/transam/xlog.c             |  23 +-
 src/backend/access/transam/xloginsert.c       |   2 +
 src/backend/commands/vacuum.c                 |   3 +-
 src/backend/storage/ipc/ipci.c                |   6 +
 src/backend/storage/ipc/procarray.c           |  85 ++
 src/backend/storage/lmgr/lwlock.c             |   2 +
 src/backend/storage/lmgr/lwlocknames.txt      |   2 +
 src/backend/storage/lmgr/proc.c               |   6 +
 src/backend/storage/sync/sync.c               |   5 +
 src/backend/utils/misc/guc.c                  |  37 +
 src/backend/utils/probes.d                    |   2 +
 src/backend/utils/time/snapmgr.c              | 149 +++-
 src/bin/initdb/initdb.c                       |   3 +-
 src/bin/pg_controldata/pg_controldata.c       |   2 +
 src/bin/pg_upgrade/pg_upgrade.c               |   5 +
 src/bin/pg_upgrade/pg_upgrade.h               |   2 +
 src/bin/pg_waldump/rmgrdesc.c                 |   1 +
 src/include/access/csn_log.h                  |  98 +++
 src/include/access/csn_snapshot.h             |  54 ++
 src/include/access/rmgrlist.h                 |   1 +
 src/include/access/xlog_internal.h            |   2 +
 src/include/catalog/pg_control.h              |   1 +
 src/include/catalog/pg_proc.dat               |  17 +
 src/include/datatype/timestamp.h              |   3 +
 src/include/fmgr.h                            |   1 +
 src/include/portability/instr_time.h          |  10 +
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |  14 +
 src/include/storage/procarray.h               |   7 +
 src/include/storage/sync.h                    |   1 +
 src/include/utils/snapmgr.h                   |   7 +-
 src/include/utils/snapshot.h                  |  11 +
 src/test/modules/Makefile                     |   1 +
 src/test/modules/csnsnapshot/Makefile         |  25 +
 .../modules/csnsnapshot/csn_snapshot.conf     |   1 +
 .../csnsnapshot/expected/csnsnapshot.out      |   1 +
 src/test/modules/csnsnapshot/t/001_base.pl    | 103 +++
 src/test/modules/csnsnapshot/t/002_standby.pl |  66 ++
 .../modules/csnsnapshot/t/003_time_skew.pl    | 214 +++++
 .../csnsnapshot/t/004_read_committed.pl       |  97 +++
 .../csnsnapshot/t/005_basic_visibility.pl     | 181 +++++
 src/test/modules/snapshot_too_old/sto.conf    |   1 +
 src/test/regress/expected/sysviews.out        |   4 +-
 54 files changed, 3024 insertions(+), 11 deletions(-)
 create mode 100644 src/backend/access/rmgrdesc/csnlogdesc.c
 create mode 100644 src/backend/access/transam/csn_log.c
 create mode 100644 src/backend/access/transam/csn_snapshot.c
 create mode 100644 src/include/access/csn_log.h
 create mode 100644 src/include/access/csn_snapshot.h
 create mode 100644 src/test/modules/csnsnapshot/Makefile
 create mode 100644 src/test/modules/csnsnapshot/csn_snapshot.conf
 create mode 100644 src/test/modules/csnsnapshot/expected/csnsnapshot.out
 create mode 100644 src/test/modules/csnsnapshot/t/001_base.pl
 create mode 100644 src/test/modules/csnsnapshot/t/002_standby.pl
 create mode 100644 src/test/modules/csnsnapshot/t/003_time_skew.pl
 create mode 100644 src/test/modules/csnsnapshot/t/004_read_committed.pl
 create mode 100644 src/test/modules/csnsnapshot/t/005_basic_visibility.pl

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 3f806740d5..f4f6c83fd0 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9682,8 +9682,56 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
      </varlistentry>
 
      </variablelist>
-   </sect1>
 
+    <sect2 id="runtime-config-CSN-based-snapshot">
+     <title>CSN Based Snapshot</title>
+      <para>
+       By default, snapshots in <productname>PostgreSQL</productname> contains a
+       XID (TransactionID) that allows to identify the status of a transaction
+       and make arbitrary visibility calculations.
+      </para>
+
+      <para>
+       <productname>PostgreSQL</productname> also provides a CSN (Commit
+        Sequence Number) based machinery as an additional tool for visibility
+        calculations. It may be used within distributed transactions when a xid of
+        a local transaction can't correctly identify order of the distributed one.
+      </para>
+
+     <variablelist>
+      <varlistentry id="guc-enable-csn-snapshot" xreflabel="enable_csn_snapshot">
+       <term><varname>enable_csn_snapshot</varname> (<type>boolean</type>)
+        <indexterm>
+         <primary><varname>enable_csn_snapshot</varname> configuration parameter</primary>
+        </indexterm>
+       </term>
+       <listitem>
+
+        <para>
+         Enable/disable the CSN tracking for the snapshot.
+        </para>
+
+        <para>
+        <productname>PostgreSQL</productname> uses a physical clock timestamp as
+        a CSN, so enabling the CSN based snapshots can be useful for implementing
+        cross-instance snapshots and visibility of distributed transaction.
+        </para>
+
+        <para>
+         when enabled <productname>PostgreSQL</productname> creates
+         <filename>pg_csn</filename> directory under <envar>PGDATA</envar> to keep
+         the track of CSN and XID mappings.
+        </para>
+
+        <para>
+         The default value is on.
+        </para>
+       </listitem>
+      </varlistentry>
+
+     </variablelist>
+    </sect2>
+   </sect1>
    <sect1 id="runtime-config-compatible">
     <title>Version and Platform Compatibility</title>
 
diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile
index f88d72fd86..15fc36f7b4 100644
--- a/src/backend/access/rmgrdesc/Makefile
+++ b/src/backend/access/rmgrdesc/Makefile
@@ -11,6 +11,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	brindesc.o \
 	clogdesc.o \
+	csnlogdesc.o \
 	committsdesc.o \
 	dbasedesc.o \
 	genericdesc.o \
diff --git a/src/backend/access/rmgrdesc/csnlogdesc.c b/src/backend/access/rmgrdesc/csnlogdesc.c
new file mode 100644
index 0000000000..f8c644e906
--- /dev/null
+++ b/src/backend/access/rmgrdesc/csnlogdesc.c
@@ -0,0 +1,95 @@
+/*-------------------------------------------------------------------------
+ *
+ * clogdesc.c
+ *	  rmgr descriptor routines for access/transam/csn_log.c
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/rmgrdesc/csnlogdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_log.h"
+
+
+void
+csnlog_desc(StringInfo buf, XLogReaderState *record)
+{
+	char	   *rec = XLogRecGetData(record);
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	if (info == XLOG_CSN_ZEROPAGE)
+	{
+		int pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		appendStringInfo(buf, "pageno %d", pageno);
+	}
+	else if (info == XLOG_CSN_TRUNCATE)
+	{
+		int pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		appendStringInfo(buf, "pageno %d", pageno);
+	}
+	else if (info == XLOG_CSN_ASSIGNMENT)
+	{
+		CSN csn;
+
+		memcpy(&csn, XLogRecGetData(record), sizeof(CSN));
+		appendStringInfo(buf, "assign "INT64_FORMAT"", csn);
+	}
+	else if (info == XLOG_CSN_SETCSN)
+	{
+		xl_csn_set *xlrec = (xl_csn_set *) rec;
+		int			  nsubxids;
+
+		appendStringInfo(buf, "set "INT64_FORMAT" for: %u",
+						 xlrec->csn,
+						 xlrec->xtop);
+		nsubxids = ((XLogRecGetDataLen(record) - MinSizeOfCSNSet) /
+					sizeof(TransactionId));
+		if (nsubxids > 0)
+		{
+			int			i;
+			TransactionId *subxids;
+
+			subxids = palloc(sizeof(TransactionId) * nsubxids);
+			memcpy(subxids,
+				   XLogRecGetData(record) + MinSizeOfCSNSet,
+				   sizeof(TransactionId) * nsubxids);
+			for (i = 0; i < nsubxids; i++)
+				appendStringInfo(buf, ", %u", subxids[i]);
+			pfree(subxids);
+		}
+	}
+}
+
+const char *
+csnlog_identify(uint8 info)
+{
+	const char *id = NULL;
+
+	switch (info & ~XLR_INFO_MASK)
+	{
+		case XLOG_CSN_ASSIGNMENT:
+			id = "ASSIGNMENT";
+			break;
+		case XLOG_CSN_SETCSN:
+			id = "SETCSN";
+			break;
+		case XLOG_CSN_ZEROPAGE:
+			id = "ZEROPAGE";
+			break;
+		case XLOG_CSN_TRUNCATE:
+			id = "TRUNCATE";
+			break;
+	}
+
+	return id;
+}
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 5bf2346dd9..ea433046cf 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -113,7 +113,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		appendStringInfo(buf, "max_connections=%d max_worker_processes=%d "
 						 "max_wal_senders=%d max_prepared_xacts=%d "
 						 "max_locks_per_xact=%d wal_level=%s "
-						 "wal_log_hints=%s track_commit_timestamp=%s",
+						 "wal_log_hints=%s track_commit_timestamp=%s "
+						 "enable_csn_snapshot=%s",
 						 xlrec.MaxConnections,
 						 xlrec.max_worker_processes,
 						 xlrec.max_wal_senders,
@@ -121,7 +122,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 xlrec.max_locks_per_xact,
 						 wal_level_str,
 						 xlrec.wal_log_hints ? "on" : "off",
-						 xlrec.track_commit_timestamp ? "on" : "off");
+						 xlrec.track_commit_timestamp ? "on" : "off",
+						 xlrec.enable_csn_snapshot ? "on" : "off");
 	}
 	else if (info == XLOG_FPW_CHANGE)
 	{
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 595e02de72..fc0321ee6b 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -15,6 +15,8 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	clog.o \
 	commit_ts.o \
+	csn_log.o \
+	csn_snapshot.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/csn_log.c b/src/backend/access/transam/csn_log.c
new file mode 100644
index 0000000000..33517271ed
--- /dev/null
+++ b/src/backend/access/transam/csn_log.c
@@ -0,0 +1,748 @@
+/*-----------------------------------------------------------------------------
+ *
+ * csn_log.c
+ *		Track commit sequence numbers of finished transactions
+ *
+ * This module provides SLRU to store CSN for each transaction.  This
+ * mapping need to be kept only for xid's greater then oldestXid, but
+ * that can require arbitrary large amounts of memory in case of long-lived
+ * transactions.  Because of same lifetime and persistancy requirements
+ * this module is quite similar to subtrans.c
+ *
+ * If we switch database from CSN-base snapshot to xid-base snapshot then,
+ * nothing wrong. But if we switch xid-base snapshot to CSN-base snapshot
+ * it should decide a new xid which begin csn-base check. It can not be
+ * oldestActiveXID because of prepared transaction.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csn_log.c
+ *
+ *-----------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_log.h"
+#include "access/slru.h"
+#include "access/csn_snapshot.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "portability/instr_time.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/snapmgr.h"
+#include "access/xlog_internal.h"
+
+/*
+ * We use csnSnapshotActive to judge if csn snapshot enabled instead of by
+ * enable_csn_snapshot, this design is similar to 'track_commit_timestamp'.
+ *
+ * Because in process of replication if master changes 'enable_csn_snapshot'
+ * in a database restart, standby should apply wal record for GUC changed,
+ * then it's difficult to notice all backends about that. So they can get
+ * the message by 'csnSnapshotActive' which in shared buffer. It will not
+ * acquire a lock, so without performance issue.
+ * last_max_csn - Record the max csn till now.
+ * last_csn_log_wal - for interval we log the assign csn to wal
+ * oldestXmin - first sensible Xmin on the first existed page in the CSN Log
+ */
+typedef struct CSNShared
+{
+	bool				csnSnapshotActive;
+	pg_atomic_uint32	oldestXmin;
+	CSN					last_max_csn;
+	CSN					last_csn_log_wal;
+	volatile slock_t	lock;
+} CSNShared;
+
+CSNShared *csnShared;
+
+/*
+ * Defines for CSNLog page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * CSNLog page numbering also wraps around at
+ * 0xFFFFFFFF/CSN_LOG_XACTS_PER_PAGE, and CSNLog segment numbering at
+ * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateCSNLog (see CSNLogPagePrecedes).
+ */
+
+/* We store the commit CSN for each xid */
+#define CSN_LOG_XACTS_PER_PAGE (BLCKSZ / sizeof(CSN))
+
+#define TransactionIdToPage(xid)	((xid) / (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+
+/*
+ * Link to shared-memory data structures for CLOG control
+ */
+static SlruCtlData CSNLogCtlData;
+#define CsnlogCtl (&CSNLogCtlData)
+
+static int	ZeroCSNLogPage(int pageno, bool write_xlog);
+static void ZeroTruncateCSNLogPage(int pageno, bool write_xlog);
+static bool CSNLogPagePrecedes(int page1, int page2);
+static void CSNLogSetPageStatus(TransactionId xid, int nsubxids,
+									  TransactionId *subxids,
+									  CSN csn, int pageno);
+static void CSNLogSetCSNInSlot(TransactionId xid, CSN csn, int slotno);
+
+static void WriteCSNXlogRec(TransactionId xid, int nsubxids,
+							TransactionId *subxids, CSN csn);
+static void WriteZeroCSNPageXlogRec(int pageno);
+static void WriteTruncateCSNXlogRec(int pageno);
+static void set_oldest_xmin(TransactionId xid);
+
+
+/*
+ * Number of shared CSNLog buffers.
+ */
+static Size
+CSNLogShmemBuffers(void)
+{
+	return Min(32, Max(4, NBuffers / 512));
+}
+
+/*
+ * Reserve shared memory for CsnlogCtl.
+ */
+Size
+CSNLogShmemSize(void)
+{
+	return SimpleLruShmemSize(CSNLogShmemBuffers(), 0);
+}
+
+/*
+ * Initialization of shared memory for CSNLog.
+ */
+void
+CSNLogShmemInit(void)
+{
+	bool		found;
+
+	CsnlogCtl->PagePrecedes = CSNLogPagePrecedes;
+	SimpleLruInit(CsnlogCtl, "CSNLog Ctl", CSNLogShmemBuffers(), 0,
+				  CSNLogSLRULock, "pg_csn", LWTRANCHE_CSN_LOG_BUFFERS,
+				  SYNC_HANDLER_CSN);
+
+	csnShared = ShmemInitStruct("CSNlog shared",
+									 sizeof(CSNShared),
+									 &found);
+	if (!found)
+	{
+		csnShared->csnSnapshotActive = false;
+		pg_atomic_init_u32(&csnShared->oldestXmin, InvalidTransactionId);
+		csnShared->last_max_csn = InvalidCSN;
+		csnShared->last_csn_log_wal = InvalidCSN;
+		SpinLockInit(&csnShared->lock);
+	}
+}
+
+/*
+ * CSNLogSetCSN
+ *
+ * Record CSN of transaction and its subtransaction tree.
+ *
+ * xid is a single xid to set status for. This will typically be the top level
+ * transactionid for a top level commit or abort. It can also be a
+ * subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * csn is the commit sequence number of the transaction. It should be
+ * AbortedCSN for abort cases.
+ */
+void
+CSNLogSetCSN(TransactionId xid, int nsubxids, TransactionId *subxids, CSN csn,
+			 bool write_xlog)
+{
+	int pageno;
+	int i = 0;
+	int offset = 0;
+
+	Assert(TransactionIdIsValid(xid));
+
+	pageno = TransactionIdToPage(xid);		/* get page of parent */
+
+	if(write_xlog)
+		WriteCSNXlogRec(xid, nsubxids, subxids, csn);
+
+	for (;;)
+	{
+		int num_on_page = 0;
+
+		/* Form subtransactions bucket that can be written on the same page */
+		while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno)
+		{
+			num_on_page++;
+			i++;
+		}
+
+		CSNLogSetPageStatus(xid,
+							num_on_page, subxids + offset,
+							csn, pageno);
+		if (i >= nsubxids)
+			break;
+
+		offset = i;
+		pageno = TransactionIdToPage(subxids[offset]);
+		xid = InvalidTransactionId;
+	}
+}
+
+/*
+ * Record the final state of transaction entries in the csn log for
+ * all entries on a single page.  Atomic only on this page.
+ *
+ * Otherwise API is same as TransactionIdSetTreeStatus()
+ */
+static void
+CSNLogSetPageStatus(TransactionId xid, int nsubxids, TransactionId *subxids,
+					CSN csn, int pageno)
+{
+	int slotno;
+	int i;
+
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+
+	slotno = SimpleLruReadPage(CsnlogCtl, pageno, true, xid);
+
+	/* Subtransactions first, if needed ... */
+	for (i = 0; i < nsubxids; i++)
+	{
+		Assert(CsnlogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+		CSNLogSetCSNInSlot(subxids[i], csn, slotno);
+	}
+
+	/* ... then the main transaction */
+	if (TransactionIdIsValid(xid))
+		CSNLogSetCSNInSlot(xid, csn, slotno);
+
+	CsnlogCtl->shared->page_dirty[slotno] = true;
+
+	LWLockRelease(CSNLogSLRULock);
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ */
+static void
+CSNLogSetCSNInSlot(TransactionId xid, CSN csn, int slotno)
+{
+	int entryno = TransactionIdToPgIndex(xid);
+	CSN *ptr;
+
+	Assert(LWLockHeldByMe(CSNLogSLRULock));
+
+	ptr = (CSN *) (CsnlogCtl->shared->page_buffer[slotno] +
+														entryno * sizeof(CSN));
+	*ptr = csn;
+}
+
+/*
+ * Interrogate the state of a transaction in the log.
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; TransactionIdGetCSN() in csn_snapshot.c is the
+ * intended caller.
+ */
+CSN
+CSNLogGetCSNByXid(TransactionId xid)
+{
+	int pageno = TransactionIdToPage(xid);
+	int entryno = TransactionIdToPgIndex(xid);
+	int slotno;
+	CSN csn;
+
+	/* lock is acquired by SimpleLruReadPage_ReadOnly */
+	slotno = SimpleLruReadPage_ReadOnly(CsnlogCtl, pageno, xid);
+	csn = *(CSN *) (CsnlogCtl->shared->page_buffer[slotno] +
+														entryno * sizeof(CSN));
+	LWLockRelease(CSNLogSLRULock);
+
+	return csn;
+}
+
+/*
+ * Initialize (or reinitialize) a page of CSNLog to zeroes.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroCSNLogPage(int pageno, bool write_xlog)
+{
+	Assert(LWLockHeldByMe(CSNLogSLRULock));
+	if(write_xlog)
+		WriteZeroCSNPageXlogRec(pageno);
+	return SimpleLruZeroPage(CsnlogCtl, pageno);
+}
+
+static void
+ZeroTruncateCSNLogPage(int pageno, bool write_xlog)
+{
+	if(write_xlog)
+		WriteTruncateCSNXlogRec(pageno);
+	SimpleLruTruncate(CsnlogCtl, pageno);
+}
+
+void
+ActivateCSNlog(void)
+{
+	int				pageno;
+	TransactionId	nextXid = InvalidTransactionId;
+	TransactionId	oldest_xid = InvalidTransactionId;
+
+	if (csnShared->csnSnapshotActive)
+		return;
+
+	nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+	pageno = TransactionIdToPage(nextXid);
+
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+
+	/*
+	 * Create the current segment file, if necessary.
+	 * This means that
+	 */
+	if (!SimpleLruDoesPhysicalPageExist(CsnlogCtl, pageno))
+	{
+		int slotno;
+		TransactionId curxid = nextXid;
+
+		slotno = ZeroCSNLogPage(pageno, false);
+		SimpleLruWritePage(CsnlogCtl, slotno);
+
+		elog(LOG, "Create SLRU page=%d, slotno=%d for xid %u on a CSN log activation",
+			 pageno, slotno, nextXid);
+
+		/*
+		 * nextXid isn't first xid on the page. It is the first page in the CSN
+		 * log. Set UnclearCSN value into all previous slots on this page.
+		 * This xid value can be used as an oldest xid in the CSN log.
+		 */
+		if (TransactionIdToPgIndex(nextXid) > 0)
+		{
+			/* Cleaning procedure. Can be optimized. */
+			do
+			{
+				curxid--;
+				CSNLogSetCSNInSlot(curxid, UnclearCSN, slotno);
+			} while (TransactionIdToPgIndex(curxid) > 0);
+
+			elog(LOG,
+				 "Set UnclearCSN values for %d xids in the range [%u,%u]",
+				 nextXid - curxid, curxid, nextXid-1);
+
+			/* Oldest XID found on this page */
+			oldest_xid = nextXid;
+		}
+	}
+	LWLockRelease(CSNLogSLRULock);
+
+	if (!TransactionIdIsValid(oldest_xid))
+	{
+		TransactionId curxid;
+
+		elog(LOG, "Search for the oldest xid across previous pages");
+
+		/* Need to scan previous pages for an oldest xid. */
+		while (pageno > 0 && SimpleLruDoesPhysicalPageExist(CsnlogCtl, pageno - 1))
+			pageno--;
+
+		/* look up for the first clear xid value. */
+		curxid = pageno * (TransactionId) CSN_LOG_XACTS_PER_PAGE;
+		while(CSNLogGetCSNByXid(curxid) == UnclearCSN)
+			curxid++;
+		oldest_xid = curxid;
+	}
+
+	set_oldest_xmin(oldest_xid);
+	csnShared->csnSnapshotActive = true;
+}
+
+bool
+get_csnlog_status(void)
+{
+	return csnShared->csnSnapshotActive;
+}
+
+void
+DeactivateCSNlog(void)
+{
+	csnShared->csnSnapshotActive = false;
+	set_oldest_xmin(InvalidTransactionId);
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+	(void) SlruScanDirectory(CsnlogCtl, SlruScanDirCbDeleteAll, NULL);
+	LWLockRelease(CSNLogSLRULock);
+	elog(LOG, "CSN log has deactivated");
+}
+
+void
+StartupCSN(void)
+{
+	ActivateCSNlog();
+}
+
+void
+CompleteCSNInitialization(void)
+{
+	/*
+	 * If the feature is not enabled, turn it off for good.  This also removes
+	 * any leftover data.
+	 *
+	 * Conversely, we activate the module if the feature is enabled.  This is
+	 * necessary for primary and standby as the activation depends on the
+	 * control file contents at the beginning of recovery or when a
+	 * XLOG_PARAMETER_CHANGE is replayed.
+	 */
+	if (!enable_csn_snapshot)
+		DeactivateCSNlog();
+	else
+		ActivateCSNlog();
+}
+
+void
+CSNlogParameterChange(bool newvalue, bool oldvalue)
+{
+	if (newvalue)
+	{
+		if (!csnShared->csnSnapshotActive)
+			ActivateCSNlog();
+	}
+	else if (csnShared->csnSnapshotActive)
+		DeactivateCSNlog();
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointCSNLog(void)
+{
+	if (!get_csnlog_status())
+		return;
+
+	/*
+	 * Flush dirty CSNLog pages to disk.
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely to improve the odds that writing of dirty pages is done by
+	 * the checkpoint process and not by backends.
+	 */
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(true);
+	SimpleLruWriteAll(CsnlogCtl, true);
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(true);
+}
+
+/*
+ * Make sure that CSNLog has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty clog or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendCSNLog(TransactionId newestXact)
+{
+	int			pageno;
+
+	if (!get_csnlog_status())
+		return;
+
+	/*
+	 * No work except at first XID of a page.  But beware: just after
+	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
+	 */
+	if (TransactionIdToPgIndex(newestXact) != 0 &&
+		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
+		return;
+
+	pageno = TransactionIdToPage(newestXact);
+
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+
+	/* Zero the page and make an XLOG entry about it */
+	ZeroCSNLogPage(pageno, !InRecovery);
+
+	LWLockRelease(CSNLogSLRULock);
+}
+
+/*
+ * Remove all CSNLog segments before the one holding the passed
+ * transaction ID.
+ *
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest TransactionXmin of any running transaction.
+ */
+void
+TruncateCSNLog(TransactionId oldestXact)
+{
+	int				cutoffPage;
+	TransactionId	oldestXmin;
+
+	/* Can't do truncation because WAL messages isn't allowed during recovery */
+	if (RecoveryInProgress() || !get_csnlog_status())
+		return;
+
+	/*
+	 * The cutoff point is the start of the segment containing oldestXact. We
+	 * pass the *page* containing oldestXact to SimpleLruTruncate. We step
+	 * back one transaction to avoid passing a cutoff page that hasn't been
+	 * created yet in the rare case that oldestXact would be the first item on
+	 * a page and oldestXact == next XID.  In that case, if we didn't subtract
+	 * one, we'd trigger SimpleLruTruncate's wraparound detection.
+	 */
+	TransactionIdRetreat(oldestXact);
+	cutoffPage = TransactionIdToPage(oldestXact);
+
+	/* Detect, that we really need to cut CSN log. */
+	oldestXmin = pg_atomic_read_u32(&csnShared->oldestXmin);
+
+	if (TransactionIdToPage(oldestXmin) < cutoffPage)
+	{
+		/* OldestXact is located in the same page as oldestXmin. No actions needed. */
+		return;
+	}
+
+	/*
+	 * Shift oldestXmin to the start of new first page. Use first position
+	 * on the page because all transactions on this page is created with enabled
+	 * CSN snapshot machinery.
+	 */
+	pg_atomic_write_u32(&csnShared->oldestXmin,
+						oldestXact - TransactionIdToPgIndex(oldestXact));
+
+	SpinLockRelease(&csnShared->lock);
+	ZeroTruncateCSNLogPage(cutoffPage, true);
+}
+
+/*
+ * Decide which of two CSNLog page numbers is "older" for truncation
+ * purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */
+static bool
+CSNLogPagePrecedes(int page1, int page2)
+{
+	TransactionId xid1;
+	TransactionId xid2;
+
+	xid1 = ((TransactionId) page1) * CSN_LOG_XACTS_PER_PAGE;
+	xid1 += FirstNormalTransactionId;
+	xid2 = ((TransactionId) page2) * CSN_LOG_XACTS_PER_PAGE;
+	xid2 += FirstNormalTransactionId;
+
+	return TransactionIdPrecedes(xid1, xid2);
+}
+
+void
+WriteAssignCSNXlogRec(CSN csn)
+{
+	Assert(enable_csn_wal && csn <= csnShared->last_csn_log_wal);
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&csn), sizeof(CSN));
+	XLogInsert(RM_CSNLOG_ID, XLOG_CSN_ASSIGNMENT);
+}
+
+static void
+WriteCSNXlogRec(TransactionId xid, int nsubxids,
+				TransactionId *subxids, CSN csn)
+{
+	xl_csn_set xlrec;
+
+	if(!enable_csn_wal)
+		return;
+
+	xlrec.xtop = xid;
+	xlrec.nsubxacts = nsubxids;
+	xlrec.csn = csn;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, MinSizeOfCSNSet);
+	XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId));
+	XLogInsert(RM_CSNLOG_ID, XLOG_CSN_SETCSN);
+}
+
+/*
+ * Write a ZEROPAGE xlog record
+ */
+static void
+WriteZeroCSNPageXlogRec(int pageno)
+{
+	if(!enable_csn_wal)
+	{
+		return;
+	}
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&pageno), sizeof(int));
+	(void) XLogInsert(RM_CSNLOG_ID, XLOG_CSN_ZEROPAGE);
+}
+
+/*
+ * Write a TRUNCATE xlog record
+ */
+static void
+WriteTruncateCSNXlogRec(int pageno)
+{
+	if(!enable_csn_wal)
+	{
+		return;
+	}
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&pageno), sizeof(int));
+	XLogInsert(RM_CSNLOG_ID, XLOG_CSN_TRUNCATE);
+}
+
+
+void
+csnlog_redo(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	/* Backup blocks are not used in csnlog records */
+	Assert(!XLogRecHasAnyBlockRefs(record));
+
+	if (info == XLOG_CSN_ASSIGNMENT)
+	{
+		CSN csn;
+
+		memcpy(&csn, XLogRecGetData(record), sizeof(CSN));
+		/* XXX: Do we really not needed to acquire the lock here? */
+		csnShared->last_max_csn = csn;
+	}
+	else if (info == XLOG_CSN_SETCSN)
+	{
+		xl_csn_set *xlrec = (xl_csn_set *) XLogRecGetData(record);
+		CSNLogSetCSN(xlrec->xtop, xlrec->nsubxacts, xlrec->xsub, xlrec->csn, false);
+	}
+	else if (info == XLOG_CSN_ZEROPAGE)
+	{
+		int			pageno;
+		int			slotno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+		slotno = ZeroCSNLogPage(pageno, false);
+		SimpleLruWritePage(CsnlogCtl, slotno);
+		LWLockRelease(CSNLogSLRULock);
+		Assert(!CsnlogCtl->shared->page_dirty[slotno]);
+
+	}
+	else if (info == XLOG_CSN_TRUNCATE)
+	{
+		int			pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		CsnlogCtl->shared->latest_page_number = pageno;
+		ZeroTruncateCSNLogPage(pageno, false);
+	}
+	else
+		elog(PANIC, "csnlog_redo: unknown op code %u", info);
+}
+
+/*
+ * Entrypoint for sync.c to sync members files.
+ */
+int
+csnsyncfiletag(const FileTag *ftag, char *path)
+{
+	return SlruSyncFileTag(&CSNLogCtlData, ftag, path);
+}
+
+/*
+ * GenerateCSN
+ *
+ * Generate CSN which is actually a local time. Also we are forcing
+ * this time to be always increasing. Since now it is not uncommon to have
+ * millions of read transactions per second we are trying to use nanoseconds
+ * if such time resolution is available.
+ */
+CSN
+GenerateCSN(bool locked, CSN assign)
+{
+	instr_time	current_time;
+	CSN	csn;
+	CSN log_csn = InvalidCSN;
+
+	Assert(get_csnlog_status() || csn_snapshot_defer_time > 0);
+
+	/* TODO: create some macro that add small random shift to current time. */
+	INSTR_TIME_SET_CURRENT(current_time);
+	csn = (CSN) INSTR_TIME_GET_NANOSEC(current_time) + (int64) (csn_time_shift * 1E9);
+
+	if(assign != InvalidCSN && csn < assign)
+		csn = assign;
+
+	/* TODO: change to atomics? */
+	if (!locked)
+		SpinLockAcquire(&csnShared->lock);
+
+	if (csn <= csnShared->last_max_csn)
+		csn = csnShared->last_max_csn + 1;
+	csnShared->last_max_csn = csn;
+
+	if (enable_csn_wal && csn > csnShared->last_csn_log_wal)
+	{
+		/*
+		 * We log the CSN 5s greater than generated, you can see comments on
+		 * the CSN_ASSIGN_TIME_INTERVAL.
+		 */
+		log_csn = CSNAddByNanosec(csn, CSN_ASSIGN_TIME_INTERVAL);
+		csnShared->last_csn_log_wal = log_csn;
+	}
+
+	if (!locked)
+		SpinLockRelease(&csnShared->lock);
+
+	if (log_csn != InvalidCSN)
+		WriteAssignCSNXlogRec(csn);
+
+	return csn;
+}
+
+CSN
+GetLastGeneratedCSN(void)
+{
+	CSN csn;
+
+	SpinLockAcquire(&csnShared->lock);
+	csn = csnShared->last_max_csn;
+	SpinLockRelease(&csnShared->lock);
+	return csn;
+}
+
+/*
+ * Mostly for debug purposes.
+ */
+static void
+set_oldest_xmin(TransactionId xid)
+{
+	elog(LOG, "Oldest Xmin for CSN will be changed from %u to %u",
+		 pg_atomic_read_u32(&csnShared->oldestXmin), xid);
+
+	pg_atomic_write_u32(&csnShared->oldestXmin, xid);
+}
+
+TransactionId
+GetOldestXmin(void)
+{
+	Assert(get_csnlog_status());
+	return pg_atomic_read_u32(&csnShared->oldestXmin);
+}
diff --git a/src/backend/access/transam/csn_snapshot.c b/src/backend/access/transam/csn_snapshot.c
new file mode 100644
index 0000000000..a381d219ea
--- /dev/null
+++ b/src/backend/access/transam/csn_snapshot.c
@@ -0,0 +1,687 @@
+/*-------------------------------------------------------------------------
+ *
+ * csn_snapshot.c
+ *		Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csn_snapshot.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_snapshot.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "portability/instr_time.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/snapmgr.h"
+#include "miscadmin.h"
+
+/* Raise a warning if imported snapshot_csn exceeds ours by this value. */
+#define SNAP_DESYNC_COMPLAIN (1*NSECS_PER_SEC) /* 1 second */
+
+static TransactionId xmin_for_csn = InvalidTransactionId;
+
+
+/*
+ * GUC to delay advance of oldestXid for this amount of time. Also determines
+ * the size CSNSnapshotXidMap circular buffer.
+ */
+int csn_snapshot_defer_time;
+
+int csn_time_shift;
+
+/*
+ * CSNSnapshotXidMap
+ *
+ * To be able to install csn snapshot that points to past we need to keep
+ * old versions of tuples and therefore delay advance of oldestXid.  Here we
+ * keep track of correspondence between snapshot's snapshot_csn and oldestXid
+ * that was set at the time when the snapshot was taken.  Much like the
+ * snapshot too old's OldSnapshotControlData does, but with finer granularity
+ * to seconds.
+ *
+ * Different strategies can be employed to hold oldestXid (e.g. we can track
+ * oldest csn-based snapshot among cluster nodes and map it oldestXid
+ * on each node).
+ *
+ * On each snapshot acquisition CSNSnapshotMapXmin() is called and stores
+ * correspondence between current snapshot_csn and oldestXmin in a sparse way:
+ * snapshot_csn is rounded to seconds (and here we use the fact that snapshot_csn
+ * is just a timestamp) and oldestXmin is stored in the circular buffer where
+ * rounded snapshot_csn acts as an offset from current circular buffer head.
+ * Size of the circular buffer is controlled by csn_snapshot_defer_time GUC.
+ *
+ * When csn snapshot arrives we check that its
+ * snapshot_csn is still in our map, otherwise we'll error out with "snapshot too
+ * old" message.  If snapshot_csn is successfully mapped to oldestXid we move
+ * backend's pgxact->xmin to proc->originalXmin and fill pgxact->xmin to
+ * mapped oldestXid.  That way GetOldestXmin() can take into account backends
+ * with imported csn snapshot and old tuple versions will be preserved.
+ *
+ * Also while calculating oldestXmin for our map in presence of imported
+ * csn snapshots we should use proc->originalXmin instead of pgxact->xmin
+ * that was set during import.  Otherwise, we can create a feedback loop:
+ * xmin's of imported csn snapshots were calculated using our map and new
+ * entries in map going to be calculated based on that xmin's, and there is
+ * a risk to stuck forever with one non-increasing oldestXmin.  All other
+ * callers of GetOldestXmin() are using pgxact->xmin so the old tuple versions
+ * are preserved.
+ */
+typedef struct CSNSnapshotXidMap
+{
+	int				 head;				/* offset of current freshest value */
+	int				 size;				/* total size of circular buffer */
+	CSN_atomic		 last_csn_seconds;	/* last rounded csn that changed
+										 * xmin_by_second[] */
+	TransactionId   *xmin_by_second;	/* circular buffer of oldestXmin's */
+}
+CSNSnapshotXidMap;
+
+static CSNSnapshotXidMap *csnXidMap;
+
+
+/* Estimate shared memory space needed */
+Size
+CSNSnapshotShmemSize(void)
+{
+	Size	size = 0;
+
+	if (csn_snapshot_defer_time > 0)
+	{
+		size += sizeof(CSNSnapshotXidMap);
+		size += csn_snapshot_defer_time*sizeof(TransactionId);
+		size = MAXALIGN(size);
+	}
+
+	return size;
+}
+
+/* Init shared memory structures */
+void
+CSNSnapshotShmemInit()
+{
+	bool found;
+
+	if (csn_snapshot_defer_time > 0)
+	{
+		csnXidMap = ShmemInitStruct("csnXidMap",
+								   sizeof(CSNSnapshotXidMap),
+								   &found);
+		if (!found)
+		{
+			int i;
+
+			pg_atomic_init_u64(&csnXidMap->last_csn_seconds, 0);
+			csnXidMap->head = 0;
+			csnXidMap->size = csn_snapshot_defer_time;
+			csnXidMap->xmin_by_second =
+							ShmemAlloc(sizeof(TransactionId)*csnXidMap->size);
+
+			for (i = 0; i < csnXidMap->size; i++)
+				csnXidMap->xmin_by_second[i] = InvalidTransactionId;
+		}
+	}
+}
+
+/*
+ * CSNSnapshotStartup
+ *
+ * Set csnXidMap entries to oldestActiveXID during startup.
+ */
+void
+CSNSnapshotStartup(TransactionId oldestActiveXID)
+{
+	/*
+	 * Run only if we have initialized shared memory and csnXidMap
+	 * is enabled.
+	 */
+	if (IsNormalProcessingMode() &&
+		enable_csn_snapshot && csn_snapshot_defer_time > 0)
+	{
+		int i;
+
+		Assert(TransactionIdIsValid(oldestActiveXID));
+		for (i = 0; i < csnXidMap->size; i++)
+			csnXidMap->xmin_by_second[i] = oldestActiveXID;
+		ProcArraySetCSNSnapshotXmin(oldestActiveXID);
+
+		elog(LOG, "CSN map initialized with oldest active xid %u", oldestActiveXID);
+	}
+}
+
+/*
+ * CSNSnapshotMapXmin
+ *
+ * Maintain circular buffer of oldestXmins for several seconds in past. This
+ * buffer allows to shift oldestXmin in the past when backend is importing
+ * CSN snapshot. Otherwise old versions of tuples that were needed for
+ * this transaction can be recycled by other processes (vacuum, HOT, etc).
+ *
+ * Locking here is not trivial. Called upon each snapshot creation after
+ * ProcArrayLock is released. Such usage creates several race conditions. It
+ * is possible that backend who got csn called CSNSnapshotMapXmin()
+ * only after other backends managed to get snapshot and complete
+ * CSNSnapshotMapXmin() call, or even committed. This is safe because
+ *
+ *		* We already hold our xmin in MyPgXact, so our snapshot will not be
+ *		  harmed even though ProcArrayLock is released.
+ *
+ *		* snapshot_csn is always pessmistically rounded up to the next
+ *		  second.
+ *
+ *		* For performance reasons, xmin value for particular second is filled
+ *		  only once. Because of that instead of writing to buffer just our
+ *		  xmin (which is enough for our snapshot), we bump oldestXmin there --
+ *		  it mitigates the possibility of damaging someone else's snapshot by
+ *		  writing to the buffer too advanced value in case of slowness of
+ *		  another backend who generated csn earlier, but didn't manage to
+ *		  insert it before us.
+ *
+ *		* if CSNSnapshotMapXmin() founds a gap in several seconds between
+ *		  current call and latest completed call then it should fill that gap
+ *		  with latest known values instead of new one. Otherwise it is
+ *		  possible (however highly unlikely) that this gap also happend
+ *		  between taking snapshot and call to CSNSnapshotMapXmin() for some
+ *		  backend. And we are at risk to fill circullar buffer with
+ *		  oldestXmin's that are bigger then they actually were.
+ */
+void
+CSNSnapshotMapXmin(SnapshotCSN snapshot_csn)
+{
+	int offset, gap, i;
+	SnapshotCSN csn_seconds;
+	SnapshotCSN last_csn_seconds;
+	volatile TransactionId oldest_deferred_xmin;
+	TransactionId current_oldest_xmin, previous_oldest_xmin;
+	TransactionId ImportedXmin;
+
+	/* Callers should check config values */
+	Assert(csn_snapshot_defer_time > 0);
+	Assert(csnXidMap != NULL);
+	/*
+	 * Round up snapshot_csn to the next second -- pessimistically and safely.
+	 */
+	csn_seconds = (snapshot_csn / NSECS_PER_SEC + 1);
+
+	/*
+	 * Fast-path check. Avoid taking exclusive CSNSnapshotXidMapLock lock
+	 * if oldestXid was already written to xmin_by_second[] for this rounded
+	 * snapshot_csn.
+	 */
+	if (pg_atomic_read_u64(&csnXidMap->last_csn_seconds) >= csn_seconds)
+		return;
+
+	/* Ok, we have new entry (or entries) */
+	LWLockAcquire(CSNSnapshotXidMapLock, LW_EXCLUSIVE);
+
+	/* Re-check last_csn_seconds under lock */
+	last_csn_seconds = pg_atomic_read_u64(&csnXidMap->last_csn_seconds);
+	if (last_csn_seconds >= csn_seconds)
+	{
+		LWLockRelease(CSNSnapshotXidMapLock);
+		return;
+	}
+	pg_atomic_write_u64(&csnXidMap->last_csn_seconds, csn_seconds);
+
+	/*
+	 * Count oldest_xmin.
+	 *
+	 * It was possible to calculate oldest_xmin during corresponding snapshot
+	 * creation, but GetSnapshotData() intentionally reads only PgXact, but not
+	 * PgProc. And we need info about originalXmin (see comment to csnXidMap)
+	 * which is stored in PgProc because of threats in comments around PgXact
+	 * about extending it with new fields. So just calculate oldest_xmin again,
+	 * that anyway happens quite rarely.
+	 */
+
+	/*
+	 * Don't afraid here because csn_snapshot_xmin will hold border of
+	 * minimal non-removable from vacuuming.
+	 */
+	ImportedXmin = MyProc->xmin;
+	MyProc->xmin = MyProc->originalXmin;
+	current_oldest_xmin = GetOldestNonRemovableTransactionId(NULL);
+	MyProc->xmin = ImportedXmin;
+	Assert(TransactionIdIsNormal(current_oldest_xmin));
+
+	previous_oldest_xmin = csnXidMap->xmin_by_second[csnXidMap->head];
+	Assert(TransactionIdIsNormal(previous_oldest_xmin) || !enable_csn_snapshot);
+
+	gap = csn_seconds - last_csn_seconds;
+	offset = csn_seconds % csnXidMap->size;
+
+	/* Sanity check before we update head and gap */
+	Assert( gap >= 1 );
+	Assert( (csnXidMap->head + gap) % csnXidMap->size == offset );
+
+	gap = gap > csnXidMap->size ? csnXidMap->size : gap;
+	csnXidMap->head = offset;
+
+	/* Fill new entry with current_oldest_xmin */
+	csnXidMap->xmin_by_second[offset] = current_oldest_xmin;
+
+	/*
+	 * If we have gap then fill it with previous_oldest_xmin for reasons
+	 * outlined in comment above this function.
+	 */
+	for (i = 1; i < gap; i++)
+	{
+		offset = (offset + csnXidMap->size - 1) % csnXidMap->size;
+		csnXidMap->xmin_by_second[offset] = previous_oldest_xmin;
+	}
+
+	oldest_deferred_xmin =
+		csnXidMap->xmin_by_second[ (csnXidMap->head + 1) % csnXidMap->size ];
+
+	LWLockRelease(CSNSnapshotXidMapLock);
+
+	elog(DEBUG5, "Advance xmin for CSN. Oldest deferred xmin = %u",
+		 oldest_deferred_xmin);
+
+	/*
+	 * Advance procArray->csn_snapshot_xmin after we released
+	 * CSNSnapshotXidMapLock. Since we gather not xmin but oldestXmin, it
+	 * never goes backwards regardless of how slow we can do that.
+	 */
+	/*Assert(TransactionIdFollowsOrEquals(oldest_deferred_xmin,
+										ProcArrayGetCSNSnapshotXmin()));*/
+	ProcArraySetCSNSnapshotXmin(oldest_deferred_xmin);
+}
+
+
+/*
+ * CSNSnapshotToXmin
+ *
+ * Get oldestXmin that took place when snapshot_csn was taken.
+ */
+TransactionId
+CSNSnapshotToXmin(SnapshotCSN snapshot_csn)
+{
+	TransactionId xmin;
+	SnapshotCSN csn_seconds;
+	volatile SnapshotCSN last_csn_seconds;
+
+	/* Callers should check config values */
+	Assert(csn_snapshot_defer_time > 0);
+	Assert(csnXidMap != NULL);
+
+	/* Round down to get conservative estimates */
+	csn_seconds = (snapshot_csn / NSECS_PER_SEC);
+
+	LWLockAcquire(CSNSnapshotXidMapLock, LW_SHARED);
+	last_csn_seconds = pg_atomic_read_u64(&csnXidMap->last_csn_seconds);
+	if (csn_seconds > last_csn_seconds)
+	{
+		/* we don't have entry for this snapshot_csn yet, return latest known */
+		xmin = csnXidMap->xmin_by_second[csnXidMap->head];
+	}
+	else if (last_csn_seconds - csn_seconds < csnXidMap->size)
+	{
+		/* we are good, retrieve value from our map */
+		Assert(last_csn_seconds % csnXidMap->size == csnXidMap->head);
+		xmin = csnXidMap->xmin_by_second[csn_seconds % csnXidMap->size];
+	}
+	else
+	{
+		/* requested snapshot_csn is too old, let caller know */
+		xmin = InvalidTransactionId;
+	}
+	LWLockRelease(CSNSnapshotXidMapLock);
+
+	return xmin;
+}
+
+/*
+ * CSNSnapshotPrepareCurrent
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+SnapshotCSN
+CSNSnapshotPrepareCurrent(void)
+{
+	TransactionId xid = GetCurrentTransactionIdIfAny();
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (TransactionIdIsValid(xid))
+	{
+		TransactionId *subxids;
+		int nsubxids = xactGetCommittedChildren(&subxids);
+		CSNLogSetCSN(xid, nsubxids, subxids, InDoubtCSN, true);
+	}
+
+	/* Nothing to write if we don't have xid */
+
+	return GenerateCSN(false, InvalidCSN);
+}
+
+
+/*
+ * CSNSnapshotAssignCurrent
+ *
+ * Assign SnapshotCSN to the currently active transaction. SnapshotCSN is supposedly
+ * maximal among of values returned by CSNSnapshotPrepareCurrent and
+ * pg_csn_snapshot_prepare.
+ */
+void
+CSNSnapshotAssignCurrent(SnapshotCSN snapshot_csn)
+{
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (!CSNIsNormal(snapshot_csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_csn_snapshot_assign expects normal snapshot_csn")));
+
+	Assert(snapshot_csn != InvalidCSN);
+	/* We do not care the Generate result, we just want to make sure max
+	 * csnShared->last_max_csn value.
+	 */
+	GenerateCSN(false, snapshot_csn);
+
+	/* Set csn and defuse ProcArrayEndTransaction from assigning one */
+	pg_atomic_write_u64(&MyProc->assignedCSN, snapshot_csn);
+}
+
+/*
+ * CSNSnapshotSync
+ *
+ * Due to time desynchronization on different nodes we can receive snapshot_csn
+ * which is greater than snapshot_csn on this node. To preserve proper isolation
+ * this node needs to wait when such snapshot_csn comes on local clock.
+ *
+ * This should happend relatively rare if nodes have running NTP/PTP/etc.
+ * Complain if wait time is more than SNAP_SYNC_COMPLAIN.
+ */
+void
+CSNSnapshotSync(SnapshotCSN remote_csn)
+{
+	SnapshotCSN	local_csn;
+	SnapshotCSN	delta;
+
+	Assert(enable_csn_snapshot);
+
+	for(;;)
+	{
+		if (GetLastGeneratedCSN() > remote_csn)
+			return;
+
+		local_csn = GenerateCSN(true, InvalidCSN);
+
+		if (local_csn >= remote_csn)
+			/*
+			 * Everything is fine too, but last_max_csn wasn't updated for
+			 * some time.
+			 */
+			return;
+
+		/* Okay we need to sleep now */
+		delta = remote_csn - local_csn;
+		if (delta > SNAP_DESYNC_COMPLAIN)
+			ereport(WARNING,
+				(errmsg("remote global snapshot exceeds ours by more than a second"),
+				 errhint("Consider running NTPd on servers participating in global transaction")));
+
+		/* TODO: report this sleeptime somewhere? */
+		pg_usleep((long) (delta/NSECS_PER_USEC));
+
+		/*
+		 * Loop that checks to ensure that we actually slept for specified
+		 * amount of time.
+		 */
+	}
+
+	Assert(false); /* Should not happend */
+	return;
+}
+
+/*
+ * TransactionIdGetCSN
+ *
+ * Get CSN for specified TransactionId taking care about special xids,
+ * xids beyond TransactionXmin and InDoubt states.
+ */
+CSN
+TransactionIdGetCSN(TransactionId xid)
+{
+	CSN csn;
+
+	/* Handle permanent TransactionId's for which we don't have mapping */
+	if (!TransactionIdIsNormal(xid))
+	{
+		if (xid == InvalidTransactionId)
+			return AbortedCSN;
+		if (xid == FrozenTransactionId || xid == BootstrapTransactionId)
+			return FrozenCSN;
+		Assert(false); /* Should not happend */
+	}
+
+	/*
+	 * If we just switch a xid-snapsot to a csn_snapshot, we should handle a start
+	 * xid for csn base check. Just in case we have prepared transaction which
+	 * hold the TransactionXmin but without CSN.
+	 */
+	xmin_for_csn = GetOldestXmin();
+
+	/*
+	 * For the xid with 'xid >= TransactionXmin and xid < xmin_for_csn',
+	 * it defined as unclear csn which follow xid-snapshot result.
+	 */
+	if(!TransactionIdPrecedes(xid, TransactionXmin) &&
+							TransactionIdPrecedes(xid, xmin_for_csn))
+	{
+		elog(LOG, "UnclearCSN was returned. xid=%u, TransactionXmin=%u, xmin_for_csn=%u",
+			xid, TransactionXmin, xmin_for_csn);
+		return UnclearCSN;
+	}
+	/*
+	 * For xids which less then TransactionXmin CSNLog can be already
+	 * trimmed but we know that such transaction is definitely not concurrently
+	 * running according to any snapshot including timetravel ones. Callers
+	 * should check TransactionDidCommit after.
+	 */
+	if (TransactionIdPrecedes(xid, TransactionXmin))
+		return FrozenCSN;
+
+	/* Read CSN from SLRU */
+	csn = CSNLogGetCSNByXid(xid);
+
+	/*
+	 * If we faced InDoubt state then transaction is being committed and we
+	 * should wait until CSN will be assigned so that visibility check
+	 * could decide whether tuple is in snapshot. See also comments in
+	 * CSNSnapshotPrecommit().
+	 */
+	if (CSNIsInDoubt(csn))
+	{
+		XactLockTableWait(SubTransGetTopmostTransaction(xid), NULL, NULL, XLTW_None);
+		csn = CSNLogGetCSNByXid(xid);
+		Assert(CSNIsNormal(csn) || CSNIsAborted(csn));
+	}
+
+	Assert(CSNIsNormal(csn) || CSNIsInProgress(csn) || CSNIsAborted(csn));
+	return csn;
+}
+
+/*
+ * XidInCSNSnapshot
+ *
+ * Version of XidInMVCCSnapshot for transactions. For non-imported
+ * csn snapshots this should give same results as XidInLocalMVCCSnapshot
+ * (except that aborts will be shown as invisible without going to clog) and to
+ * ensure such behaviour XidInMVCCSnapshot is coated with asserts that checks
+ * identicalness of XidInCSNSnapshot/XidInLocalMVCCSnapshot in
+ * case of ordinary snapshot.
+ */
+bool
+XidInCSNSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	CSN csn;
+
+	csn = TransactionIdGetCSN(xid);
+
+	if (CSNIsNormal(csn))
+		return (csn >= snapshot->snapshot_csn);
+	else if (CSNIsFrozen(csn))
+	{
+		/* It is bootstrap or frozen transaction */
+		return false;
+	}
+	else if(CSNIsUnclear(csn))
+	{
+		/*
+		 * Some xid can not figure out csn because of snapshot switch,
+		 * and we can follow xid-base result.
+		 */
+		return true;
+	}
+	else
+	{
+		/* It is aborted or in-progress */
+		Assert(CSNIsAborted(csn) || CSNIsInProgress(csn));
+		if (CSNIsAborted(csn))
+			Assert(TransactionIdDidAbort(xid));
+		return true;
+	}
+}
+
+
+/*****************************************************************************
+ * Functions to handle transactions commit.
+ *
+ * For local transactions CSNSnapshotPrecommit sets InDoubt state before
+ * ProcArrayEndTransaction is called and transaction data potetntially becomes
+ * visible to other backends. ProcArrayEndTransaction (or ProcArrayRemove in
+ * twophase case) then acquires csn under ProcArray lock and stores it
+ * in proc->assignedCSN. It's important that csn for commit is
+ * generated under ProcArray lock, otherwise snapshots won't
+ * be equivalent. Consequent call to CSNSnapshotCommit will write
+ * proc->assignedCSN to CSNLog.
+ *
+ *
+ * CSNSnapshotAbort is slightly different comparing to commit because abort
+ * can skip InDoubt phase and can be called for transaction subtree.
+ *****************************************************************************/
+
+
+/*
+ * CSNSnapshotAbort
+ *
+ * Abort transaction in CsnLog. We can skip InDoubt state for aborts
+ * since no concurrent transactions allowed to see aborted data anyway.
+ */
+void
+CSNSnapshotAbort(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	if (!get_csnlog_status())
+		return;
+
+	CSNLogSetCSN(xid, nsubxids, subxids, AbortedCSN, true);
+
+	/*
+	 * Clean assignedCSN anyway, as it was possibly set in
+	 * XidSnapshotAssignCsnCurrent.
+	 */
+	pg_atomic_write_u64(&proc->assignedCSN, InProgressCSN);
+}
+
+/*
+ * CSNSnapshotPrecommit
+ *
+ * Set InDoubt status for local transaction that we are going to commit.
+ * This step is needed to achieve consistency between local snapshots and
+ * csn-based snapshots. We don't hold ProcArray lock while writing
+ * csn for transaction in SLRU but instead we set InDoubt status before
+ * transaction is deleted from ProcArray so the readers who will read csn
+ * in the gap between ProcArray removal and CSN assignment can wait
+ * until CSN is finally assigned. See also TransactionIdGetCSN().
+ *
+ * This should be called only from parallel group leader before backend is
+ * deleted from ProcArray.
+ */
+void
+CSNSnapshotPrecommit(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	CSN oldassignedCSN = InProgressCSN;
+	bool in_progress;
+
+	if (!get_csnlog_status())
+		return;
+
+	/* Set InDoubt status if it is local transaction */
+	in_progress = pg_atomic_compare_exchange_u64(&proc->assignedCSN,
+												 &oldassignedCSN,
+												 InDoubtCSN);
+	if (in_progress)
+	{
+		Assert(CSNIsInProgress(oldassignedCSN));
+		CSNLogSetCSN(xid, nsubxids, subxids, InDoubtCSN, true);
+	}
+	else
+	{
+		/* Otherwise we should have valid CSN by this time */
+		Assert(CSNIsNormal(oldassignedCSN));
+		Assert(CSNIsInDoubt(CSNLogGetCSNByXid(xid)));
+	}
+}
+
+/*
+ * CSNSnapshotCommit
+ *
+ * Write CSN that were acquired earlier to CsnLog. Should be
+ * preceded by CSNSnapshotPrecommit() so readers can wait until we finally
+ * finished writing to SLRU.
+ *
+ * Should be called after ProcArrayEndTransaction, but before releasing
+ * transaction locks, so that TransactionIdGetCSN can wait on this
+ * lock for CSN.
+ */
+void
+CSNSnapshotCommit(PGPROC *proc, TransactionId xid,
+				  int nsubxids, TransactionId *subxids)
+{
+	volatile CSN assignedCSN;
+
+	if (!get_csnlog_status())
+		return;
+
+	if (!TransactionIdIsValid(xid))
+	{
+		assignedCSN = pg_atomic_read_u64(&proc->assignedCSN);
+		Assert(CSNIsInProgress(assignedCSN));
+		return;
+	}
+
+	/* Finally write resulting CSN in SLRU */
+	assignedCSN = pg_atomic_read_u64(&proc->assignedCSN);
+	Assert(CSNIsNormal(assignedCSN));
+	CSNLogSetCSN(xid, nsubxids, subxids, assignedCSN, true);
+
+	/* Reset for next transaction */
+	pg_atomic_write_u64(&proc->assignedCSN, InProgressCSN);
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 58091f6b52..b86c172e46 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -10,6 +10,7 @@
 #include "access/brin_xlog.h"
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/generic_xlog.h"
 #include "access/ginxlog.h"
 #include "access/gistxlog.h"
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 28b153abc3..7bc6aae9a4 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_snapshot.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
@@ -1536,8 +1537,34 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 									   hdr->nabortrels, abortrels,
 									   gid);
 
+	/*
+	 * CSNSnapshot callbacks that should be called right before we are
+	 * going to become visible. Details in comments to this functions.
+	 */
+	if (isCommit)
+		CSNSnapshotPrecommit(proc, xid, hdr->nsubxacts, children);
+	else
+		CSNSnapshotAbort(proc, xid, hdr->nsubxacts, children);
+
+
 	ProcArrayRemove(proc, latestXid);
 
+	/*
+	 * Stamp our transaction with CSN in CSNLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks, since TransactionIdGetCSN relies on
+	 * XactLockTableWait to await csn.
+	 */
+	if (isCommit)
+	{
+		CSNSnapshotCommit(proc, xid, hdr->nsubxacts, children);
+	}
+	else
+	{
+		Assert(CSNIsInProgress(
+				   pg_atomic_read_u64(&proc->assignedCSN)));
+	}
+
 	/*
 	 * In case we fail while running the callbacks, mark the gxact invalid so
 	 * no one else will try to commit/rollback, and so it will be recycled if
@@ -2583,3 +2610,130 @@ LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn,
 	LWLockRelease(TwoPhaseStateLock);
 	return found;
 }
+
+/*
+ * CSNSnapshotPrepareTwophase
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+static SnapshotCSN
+CSNSnapshotPrepareTwophase(const char *gid)
+{
+	GlobalTransaction	gxact;
+	PGPROC				*proc;
+	char				*buf;
+	TransactionId		xid;
+	xl_xact_parsed_prepare parsed;
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+	xid = proc->xid;
+
+	if (gxact->ondisk)
+		buf = ReadTwoPhaseFile(xid, true);
+	else
+		XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
+
+	ParsePrepareRecord(0, (xl_xact_prepare *)buf, &parsed);
+
+	CSNLogSetCSN(xid, parsed.nsubxacts,
+					parsed.subxacts, InDoubtCSN, true);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+
+	pfree(buf);
+	return GenerateCSN(false, InvalidCSN);
+}
+
+/*
+ * CSNSnapshotAssignTwoPhase
+ *
+ * Asign SnapshotCSN for currently active transaction. SnapshotCSN is supposedly
+ * maximal among of values returned by CSNSnapshotPrepareCurrent and
+ * pg_csn_snapshot_prepare.
+ *
+ * This function is a counterpart of CSNSnapshotAssignCurrent() for
+ * twophase transactions.
+ */
+static void
+CSNSnapshotAssignTwoPhase(const char *gid, SnapshotCSN csn)
+{
+	GlobalTransaction gxact;
+	PGPROC	   *proc;
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (!CSNIsNormal(csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_csn_snapshot_assign expects normal snapshot_csn")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+	Assert(csn != InvalidCSN);
+	/* We do not care the Generate result, we just want to make sure max
+	 * csnShared->last_max_csn value.
+	 */
+	GenerateCSN(false, csn);
+	/* Set snapshot_csn and defuse ProcArrayRemove from assigning one. */
+	pg_atomic_write_u64(&proc->assignedCSN, csn);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+}
+
+/*
+ * SQL interface to CSNSnapshotPrepareTwophase()
+ *
+ * TODO: Rewrite this as PREPARE TRANSACTION 'gid' RETURNING SNAPSHOT
+ */
+Datum
+pg_csn_snapshot_prepare(PG_FUNCTION_ARGS)
+{
+	const char 	*gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	SnapshotCSN	csn = CSNSnapshotPrepareTwophase(gid);
+
+	PG_RETURN_INT64(csn);
+}
+
+/*
+ * SQL interface to CSNSnapshotAssignTwoPhase()
+ *
+ * TODO: Rewrite this as COMMIT PREPARED 'gid' SNAPSHOT 'csn'
+ */
+Datum
+pg_csn_snapshot_assign(PG_FUNCTION_ARGS)
+{
+	const char *gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	SnapshotCSN	csn = PG_GETARG_INT64(1);
+
+	CSNSnapshotAssignTwoPhase(gid, csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index a6e98e71bd..8e1d074806 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -15,6 +15,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -175,6 +176,7 @@ GetNewTransactionId(bool isSubXact)
 	 * Extend pg_subtrans and pg_commit_ts too.
 	 */
 	ExtendCLOG(xid);
+	ExtendCSNLog(xid);
 	ExtendCommitTs(xid);
 	ExtendSUBTRANS(xid);
 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 8e35c432f5..e6baf880d9 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_snapshot.h"
 #include "access/multixact.h"
 #include "access/parallel.h"
 #include "access/subtrans.h"
@@ -1418,6 +1419,12 @@ RecordTransactionCommit(void)
 		TransactionTreeSetCommitTsData(xid, nchildren, children,
 									   replorigin_session_origin_timestamp,
 									   replorigin_session_origin);
+
+		/*
+		 * Mark our transaction as InDoubt in CsnLog and get ready for
+		 * commit.
+		 */
+		CSNSnapshotPrecommit(MyProc, xid, nchildren, children);
 	}
 
 	/*
@@ -1772,6 +1779,9 @@ RecordTransactionAbort(bool isSubXact)
 	 */
 	TransactionIdAbortTree(xid, nchildren, children);
 
+	/* Mark our transaction as Aborted in CSN Log. */
+	CSNSnapshotAbort(MyProc, xid, nchildren, children);
+
 	END_CRIT_SECTION();
 
 	/* Compute latestXid while we have the child XIDs handy */
@@ -2114,6 +2124,13 @@ StartTransaction(void)
 	ShowTransactionState("StartTransaction");
 }
 
+Datum
+pg_current_csn(PG_FUNCTION_ARGS)
+{
+	SnapshotCSN	csn = GenerateCSN(false, InvalidCSN);
+
+	PG_RETURN_INT64(csn);
+}
 
 /*
  *	CommitTransaction
@@ -2262,6 +2279,21 @@ CommitTransaction(void)
 	 */
 	ProcArrayEndTransaction(MyProc, latestXid);
 
+	/*
+	 * Stamp our transaction with CSN in CsnLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks.
+	 */
+	if (!is_parallel_worker)
+	{
+		TransactionId xid = GetTopTransactionIdIfAny();
+		TransactionId *subxids;
+		int nsubxids;
+
+		nsubxids = xactGetCommittedChildren(&subxids);
+		CSNSnapshotCommit(MyProc, xid, nsubxids, subxids);
+	}
+
 	/*
 	 * This is all post-commit cleanup.  Note that if an error is raised here,
 	 * it's too late to abort the transaction.  This should be just
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1616448368..2a8de10038 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -24,6 +24,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/heaptoast.h"
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
@@ -4747,6 +4748,7 @@ InitControlFile(uint64 sysidentifier)
 	ControlFile->wal_level = wal_level;
 	ControlFile->wal_log_hints = wal_log_hints;
 	ControlFile->track_commit_timestamp = track_commit_timestamp;
+	ControlFile->enable_csn_snapshot = enable_csn_snapshot;
 	ControlFile->data_checksum_version = bootstrap_data_checksum_version;
 }
 
@@ -7181,6 +7183,9 @@ StartupXLOG(void)
 	if (ControlFile->track_commit_timestamp)
 		StartupCommitTs();
 
+	if(ControlFile->enable_csn_snapshot)
+		StartupCSN();
+
 	/*
 	 * Recover knowledge about replay progress of known replication partners.
 	 */
@@ -7448,6 +7453,8 @@ StartupXLOG(void)
 			 */
 			StartupSUBTRANS(oldestActiveXID);
 
+			CSNSnapshotStartup(oldestActiveXID);
+
 			/*
 			 * If we're beginning at a shutdown checkpoint, we know that
 			 * nothing was running on the primary at this point. So fake-up an
@@ -8117,7 +8124,10 @@ StartupXLOG(void)
 	 * timestamps are started below, if necessary.)
 	 */
 	if (standbyState == STANDBY_DISABLED)
+	{
 		StartupSUBTRANS(oldestActiveXID);
+		CSNSnapshotStartup(oldestActiveXID);
+	}
 
 	/*
 	 * Perform end of recovery actions for any SLRUs that need it.
@@ -8183,6 +8193,7 @@ StartupXLOG(void)
 	 * commit timestamp.
 	 */
 	CompleteCommitTsInitialization();
+	CompleteCSNInitialization();
 
 	/*
 	 * All done with end-of-recovery actions.
@@ -9616,6 +9627,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
 	CheckPointCLOG();
+	CheckPointCSNLog();
 	CheckPointCommitTs();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
@@ -9894,7 +9906,10 @@ CreateRestartPoint(int flags)
 	 * this because StartupSUBTRANS hasn't been called yet.
 	 */
 	if (EnableHotStandby)
+	{
 		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
+		TruncateCSNLog(GetOldestTransactionIdConsideredRunning());
+	}
 
 	/* Real work is done; log and update stats. */
 	LogCheckpointEnd(true);
@@ -10172,7 +10187,8 @@ XLogReportParameters(void)
 		max_wal_senders != ControlFile->max_wal_senders ||
 		max_prepared_xacts != ControlFile->max_prepared_xacts ||
 		max_locks_per_xact != ControlFile->max_locks_per_xact ||
-		track_commit_timestamp != ControlFile->track_commit_timestamp)
+		track_commit_timestamp != ControlFile->track_commit_timestamp ||
+		enable_csn_snapshot != ControlFile->enable_csn_snapshot)
 	{
 		/*
 		 * The change in number of backend slots doesn't need to be WAL-logged
@@ -10194,6 +10210,7 @@ XLogReportParameters(void)
 			xlrec.wal_level = wal_level;
 			xlrec.wal_log_hints = wal_log_hints;
 			xlrec.track_commit_timestamp = track_commit_timestamp;
+			xlrec.enable_csn_snapshot = enable_csn_snapshot;
 
 			XLogBeginInsert();
 			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
@@ -10212,6 +10229,7 @@ XLogReportParameters(void)
 		ControlFile->wal_level = wal_level;
 		ControlFile->wal_log_hints = wal_log_hints;
 		ControlFile->track_commit_timestamp = track_commit_timestamp;
+		ControlFile->enable_csn_snapshot = enable_csn_snapshot;
 		UpdateControlFile();
 
 		LWLockRelease(ControlFileLock);
@@ -10665,6 +10683,9 @@ xlog_redo(XLogReaderState *record)
 		CommitTsParameterChange(xlrec.track_commit_timestamp,
 								ControlFile->track_commit_timestamp);
 		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
+		CSNlogParameterChange(xlrec.enable_csn_snapshot,
+								ControlFile->enable_csn_snapshot);
+		ControlFile->enable_csn_snapshot = xlrec.enable_csn_snapshot;
 
 		UpdateControlFile();
 		LWLockRelease(ControlFileLock);
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 689384a411..e6585a94ba 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -73,6 +73,8 @@ typedef struct
 	char		compressed_page[COMPRESS_BUFSIZE];
 } registered_buffer;
 
+bool enable_csn_wal = true;
+
 static registered_buffer *registered_buffers;
 static int	max_registered_buffers; /* allocated size */
 static int	max_registered_block_id = 0;	/* highest block_id + 1 currently
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 5c4bc15b44..e64ada86c7 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -53,7 +53,7 @@
 #include "utils/memutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
-
+#include "access/csn_log.h"
 
 /*
  * GUC parameters
@@ -1760,6 +1760,7 @@ vac_truncate_clog(TransactionId frozenXID,
 	 */
 	TruncateCLOG(frozenXID, oldestxid_datoid);
 	TruncateCommitTs(frozenXID);
+	TruncateCSNLog(frozenXID);
 	TruncateMultiXact(minMulti, minmulti_datoid);
 
 	/*
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 9fa3e0631e..2a7e184da9 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -16,6 +16,8 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
+#include "access/csn_snapshot.h"
 #include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -120,6 +122,8 @@ CalculateShmemSize(int *num_semaphores)
 	size = add_size(size, ProcGlobalShmemSize());
 	size = add_size(size, XLOGShmemSize());
 	size = add_size(size, CLOGShmemSize());
+	size = add_size(size, CSNLogShmemSize());
+	size = add_size(size, CSNSnapshotShmemSize());
 	size = add_size(size, CommitTsShmemSize());
 	size = add_size(size, SUBTRANSShmemSize());
 	size = add_size(size, TwoPhaseShmemSize());
@@ -242,6 +246,8 @@ CreateSharedMemoryAndSemaphores(void)
 	 */
 	XLOGShmemInit();
 	CLOGShmemInit();
+	CSNLogShmemInit();
+	CSNSnapshotShmemInit();
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 892f0f6799..5bc7370c73 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -48,6 +48,7 @@
 #include <signal.h>
 
 #include "access/clog.h"
+#include "access/csn_snapshot.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -96,6 +97,8 @@ typedef struct ProcArrayStruct
 	TransactionId replication_slot_xmin;
 	/* oldest catalog xmin of any replication slot */
 	TransactionId replication_slot_catalog_xmin;
+	/* xmin of oldest active csn snapshot */
+	TransactionId csn_snapshot_xmin;
 
 	/* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */
 	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
@@ -429,6 +432,7 @@ CreateSharedProcArray(void)
 		procArray->lastOverflowedXid = InvalidTransactionId;
 		procArray->replication_slot_xmin = InvalidTransactionId;
 		procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+		procArray->csn_snapshot_xmin = InvalidTransactionId;
 		ShmemVariableCache->xactCompletionCount = 1;
 	}
 
@@ -577,6 +581,14 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 		/* Advance global latestCompletedXid while holding the lock */
 		MaintainLatestCompletedXid(latestXid);
 
+		/*
+		 * Assign xid csn while holding ProcArrayLock for non-distributed
+		 * COMMIT PREPARED. After lock is released consequent
+		 * CSNSnapshotCommit() will write this value to CsnLog.
+		 */
+		if (CSNIsInDoubt(pg_atomic_read_u64(&proc->assignedCSN)))
+			pg_atomic_write_u64(&proc->assignedCSN, GenerateCSN(false, InvalidCSN));
+
 		/* Same with xactCompletionCount  */
 		ShmemVariableCache->xactCompletionCount++;
 
@@ -691,6 +703,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 		proc->xmin = InvalidTransactionId;
 		proc->delayChkpt = false;	/* be sure this is cleared in abort */
 		proc->recoveryConflictPending = false;
+		proc->originalXmin = InvalidTransactionId;
 
 		/* must be cleared with xid/xmin: */
 		/* avoid unnecessarily dirtying shared cachelines */
@@ -730,6 +743,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	proc->xmin = InvalidTransactionId;
 	proc->delayChkpt = false;	/* be sure this is cleared in abort */
 	proc->recoveryConflictPending = false;
+	proc->originalXmin = InvalidTransactionId;
 
 	/* must be cleared with xid/xmin: */
 	/* avoid unnecessarily dirtying shared cachelines */
@@ -753,6 +767,16 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	/* Also advance global latestCompletedXid while holding the lock */
 	MaintainLatestCompletedXid(latestXid);
 
+	/*
+	 * Assign xid csn while holding ProcArrayLock for
+	 * COMMIT.
+	 *
+	 * TODO: in case of group commit we can generate one CSNSnapshot for
+	 * whole group to save time on timestamp aquisition.
+	 */
+	if (CSNIsInDoubt(pg_atomic_read_u64(&proc->assignedCSN)))
+		pg_atomic_write_u64(&proc->assignedCSN, GenerateCSN(false, InvalidCSN));
+
 	/* Same with xactCompletionCount  */
 	ShmemVariableCache->xactCompletionCount++;
 }
@@ -912,6 +936,7 @@ ProcArrayClearTransaction(PGPROC *proc)
 	proc->lxid = InvalidLocalTransactionId;
 	proc->xmin = InvalidTransactionId;
 	proc->recoveryConflictPending = false;
+	proc->originalXmin = InvalidTransactionId;
 
 	Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK));
 	Assert(!proc->delayChkpt);
@@ -1204,6 +1229,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
 	{
 		ExtendSUBTRANS(latestObservedXid);
+		ExtendCSNLog(latestObservedXid);
 		TransactionIdAdvance(latestObservedXid);
 	}
 	TransactionIdRetreat(latestObservedXid);	/* = running->nextXid - 1 */
@@ -1704,6 +1730,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	TransactionId kaxmin;
 	bool		in_recovery = RecoveryInProgress();
 	TransactionId *other_xids = ProcGlobal->xids;
+	TransactionId csn_snapshot_xmin = InvalidTransactionId;
 
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
@@ -1843,6 +1870,10 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	if (in_recovery)
 		kaxmin = KnownAssignedXidsGetOldestXmin();
 
+	/* Get value of xmin, delayed by a CSN snapshot settings. */
+	if (get_csnlog_status() && csn_snapshot_defer_time > 0 && IsUnderPostmaster)
+		csn_snapshot_xmin = ProcArrayGetCSNSnapshotXmin();
+
 	/*
 	 * No other information from shared state is needed, release the lock
 	 * immediately. The rest of the computations can be done without a lock.
@@ -1899,6 +1930,15 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	h->data_oldest_nonremovable =
 		TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin);
 
+	/*
+	 * Hold non-removable border because distributed transactions
+	 * can wish to see old data.
+	 */
+	h->shared_oldest_nonremovable =
+		TransactionIdOlder(h->shared_oldest_nonremovable, csn_snapshot_xmin);
+	h->data_oldest_nonremovable =
+		TransactionIdOlder(h->data_oldest_nonremovable, csn_snapshot_xmin);
+
 	/*
 	 * The only difference between catalog / data horizons is that the slot's
 	 * catalog xmin is applied to the catalog one (so catalogs can be accessed
@@ -2133,6 +2173,9 @@ GetSnapshotDataReuse(Snapshot snapshot)
 	if (curXactCompletionCount != snapshot->snapXactCompletionCount)
 		return false;
 
+	if (get_csnlog_status())
+		return false;
+
 	/*
 	 * If the current xactCompletionCount is still the same as it was at the
 	 * time the snapshot was built, we can be sure that rebuilding the
@@ -2212,6 +2255,8 @@ GetSnapshotData(Snapshot snapshot)
 	int			count = 0;
 	int			subcount = 0;
 	bool		suboverflowed = false;
+	CSN			csn = FrozenCSN;
+	TransactionId csn_snapshot_xmin = InvalidTransactionId;
 	FullTransactionId latest_completed;
 	TransactionId oldestxid;
 	int			mypgxactoff;
@@ -2444,6 +2489,20 @@ GetSnapshotData(Snapshot snapshot)
 	if (!TransactionIdIsValid(MyProc->xmin))
 		MyProc->xmin = TransactionXmin = xmin;
 
+	/* Take CSN under ProcArrayLock so the snapshot stays synchronized. */
+	if (!snapshot->takenDuringRecovery && get_csnlog_status())
+		csn = GenerateCSN(false, InvalidCSN);
+
+	if (get_csnlog_status() && csn_snapshot_defer_time > 0 && IsUnderPostmaster)
+	{
+		CSNSnapshotMapXmin(snapshot->snapshot_csn);
+
+		/* Get value of xmin, delayed by a CSN snapshot settings. */
+		csn_snapshot_xmin = ProcArrayGetCSNSnapshotXmin();
+		/* Adjust an oldest xid value with a xmin, delayed by CSN options. */
+		oldestxid = TransactionIdOlder(oldestxid, csn_snapshot_xmin);
+	}
+
 	LWLockRelease(ProcArrayLock);
 
 	/* maintain state for GlobalVis* */
@@ -2469,6 +2528,10 @@ GetSnapshotData(Snapshot snapshot)
 		def_vis_xid_data =
 			TransactionIdOlder(def_vis_xid_data, replication_slot_xmin);
 
+		/* The csn-related settings can require an older xmin. */
+		def_vis_xid_data =
+			TransactionIdOlder(def_vis_xid_data, csn_snapshot_xmin);
+
 		/*
 		 * Rows in non-shared, non-catalog tables possibly could be vacuumed
 		 * if older than this xid.
@@ -2549,6 +2612,8 @@ GetSnapshotData(Snapshot snapshot)
 	snapshot->active_count = 0;
 	snapshot->regd_count = 0;
 	snapshot->copied = false;
+	snapshot->imported_csn = false;
+	snapshot->snapshot_csn = csn;
 
 	GetSnapshotDataInitOldSnapshot(snapshot);
 
@@ -3901,6 +3966,25 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 	LWLockRelease(ProcArrayLock);
 }
 
+/*
+ * ProcArraySetCSNSnapshotXmin
+ */
+void
+ProcArraySetCSNSnapshotXmin(TransactionId xmin)
+{
+	/* We rely on atomic fetch/store of xid */
+	procArray->csn_snapshot_xmin = xmin;
+}
+
+/*
+ * ProcArrayGetCSNSnapshotXmin
+ */
+TransactionId
+ProcArrayGetCSNSnapshotXmin(void)
+{
+	return procArray->csn_snapshot_xmin;
+}
+
 /*
  * XidCacheRemoveRunningXids
  *
@@ -4383,6 +4467,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
 		while (TransactionIdPrecedes(next_expected_xid, xid))
 		{
 			TransactionIdAdvance(next_expected_xid);
+			ExtendCSNLog(next_expected_xid);
 			ExtendSUBTRANS(next_expected_xid);
 		}
 		Assert(next_expected_xid == xid);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 862097352b..1f78161d9a 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -135,6 +135,8 @@ static const char *const BuiltinTrancheNames[] = {
 	"CommitTSBuffer",
 	/* LWTRANCHE_SUBTRANS_BUFFER: */
 	"SubtransBuffer",
+	/* LWTRANCHE_CSN_LOG_BUFFERS */
+	"CSNLogBuffer",
 	/* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
 	"MultiXactOffsetBuffer",
 	/* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c295..e8ca393611 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,5 @@ XactTruncationLock					44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
+CSNLogSLRULock				    	48
+CSNSnapshotXidMapLock			    49
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index b7d9da0aa9..88f4f42456 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -35,9 +35,11 @@
 #include <unistd.h>
 #include <sys/time.h>
 
+#include "access/csn_snapshot.h"
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xlogutils.h"
+#include "access/xact.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
@@ -440,6 +442,9 @@ InitProcess(void)
 	MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
 	Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
 
+	MyProc->originalXmin = InvalidTransactionId;
+	pg_atomic_init_u64(&MyProc->assignedCSN, InProgressCSN);
+
 	/*
 	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
 	 * on it.  That allows us to repoint the process latch, which so far
@@ -585,6 +590,7 @@ InitAuxiliaryProcess(void)
 	MyProc->lwWaitMode = 0;
 	MyProc->waitLock = NULL;
 	MyProc->waitProcLock = NULL;
+	MyProc->originalXmin = InvalidTransactionId;
 	pg_atomic_write_u64(&MyProc->waitStart, 0);
 #ifdef USE_ASSERT_CHECKING
 	{
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index d4083e8a56..383f1e4566 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -20,6 +20,7 @@
 
 #include "access/commit_ts.h"
 #include "access/clog.h"
+#include "access/csn_log.h"
 #include "access/multixact.h"
 #include "access/xlog.h"
 #include "access/xlogutils.h"
@@ -119,6 +120,10 @@ static const SyncOps syncsw[] = {
 	/* pg_multixact/members */
 	[SYNC_HANDLER_MULTIXACT_MEMBER] = {
 		.sync_syncfiletag = multixactmemberssyncfiletag
+	},
+	/* pg_multixact/members */
+	[SYNC_HANDLER_CSN] = {
+		.sync_syncfiletag = csnsyncfiletag
 	}
 };
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index e91d5a3cfd..4d9833fb5f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -34,6 +34,7 @@
 
 #include "access/commit_ts.h"
 #include "access/gin.h"
+#include "access/csn_snapshot.h"
 #include "access/rmgr.h"
 #include "access/tableam.h"
 #include "access/toast_compression.h"
@@ -1212,6 +1213,24 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_csn_snapshot", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable csn-base snapshot."),
+			gettext_noop("Used to achieve REPEATABLE READ isolation level for postgres_fdw transactions.")
+		},
+		&enable_csn_snapshot,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"enable_csn_wal", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable csn-wal record."),
+			gettext_noop("Used to enable csn-wal record")
+		},
+		&enable_csn_wal,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"ssl", PGC_SIGHUP, CONN_AUTH_SSL,
 			gettext_noop("Enables SSL connections."),
@@ -3195,6 +3214,24 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"csn_snapshot_defer_time", PGC_POSTMASTER, REPLICATION_PRIMARY,
+			gettext_noop("Minimal age of records which allowed to be vacuumed, in seconds."),
+			NULL
+		},
+		&csn_snapshot_defer_time,
+		0, 0, INT_MAX,
+		NULL, NULL, NULL
+	},
+	{
+		{"csn_time_shift", PGC_USERSET, RESOURCES_MEM,
+			gettext_noop("Do the time shift in the CSN generator."),
+			gettext_noop("Used for debug purposes.")
+		},
+		&csn_time_shift,
+		0, INT_MIN, INT_MAX,
+		NULL, NULL, NULL
+	},
 	{
 		{"block_size", PGC_INTERNAL, PRESET_OPTIONS,
 			gettext_noop("Shows the size of a disk block."),
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d
index b0c50a3c7f..3fcd0f4ccf 100644
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -77,6 +77,8 @@ provider postgresql {
 	probe clog__checkpoint__done(bool);
 	probe subtrans__checkpoint__start(bool);
 	probe subtrans__checkpoint__done(bool);
+	probe csnlog__checkpoint__start(bool);
+	probe csnlog__checkpoint__done(bool);
 	probe multixact__checkpoint__start(bool);
 	probe multixact__checkpoint__done(bool);
 	probe twophase__checkpoint__start();
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 5001efdf7a..eaf2082e41 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -48,6 +48,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -77,6 +78,8 @@
  */
 int			old_snapshot_threshold; /* number of minutes, -1 disables */
 
+bool		enable_csn_snapshot;
+
 volatile OldSnapshotControlData *oldSnapshotControl;
 
 
@@ -173,6 +176,7 @@ static TimestampTz AlignTimestampToMinuteBoundary(TimestampTz ts);
 static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
 static void SnapshotResetXmin(void);
+static bool XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot);
 
 /*
  * Snapshot fields to be serialized.
@@ -191,6 +195,8 @@ typedef struct SerializedSnapshotData
 	CommandId	curcid;
 	TimestampTz whenTaken;
 	XLogRecPtr	lsn;
+	CSN			csn;
+	bool		imported_csn;
 } SerializedSnapshotData;
 
 Size
@@ -2130,6 +2136,8 @@ SerializeSnapshot(Snapshot snapshot, char *start_address)
 	serialized_snapshot.curcid = snapshot->curcid;
 	serialized_snapshot.whenTaken = snapshot->whenTaken;
 	serialized_snapshot.lsn = snapshot->lsn;
+	serialized_snapshot.csn = snapshot->snapshot_csn;
+	serialized_snapshot.imported_csn = snapshot->imported_csn;
 
 	/*
 	 * Ignore the SubXID array if it has overflowed, unless the snapshot was
@@ -2204,6 +2212,8 @@ RestoreSnapshot(char *start_address)
 	snapshot->curcid = serialized_snapshot.curcid;
 	snapshot->whenTaken = serialized_snapshot.whenTaken;
 	snapshot->lsn = serialized_snapshot.lsn;
+	snapshot->snapshot_csn = serialized_snapshot.csn;
+	snapshot->imported_csn = serialized_snapshot.imported_csn;
 	snapshot->snapXactCompletionCount = 0;
 
 	/* Copy XIDs, if present. */
@@ -2245,6 +2255,44 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
 
 /*
  * XidInMVCCSnapshot
+ *
+ * Check whether this xid is in snapshot. When enable_csn_snapshot is
+ * switched off just call XidInLocalMVCCSnapshot().
+ */
+bool
+XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	bool in_snapshot;
+
+	if (snapshot->imported_csn)
+	{
+		Assert(enable_csn_snapshot);
+		/* No point to using snapshot info except CSN */
+		return XidInCSNSnapshot(xid, snapshot);
+	}
+
+	in_snapshot = XidInLocalMVCCSnapshot(xid, snapshot);
+
+	if (!get_csnlog_status())
+	{
+		Assert(CSNIsFrozen(snapshot->snapshot_csn));
+		return in_snapshot;
+	}
+
+	if (in_snapshot)
+	{
+		/*
+		 * This xid may be already in unknown state and in that case
+		 * we must wait and recheck.
+		 */
+		return XidInCSNSnapshot(xid, snapshot);
+	}
+	else
+		return false;
+}
+
+/*
+ * XidInLocalMVCCSnapshot
  *		Is the given XID still-in-progress according to the snapshot?
  *
  * Note: GetSnapshotData never stores either top xid or subxids of our own
@@ -2253,8 +2301,8 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
  * TransactionIdIsCurrentTransactionId first, except when it's known the
  * XID could not be ours anyway.
  */
-bool
-XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+static bool
+XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
 	uint32		i;
 
@@ -2364,3 +2412,100 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 
 	return false;
 }
+
+
+/*
+ * ExportCSNSnapshot
+ *
+ * Export snapshot_csn so that caller can expand this transaction to other
+ * nodes.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+SnapshotCSN
+ExportCSNSnapshot()
+{
+	if (!get_csnlog_status())
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not export csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "enable_csn_snapshot")));
+
+	elog(DEBUG5, "Export CSN Snapshot: csn = %lu",
+		 CurrentSnapshot->snapshot_csn);
+	return CurrentSnapshot->snapshot_csn;
+}
+
+/* SQL accessor to ExportCSNSnapshot() */
+Datum
+pg_csn_snapshot_export(PG_FUNCTION_ARGS)
+{
+	SnapshotCSN csn = ExportCSNSnapshot();
+
+	PG_RETURN_UINT64(csn);
+}
+
+/*
+ * ImportCSNSnapshot
+ *
+ * Import csn and retract this backends xmin to the value that was
+ * actual when we had such csn.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+void
+ImportCSNSnapshot(SnapshotCSN snapshot_csn)
+{
+	volatile TransactionId xmin;
+
+	if (!get_csnlog_status())
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "enable_csn_snapshot")));
+
+	if (csn_snapshot_defer_time <= 0)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is positive.",
+					 "csn_snapshot_defer_time")));
+
+	/*
+	 * Call CSNSnapshotToXmin under ProcArrayLock to avoid situation that
+	 * resulting xmin will be evicted from map before we will set it into our
+	 * backend's xmin.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	xmin = CSNSnapshotToXmin(snapshot_csn);
+	if (!TransactionIdIsValid(xmin))
+	{
+		LWLockRelease(ProcArrayLock);
+		elog(ERROR, "CSNSnapshotToXmin: csn snapshot too old");
+	}
+
+	MyProc->originalXmin = MyProc->xmin;
+	MyProc->xmin = TransactionXmin = xmin;
+	LWLockRelease(ProcArrayLock);
+
+	CurrentSnapshot->xmin = xmin; /* defuse SnapshotResetXmin() */
+	CurrentSnapshot->snapshot_csn = snapshot_csn;
+	CurrentSnapshot->imported_csn = true;
+	CSNSnapshotSync(snapshot_csn);
+}
+
+/* SQL accessor to ImportCSNSnapshot() */
+Datum
+pg_csn_snapshot_import(PG_FUNCTION_ARGS)
+{
+	SnapshotCSN csn = PG_GETARG_UINT64(0);
+
+	ImportCSNSnapshot(csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 31839c1a19..1864952bd2 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -227,7 +227,8 @@ static const char *const subdirs[] = {
 	"pg_xact",
 	"pg_logical",
 	"pg_logical/snapshots",
-	"pg_logical/mappings"
+	"pg_logical/mappings",
+	"pg_csn"
 };
 
 
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index f911f98d94..325e6a0e2b 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -300,6 +300,8 @@ main(int argc, char *argv[])
 		   ControlFile->max_locks_per_xact);
 	printf(_("track_commit_timestamp setting:       %s\n"),
 		   ControlFile->track_commit_timestamp ? _("on") : _("off"));
+	printf(_("enable_csn_snapshot setting:    	    %s\n"),
+		   ControlFile->enable_csn_snapshot ? 	 _("on") : _("off"));
 	printf(_("Maximum data alignment:               %u\n"),
 		   ControlFile->maxAlign);
 	/* we don't print floatFormat since can't say much useful about it */
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 3628bd74a7..18cf9197cc 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -548,6 +548,11 @@ copy_xact_xlog_xid(void)
 		check_ok();
 	}
 
+	if(old_cluster.controldata.cat_ver > CSN_BASE_SNAPSHOT_ADD_VER)
+	{
+		copy_subdir_files("pg_csn", "pg_csn");
+	}
+
 	/* now reset the wal archives in the new cluster */
 	prep_status("Resetting WAL archives");
 	exec_prog(UTILITY_LOG_FILE, NULL, true, true,
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index ca0795f68f..54f2984387 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -124,6 +124,8 @@ extern char *output_files[];
  */
 #define JSONB_FORMAT_CHANGE_CAT_VER 201409291
 
+#define	CSN_BASE_SNAPSHOT_ADD_VER	202002010
+
 
 /*
  * Each relation is represented by a relinfo structure.
diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c
index 852d8ca4b1..2d280ce940 100644
--- a/src/bin/pg_waldump/rmgrdesc.c
+++ b/src/bin/pg_waldump/rmgrdesc.c
@@ -11,6 +11,7 @@
 #include "access/brin_xlog.h"
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/generic_xlog.h"
 #include "access/ginxlog.h"
 #include "access/gistxlog.h"
diff --git a/src/include/access/csn_log.h b/src/include/access/csn_log.h
new file mode 100644
index 0000000000..12df028bf4
--- /dev/null
+++ b/src/include/access/csn_log.h
@@ -0,0 +1,98 @@
+/*
+ * csn_log.h
+ *
+ * Commit-Sequence-Number log.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csn_log.h
+ */
+#ifndef CSNLOG_H
+#define CSNLOG_H
+
+#include "access/xlog.h"
+#include "utils/snapshot.h"
+#include "storage/sync.h"
+
+
+#define InProgressCSN	 	UINT64CONST(0x0)
+#define AbortedCSN	 		UINT64CONST(0x1)
+#define FrozenCSN		 	UINT64CONST(0x2)
+#define InDoubtCSN	 		UINT64CONST(0x3)
+#define UnclearCSN	 		UINT64CONST(0x4)
+#define FirstNormalCSN 		UINT64CONST(0x5)
+
+#define CSNIsInProgress(csn)	((csn) == InProgressCSN)
+#define CSNIsAborted(csn)		((csn) == AbortedCSN)
+#define CSNIsFrozen(csn)		((csn) == FrozenCSN)
+#define CSNIsInDoubt(csn)		((csn) == InDoubtCSN)
+#define CSNIsUnclear(csn)		((csn) == UnclearCSN)
+#define CSNIsNormal(csn)		((csn) >= FirstNormalCSN)
+
+/* XLOG stuff */
+#define XLOG_CSN_ASSIGNMENT			0x00
+#define XLOG_CSN_SETCSN				0x10
+#define XLOG_CSN_ZEROPAGE			0x20
+#define XLOG_CSN_TRUNCATE			0x30
+
+/*
+ * We should log MAX generated CSN to wal, so that database will not generate
+ * a historical CSN after database restart. This may appear when system time
+ * turned back.
+ *
+ * However we can not log the MAX CSN every time it generated, if so it will
+ * cause too many wal expend, so we log it 5s more in the future.
+ *
+ * As a trade off, when this database restart, there will be 5s bad performance
+ * for time synchronization among sharding nodes.
+ *
+ * It looks like we can redefine this as a configure parameter, and the user
+ * can decide which way they prefer.
+ *
+ */
+#define	CSN_ASSIGN_TIME_INTERVAL	5
+
+typedef struct xl_csn_set
+{
+	CSN				csn;
+	TransactionId	xtop;			/* XID's top-level XID */
+	int				nsubxacts;		/* number of subtransaction XIDs */
+	TransactionId	xsub[FLEXIBLE_ARRAY_MEMBER];	/* assigned subxids */
+} xl_csn_set;
+
+#define MinSizeOfCSNSet offsetof(xl_csn_set, xsub)
+#define	CSNAddByNanosec(csn,second) (csn + second * 1000000000L)
+
+/* Main functions */
+extern void CSNLogSetCSN(TransactionId xid, int nsubxids,
+							   TransactionId *subxids, CSN csn, bool write_xlog);
+extern CSN CSNLogGetCSNByXid(TransactionId xid);
+
+/* Infrastructure functions */
+extern Size CSNLogShmemSize(void);
+extern void CSNLogShmemInit(void);
+extern void ActivateCSNlog(void);
+extern void ExtendCSNLog(TransactionId newestXact);
+extern void DeactivateCSNlog(void);
+
+extern void CheckPointCSNLog(void);
+extern void TruncateCSNLog(TransactionId oldestXact);
+
+extern void csnlog_redo(XLogReaderState *record);
+extern void csnlog_desc(StringInfo buf, XLogReaderState *record);
+extern const char *csnlog_identify(uint8 info);
+extern void WriteAssignCSNXlogRec(CSN csn);
+extern void CatchCSNLog(void);
+extern void StartupCSN(void);
+extern void CompleteCSNInitialization(void);
+extern void CSNlogParameterChange(bool newvalue, bool oldvalue);
+extern bool get_csnlog_status(void);
+extern int csnsyncfiletag(const FileTag *ftag, char *path);
+
+extern CSN GenerateCSN(bool locked, CSN assign);
+extern CSN GetLastGeneratedCSN(void);
+
+extern TransactionId GetOldestXmin(void);
+
+#endif   /* CSNLOG_H */
\ No newline at end of file
diff --git a/src/include/access/csn_snapshot.h b/src/include/access/csn_snapshot.h
new file mode 100644
index 0000000000..916603af0c
--- /dev/null
+++ b/src/include/access/csn_snapshot.h
@@ -0,0 +1,54 @@
+/*-------------------------------------------------------------------------
+ *
+ * csn_snapshot.h
+ *	  Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csn_snapshot.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CSN_SNAPSHOT_H
+#define CSN_SNAPSHOT_H
+
+#include "access/csn_log.h"
+#include "port/atomics.h"
+#include "storage/lock.h"
+#include "utils/snapshot.h"
+#include "utils/guc.h"
+
+/*
+ * snapshot.h is used in frontend code so atomic variant of SnapshotCSN type
+ * is defined here.
+ */
+typedef pg_atomic_uint64 CSN_atomic;
+
+
+extern int csn_snapshot_defer_time;
+extern int csn_time_shift;
+
+
+extern Size CSNSnapshotShmemSize(void);
+extern void CSNSnapshotShmemInit(void);
+extern void CSNSnapshotStartup(TransactionId oldestActiveXID);
+
+extern void CSNSnapshotMapXmin(SnapshotCSN snapshot_csn);
+extern TransactionId CSNSnapshotToXmin(SnapshotCSN snapshot_csn);
+
+extern bool XidInCSNSnapshot(TransactionId xid, Snapshot snapshot);
+
+extern CSN TransactionIdGetCSN(TransactionId xid);
+
+extern void CSNSnapshotAbort(PGPROC *proc, TransactionId xid, int nsubxids,
+								TransactionId *subxids);
+extern void CSNSnapshotPrecommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void CSNSnapshotCommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void CSNSnapshotAssignCurrent(SnapshotCSN snapshot_csn);
+extern SnapshotCSN CSNSnapshotPrepareCurrent(void);
+extern void CSNSnapshotSync(SnapshotCSN remote_csn);
+
+#endif							/* CSN_SNAPSHOT_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index f582cf535f..3cf0775176 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i
 PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
 PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
 PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
+PG_RMGR(RM_CSNLOG_ID, "CSN", csnlog_redo, csnlog_desc, csnlog_identify, NULL, NULL, NULL)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index c0da76cab4..2ee489dcad 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -242,6 +242,7 @@ typedef struct xl_parameter_change
 	int			wal_level;
 	bool		wal_log_hints;
 	bool		track_commit_timestamp;
+	bool		enable_csn_snapshot;
 } xl_parameter_change;
 
 /* logs restore point */
@@ -332,5 +333,6 @@ extern bool ArchiveRecoveryRequested;
 extern bool InArchiveRecovery;
 extern bool StandbyMode;
 extern char *recoveryRestoreCommand;
+extern bool enable_csn_wal;
 
 #endif							/* XLOG_INTERNAL_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 749bce0cc6..a7da532f3a 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -183,6 +183,7 @@ typedef struct ControlFileData
 	int			max_prepared_xacts;
 	int			max_locks_per_xact;
 	bool		track_commit_timestamp;
+	bool		enable_csn_snapshot;
 
 	/*
 	 * This data is used to check for hardware-architecture compatibility of
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d068d6532e..d578aceb40 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11689,4 +11689,21 @@
   prorettype => 'bytea', proargtypes => 'pg_brin_minmax_multi_summary',
   prosrc => 'brin_minmax_multi_summary_send' },
 
+# csn shnapshot handling
+{ oid => '10001', descr => 'export csn snapshot',
+  proname => 'pg_csn_snapshot_export', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_csn_snapshot_export' },
+{ oid => '10002', descr => 'import csn snapshot',
+  proname => 'pg_csn_snapshot_import', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'int8', prosrc => 'pg_csn_snapshot_import' },
+{ oid => '10003', descr => 'prepare distributed transaction for commit, get csn',
+  proname => 'pg_csn_snapshot_prepare', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => 'text', prosrc => 'pg_csn_snapshot_prepare' },
+{ oid => '10004', descr => 'assign csn to distributed transaction',
+  proname => 'pg_csn_snapshot_assign', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'text int8', prosrc => 'pg_csn_snapshot_assign' },
+{ oid => '10005', descr => 'get current CSN',
+  proname => 'pg_current_csn', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_current_csn' },
+
 ]
diff --git a/src/include/datatype/timestamp.h b/src/include/datatype/timestamp.h
index 99873497a6..8d1ced7430 100644
--- a/src/include/datatype/timestamp.h
+++ b/src/include/datatype/timestamp.h
@@ -93,6 +93,9 @@ typedef struct
 #define USECS_PER_MINUTE INT64CONST(60000000)
 #define USECS_PER_SEC	INT64CONST(1000000)
 
+#define NSECS_PER_SEC	INT64CONST(1000000000)
+#define NSECS_PER_USEC	INT64CONST(1000)
+
 /*
  * We allow numeric timezone offsets up to 15:59:59 either way from Greenwich.
  * Currently, the record holders for wackiest offsets in actual use are zones
diff --git a/src/include/fmgr.h b/src/include/fmgr.h
index ab7b85c86e..f08999740b 100644
--- a/src/include/fmgr.h
+++ b/src/include/fmgr.h
@@ -281,6 +281,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum);
 #define PG_GETARG_FLOAT4(n)  DatumGetFloat4(PG_GETARG_DATUM(n))
 #define PG_GETARG_FLOAT8(n)  DatumGetFloat8(PG_GETARG_DATUM(n))
 #define PG_GETARG_INT64(n)	 DatumGetInt64(PG_GETARG_DATUM(n))
+#define PG_GETARG_UINT64(n)	 DatumGetUInt64(PG_GETARG_DATUM(n))
 /* use this if you want the raw, possibly-toasted input datum: */
 #define PG_GETARG_RAW_VARLENA_P(n)	((struct varlena *) PG_GETARG_POINTER(n))
 /* use this if you want the input datum de-toasted: */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 39a4f0600e..a78f0d284b 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -141,6 +141,9 @@ typedef struct timespec instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec))
+
 #else							/* !HAVE_CLOCK_GETTIME */
 
 /* Use gettimeofday() */
@@ -205,6 +208,10 @@ typedef struct timeval instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) (t).tv_usec)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + \
+		(uint64) (t).tv_usec * (uint64) 1000)
+
 #endif							/* HAVE_CLOCK_GETTIME */
 
 #else							/* WIN32 */
@@ -237,6 +244,9 @@ typedef LARGE_INTEGER instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	((uint64) (((double) (t).QuadPart * 1000000.0) / GetTimerFrequency()))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	((uint64) (((double) (t).QuadPart * 1000000000.0) / GetTimerFrequency()))
+
 static inline double
 GetTimerFrequency(void)
 {
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index a8f052e484..65d1e49fb2 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -168,6 +168,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS,
 	LWTRANCHE_COMMITTS_BUFFER,
 	LWTRANCHE_SUBTRANS_BUFFER,
+	LWTRANCHE_CSN_LOG_BUFFERS,
 	LWTRANCHE_MULTIXACTOFFSET_BUFFER,
 	LWTRANCHE_MULTIXACTMEMBER_BUFFER,
 	LWTRANCHE_NOTIFY_BUFFER,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index be67d8a861..ade5d8e169 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -15,12 +15,14 @@
 #define _PROC_H_
 
 #include "access/clog.h"
+#include "access/csn_snapshot.h"
 #include "access/xlogdefs.h"
 #include "lib/ilist.h"
 #include "storage/latch.h"
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "utils/snapshot.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
@@ -251,6 +253,18 @@ struct PGPROC
 	PGPROC	   *lockGroupLeader;	/* lock group leader, if I'm a member */
 	dlist_head	lockGroupMembers;	/* list of members, if I'm a leader */
 	dlist_node	lockGroupLink;	/* my member link, if I'm a member */
+
+	/*
+	 * assignedCSN holds CSN for this transaction.  It is generated
+	 * under a ProcArray lock and later is written to a CSNLog.  This
+	 * variable defined as atomic only for case of group commit, in all other
+	 * scenarios only backend responsible for this proc entry is working with
+	 * this variable.
+	 */
+	CSN_atomic assignedCSN;
+
+	/* Original xmin of this backend before csn snapshot was imported */
+	TransactionId originalXmin;
 };
 
 /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index b01fa52139..ba580435f9 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -20,6 +20,10 @@
 #include "utils/snapshot.h"
 
 
+#define		PROCARRAY_NON_IMPORTED_XMIN		0x80	/* use originalXmin instead
+													 * of xmin to properly
+													 * maintain csnXidMap */
+
 extern Size ProcArrayShmemSize(void);
 extern void CreateSharedProcArray(void);
 extern void ProcArrayAdd(PGPROC *proc);
@@ -94,4 +98,7 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
 extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 											TransactionId *catalog_xmin);
 
+extern void ProcArraySetCSNSnapshotXmin(TransactionId xmin);
+
+extern TransactionId ProcArrayGetCSNSnapshotXmin(void);
 #endif							/* PROCARRAY_H */
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index 6fd50cfa7b..eb1d52673a 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -39,6 +39,7 @@ typedef enum SyncRequestHandler
 	SYNC_HANDLER_COMMIT_TS,
 	SYNC_HANDLER_MULTIXACT_OFFSET,
 	SYNC_HANDLER_MULTIXACT_MEMBER,
+	SYNC_HANDLER_CSN,
 	SYNC_HANDLER_NONE
 } SyncRequestHandler;
 
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index c6a176cc95..122eea20ba 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -41,10 +41,11 @@
   && !RelationIsAccessibleInLogicalDecoding(rel) \
 )
 
-#define EarlyPruningEnabled(rel) (old_snapshot_threshold >= 0 && RelationAllowsEarlyPruning(rel))
+#define EarlyPruningEnabled(rel) (old_snapshot_threshold >= 0 && !enable_csn_snapshot && RelationAllowsEarlyPruning(rel))
 
 /* GUC variables */
 extern PGDLLIMPORT int old_snapshot_threshold;
+extern PGDLLIMPORT bool enable_csn_snapshot;
 
 
 extern Size SnapMgrShmemSize(void);
@@ -100,7 +101,7 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData;
 static inline bool
 OldSnapshotThresholdActive(void)
 {
-	return old_snapshot_threshold >= 0;
+	return (old_snapshot_threshold >= 0) && (!enable_csn_snapshot);
 }
 
 extern Snapshot GetTransactionSnapshot(void);
@@ -130,6 +131,8 @@ extern void AtSubCommit_Snapshot(int level);
 extern void AtSubAbort_Snapshot(int level);
 extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin);
 
+extern SnapshotCSN ExportCSNSnapshot(void);
+extern void ImportCSNSnapshot(SnapshotCSN snapshot_csn);
 extern void ImportSnapshot(const char *idstr);
 extern bool XactHasExportedSnapshots(void);
 extern void DeleteAllExportedSnapshotFiles(void);
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 6b60755c53..3580a94c43 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -121,6 +121,9 @@ typedef enum SnapshotType
 typedef struct SnapshotData *Snapshot;
 
 #define InvalidSnapshot		((Snapshot) NULL)
+#define InvalidCSN			((CSN) 0)
+typedef uint64 CSN;
+typedef uint64 SnapshotCSN;
 
 /*
  * Struct representing all kind of possible snapshots.
@@ -214,6 +217,14 @@ typedef struct SnapshotData
 	 * transactions completed since the last GetSnapshotData().
 	 */
 	uint64		snapXactCompletionCount;
+
+	/*
+	 * SnapshotCSN for snapshot isolation support.
+	 * Will be used only if enable_csn_snapshot is enabled.
+	 */
+	SnapshotCSN	snapshot_csn;
+	/* Did we have our own snapshot_csn or imported one from different node */
+	bool		imported_csn;
 } SnapshotData;
 
 #endif							/* SNAPSHOT_H */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index dffc79b2d9..16bb65e7e1 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
 SUBDIRS = \
 		  brin \
 		  commit_ts \
+		  csnsnapshot \
 		  delay_execution \
 		  dummy_index_am \
 		  dummy_seclabel \
diff --git a/src/test/modules/csnsnapshot/Makefile b/src/test/modules/csnsnapshot/Makefile
new file mode 100644
index 0000000000..fa4245752b
--- /dev/null
+++ b/src/test/modules/csnsnapshot/Makefile
@@ -0,0 +1,25 @@
+# src/test/modules/csnsnapshot/Makefile
+
+REGRESS_OPTS = --temp-config=$(top_srcdir)/src/test/modules/csnsnapshot/csn_snapshot.conf
+NO_INSTALLCHECK = 1
+
+TAP_TESTS = 1
+EXTRA_INSTALL=contrib/postgres_fdw
+
+# Don't support full consistency of distributed commit in READ COMMITTED
+# transactions.
+#PROVE_TESTS =	t/001_base.pl \
+#				t/002_standby.pl \
+#				t/003_time_skew.pl
+PROVE_TESTS =					t/005_basic_visibility.pl
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/csnsnapshot
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/csnsnapshot/csn_snapshot.conf b/src/test/modules/csnsnapshot/csn_snapshot.conf
new file mode 100644
index 0000000000..e9d3c35756
--- /dev/null
+++ b/src/test/modules/csnsnapshot/csn_snapshot.conf
@@ -0,0 +1 @@
+track_commit_timestamp = on
diff --git a/src/test/modules/csnsnapshot/expected/csnsnapshot.out b/src/test/modules/csnsnapshot/expected/csnsnapshot.out
new file mode 100644
index 0000000000..ac28e417b6
--- /dev/null
+++ b/src/test/modules/csnsnapshot/expected/csnsnapshot.out
@@ -0,0 +1 @@
+create table t1(i int, j int, k varchar);
diff --git a/src/test/modules/csnsnapshot/t/001_base.pl b/src/test/modules/csnsnapshot/t/001_base.pl
new file mode 100644
index 0000000000..3b5a09000b
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/001_base.pl
@@ -0,0 +1,103 @@
+# Single-node test: value can be set, and is still present after recovery
+
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 5;
+use PostgresNode;
+
+my $node = PostgresNode->new('csntest');
+$node->init;
+$node->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 10
+					});
+$node->start;
+
+my $test_1 = 1;
+
+# Create a table
+$node->safe_psql('postgres', 'create table t1(i int, j int)');
+
+# insert test record
+$node->safe_psql('postgres', 'insert into t1 values(1,1)');
+# export csn snapshot
+my $test_snapshot = $node->safe_psql('postgres', 'select pg_csn_snapshot_export()');
+# insert test record
+$node->safe_psql('postgres', 'insert into t1 values(2,1)');
+
+my $count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '2', 'Get right number in normal query');
+my $count2 = $node->safe_psql('postgres', "
+			begin transaction isolation level repeatable read;
+			select pg_csn_snapshot_import($test_snapshot);
+			select count(*) from t1;
+			commit;"
+			);
+
+is($count2, '
+1', 'Get right number in csn import query');
+
+#prepare transaction test
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(3,1);
+						insert into t1 values(3,2);
+						prepare	transaction 'pt3';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(4,1);
+						insert into t1 values(4,2);
+						prepare	transaction 'pt4';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(5,1);
+						insert into t1 values(5,2);
+						prepare	transaction 'pt5';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(6,1);
+						insert into t1 values(6,2);
+						prepare	transaction 'pt6';
+						");
+$node->safe_psql('postgres', "commit prepared 'pt4';");
+
+# restart with enable_csn_snapshot off
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = off");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(7,1);
+						insert into t1 values(7,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt3';");
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '8', 'Get right number in normal query');
+
+
+# restart with enable_csn_snapshot on
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = on");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(8,1);
+						insert into t1 values(8,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt5';");
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '12', 'Get right number in normal query');
+
+# restart with enable_csn_snapshot off
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = on");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(9,1);
+						insert into t1 values(9,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt6';");
+
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '16', 'Get right number in normal query');
diff --git a/src/test/modules/csnsnapshot/t/002_standby.pl b/src/test/modules/csnsnapshot/t/002_standby.pl
new file mode 100644
index 0000000000..2b09712141
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/002_standby.pl
@@ -0,0 +1,66 @@
+# Test simple scenario involving a standby
+
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 6;
+use PostgresNode;
+
+my $bkplabel = 'backup';
+my $master   = PostgresNode->new('master');
+$master->init(allows_streaming => 1);
+
+$master->append_conf(
+	'postgresql.conf', qq{
+	enable_csn_snapshot = on
+	max_wal_senders = 5
+	});
+$master->start;
+$master->backup($bkplabel);
+
+my $standby = PostgresNode->new('standby');
+$standby->init_from_backup($master, $bkplabel, has_streaming => 1);
+$standby->start;
+
+$master->safe_psql('postgres', "create table t1(i int, j int)");
+
+my $guc_on_master = $master->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_master, 'on', "GUC on master");
+
+my $guc_on_standby = $standby->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_standby, 'on', "GUC on standby");
+
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = off');
+$master->restart;
+
+$guc_on_master = $master->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_master, 'off', "GUC off master");
+
+$guc_on_standby = $standby->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_standby, 'on', "GUC on standby");
+
+# We consume a large number of transaction,for skip page
+for my $i (1 .. 4096) #4096
+{
+	$master->safe_psql('postgres', "insert into t1 values(1,$i)");
+}
+$master->safe_psql('postgres', "select pg_sleep(2)");
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = on');
+$master->restart;
+
+my $count_standby = $standby->safe_psql('postgres', 'select count(*) from t1');
+is($count_standby, '4096', "Ok for siwtch xid-base > csn-base"); #4096
+
+# We consume a large number of transaction,for skip page
+for my $i (1 .. 4096) #4096
+{
+	$master->safe_psql('postgres', "insert into t1 values(1,$i)");
+}
+$master->safe_psql('postgres', "select pg_sleep(2)");
+
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = off');
+$master->restart;
+
+$count_standby = $standby->safe_psql('postgres', 'select count(*) from t1');
+is($count_standby, '8192', "Ok for switch csn-base > xid-base"); #8192
\ No newline at end of file
diff --git a/src/test/modules/csnsnapshot/t/003_time_skew.pl b/src/test/modules/csnsnapshot/t/003_time_skew.pl
new file mode 100644
index 0000000000..f2496ea883
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/003_time_skew.pl
@@ -0,0 +1,214 @@
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 13;
+use PostgresNode;
+
+my $node1 = PostgresNode->new('csn1');
+$node1->init;
+$node1->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 10
+					csn_time_shift = 0
+					shared_preload_libraries = 'postgres_fdw'
+					postgres_fdw.use_csn_snapshots = true
+					});
+$node1->start;
+my $node2 = PostgresNode->new('csn2');
+$node2->init;
+$node2->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 10
+					csn_time_shift = 0
+					shared_preload_libraries = 'postgres_fdw'
+					postgres_fdw.use_csn_snapshots = true
+					});
+$node2->start;
+
+$node1->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node2->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+	CREATE TABLE summary(value int, ntrans int);
+	INSERT INTO summary (value, ntrans) VALUES (0, 0);
+");
+$node2->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node1->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+	CREATE FOREIGN TABLE summary(value int, ntrans int) SERVER remote;
+");
+
+$node1->safe_psql('postgres', "
+	CREATE TABLE t (id int, payload int) PARTITION BY HASH(id);
+	CREATE TABLE t_1 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 0);
+	CREATE FOREIGN TABLE t_2 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 1) SERVER remote;
+");
+$node2->safe_psql('postgres', "
+	CREATE TABLE t (id serial, payload int) PARTITION BY HASH(id);
+	CREATE FOREIGN TABLE t_1 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 0) SERVER remote;
+	CREATE TABLE t_2 PARTITION OF t FOR VALUES WITH (modulus 2, remainder 1);
+");
+
+$node1->safe_psql('postgres', "INSERT INTO t(id, payload) (SELECT gs.*, 1 FROM generate_series(1,100) AS gs)");
+$node2->safe_psql('postgres', "INSERT INTO t(id, payload) (SELECT gs.*, 2 FROM generate_series(101,200) AS gs)");
+my $count1 = $node1->safe_psql('postgres', "SELECT SUM(payload) FROM t");
+my $count2 = $node2->safe_psql('postgres', "SELECT SUM(payload) FROM t");
+is( (($count1 == 300) and ($count1 == $count2)), 1, 'Correct insert');
+
+# ##############################################################################
+#
+# Basic test. Check REPEATABLE READ anomaly.
+# ntrans is needed to control that some transactions were committed.
+#
+# ##############################################################################
+
+my $q1 = File::Temp->new();
+append_to_file($q1, q{
+	START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+	UPDATE summary SET value = value + (SELECT SUM(payload) FROM t);
+	UPDATE summary SET value = value - (SELECT SUM(payload) FROM t);
+	UPDATE summary SET ntrans = ntrans + 1;
+	COMMIT;
+});
+my $q2 = File::Temp->new();
+append_to_file($q2, q{
+	BEGIN;
+	\set pl random(-100, 100)
+	\set id random(1, 200)
+	UPDATE t SET payload = :pl WHERE id = :id;
+	COMMIT;
+});
+
+my $seconds = 5;
+my $pgb_handle1;
+my $pgb_handle2;
+
+$pgb_handle1 = $node1->pgbench_async(-n, -c => 5, -T => $seconds, -f => $q1, 'postgres' );
+$pgb_handle2 = $node2->pgbench_async(-n, -c => 1, -T => $seconds, -f => $q2, 'postgres' );
+$node1->pgbench_await($pgb_handle1);
+$node2->pgbench_await($pgb_handle2);
+
+$count1 = $node1->safe_psql('postgres', "SELECT SUM(value) FROM summary");
+$count2 = $node2->safe_psql('postgres', "SELECT SUM(value) FROM summary");
+my $ntrans = $node2->safe_psql('postgres', "SELECT SUM(ntrans) FROM summary");
+note("$count1, $count2, $ntrans");
+is( ( ($ntrans > 0) and ($count1 == 0) and ($count1 == $count2)), 1, 'Correct update');
+
+# ##############################################################################
+#
+# Test on 'snapshot too old'
+#
+# ##############################################################################
+$node1->safe_psql('postgres', "UPDATE summary SET ntrans = 0;");
+$node2->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = -100");
+$node2->restart();
+
+# READ COMMITTED transactions ignores the time skew.
+$node2->psql('postgres', "UPDATE summary SET ntrans = 1");
+$ntrans = $node1->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("$ntrans");
+is( $ntrans, 1, 'Read committed behavior if snapshot turn sour');
+
+# But REPEATABLE READ transactions isn't
+$node1->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = +100");
+$node1->restart();
+my $err = '';
+$node2->psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = 2; COMMIT;", stderr => \$err);
+$ntrans = $node1->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("$ntrans");
+is( (($ntrans == 1) and (index($err, 'csn snapshot too old') != -1)), 1, 'Read committed can\'t update if snapshot turn sour');
+
+# ##############################################################################
+#
+# Test on issue #1:
+# 'xact confirmed as committed, so any following xact must see its effects'.
+#
+# ##############################################################################
+$node1->safe_psql('postgres', "delete from t");
+$node1->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 5");
+$node2->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 0");
+$node1->restart();
+$node2->restart();
+
+my $st_sec; my $end_sec;
+my $time_diff;
+
+$node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; INSERT INTO t VALUES(1,1), (3,1); COMMIT;");
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT count(*) FROM t; COMMIT;");
+is( $ntrans, 2, 'Slow node can see mix node data change');
+$ntrans = $node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT count(*) FROM t; COMMIT;");
+is( $ntrans, 2, 'Fast node can see mix node data change');
+
+$node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; INSERT INTO t VALUES(1,1); COMMIT;");
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT count(*) FROM t; COMMIT;");
+is( $ntrans, 3, 'CURRENTLY FAILED:Data change to fast node on slow node, and slow node can see data change');
+
+# READ COMMITED mode ignores the time skew.
+$node1->safe_psql('postgres', "UPDATE summary SET ntrans = 1");
+$ntrans = $node2->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("ntrans: $ntrans\n");
+is( $ntrans, 1, 'See committed values in the READ COMMITTED mode');
+
+# Access from the future
+$node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = ntrans + 1; COMMIT;");
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT ntrans FROM summary; COMMIT;");
+note("ntrans: $ntrans\n");
+is( $ntrans, 1, 'Do not see values, committed in the future at the REPEATABLE READ mode');
+
+# But...
+$node1->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 0");
+$node2->safe_psql('postgres', "ALTER SYSTEM SET csn_time_shift = 5");
+$node1->restart();
+$node2->restart();
+
+# Check READ COMMITED mode
+$node2->safe_psql('postgres', "UPDATE summary SET ntrans = 2");
+$ntrans = $node1->safe_psql('postgres', "SELECT ntrans FROM summary");
+note("ntrans: $ntrans\n");
+is( $ntrans, 2, 'See committed values in the READ COMMITTED mode, step 2');
+
+# Node from the future will wait for a time before UPDATE table.
+($st_sec) = localtime();
+$node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = 3; COMMIT;");
+($end_sec) = localtime(); $time_diff = $end_sec - $st_sec;
+$ntrans = $node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT ntrans FROM summary; COMMIT;");
+note("ntrans: $ntrans, Test time: $time_diff seconds");
+is( ($ntrans == 3), 1, 'The test execution time correlates with the time offset.');
+
+# Node from the future will wait for a time before SELECT from a table.
+$node1->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; UPDATE summary SET ntrans = 4; COMMIT;");
+($st_sec) = localtime();
+$ntrans = $node2->safe_psql('postgres', "START TRANSACTION ISOLATION LEVEL REPEATABLE READ; SELECT ntrans FROM summary; COMMIT;");
+($end_sec) = localtime(); $time_diff = $end_sec - $st_sec;
+note("ntrans: $ntrans, Test time: $time_diff seconds ($end_sec, $st_sec)");
+is( ($ntrans == 4), 1, 'See values, committed in the past. The test execution time correlates with the time offset.');
+
+$node1->safe_psql('postgres', "UPDATE summary SET ntrans = 0, value = 0");
+$q1 = File::Temp->new();
+append_to_file($q1, q{
+	UPDATE summary SET value = value + 1, ntrans = ntrans + 1;
+});
+$q2 = File::Temp->new();
+append_to_file($q2, q{
+	START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+	UPDATE summary SET value = value + (SELECT SUM(ntrans) FROM summary);
+	UPDATE summary SET value = value - (SELECT SUM(ntrans) FROM summary);
+	COMMIT;
+});
+$seconds = 3;
+$pgb_handle1 = $node1->pgbench_async(-n, -c => 1, -T => $seconds, -f => $q1, 'postgres' );
+$pgb_handle2 = $node2->pgbench_async(-n, -c => 1, -T => $seconds, -f => $q2, 'postgres' );
+$node1->pgbench_await($pgb_handle1);
+$node2->pgbench_await($pgb_handle2);
+
+$count1 = $node1->safe_psql('postgres', "SELECT SUM(value) FROM summary");
+$count2 = $node1->safe_psql('postgres', "SELECT SUM(ntrans) FROM summary");
+note("$count1, $count2");
+is( ( ($count1 > 0) and ($count1 == $count2)), 1, 'Skew test');
+
+$node1->stop();
+$node2->stop();
diff --git a/src/test/modules/csnsnapshot/t/004_read_committed.pl b/src/test/modules/csnsnapshot/t/004_read_committed.pl
new file mode 100644
index 0000000000..ba27536a7a
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/004_read_committed.pl
@@ -0,0 +1,97 @@
+use strict;
+use warnings;
+
+use TestLib;
+use Test::More tests => 2;
+use PostgresNode;
+
+my $node1 = PostgresNode->new('csn1');
+$node1->init;
+$node1->append_conf('postgresql.conf', qq{
+					max_prepared_transactions = 20
+					shared_preload_libraries = 'postgres_fdw'
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					postgres_fdw.use_csn_snapshots = true
+					csn_time_shift = 0
+					});
+$node1->start;
+my $node2 = PostgresNode->new('csn2');
+$node2->init;
+$node2->append_conf('postgresql.conf', qq{
+					max_prepared_transactions = 20
+					shared_preload_libraries = 'postgres_fdw'
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					postgres_fdw.use_csn_snapshots = true
+					csn_time_shift = 0
+					});
+$node2->start;
+
+# Create foreign servers
+$node1->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node2->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+");
+$node2->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node1->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+");
+
+# Create sharded table
+$node1->safe_psql('postgres', "
+	CREATE TABLE dept1(name TEXT);
+	CREATE FOREIGN TABLE dept2 (name TEXT) SERVER remote;
+");
+$node2->safe_psql('postgres', "
+	CREATE TABLE dept2(name TEXT);
+	CREATE FOREIGN TABLE dept1 (name TEXT) SERVER remote;
+	CREATE TABLE results(success_tx int);
+	INSERT INTO results (success_tx) VALUES (0);
+");
+
+# Fill the table
+$node1->safe_psql('postgres', "INSERT INTO dept1 (name) VALUES ('Jonathan')");
+$node1->safe_psql('postgres', "INSERT INTO dept2 (name) VALUES ('Hoshi')");
+$node2->safe_psql('postgres', "INSERT INTO dept1 (name) VALUES ('Leonard')");
+my $count1 = $node1->safe_psql('postgres', "SELECT count(*) FROM ((SELECT * FROM dept1) UNION (SELECT * FROM dept2)) AS a");
+my $count2 = $node2->safe_psql('postgres', "SELECT count(*) FROM ((SELECT * FROM dept1) UNION (SELECT * FROM dept2)) AS a");
+note("$count1, $count2");
+is( (($count1 == 3) and ($count1 == $count2)), 1, 'Correct insert');
+
+# Queries
+my $q1 = File::Temp->new();
+append_to_file($q1, q{
+	BEGIN;
+	SELECT count(*) AS cnt FROM dept1; \gset
+	\if :cnt > 0
+		INSERT INTO dept2 (SELECT * FROM dept1);
+		DELETE FROM dept1;
+	\else
+		INSERT INTO dept1 (SELECT * FROM dept2);
+		DELETE FROM dept2;
+	\endif
+
+	COMMIT;
+});
+my $q2 = File::Temp->new();
+append_to_file($q2, q{
+	SELECT count(*) AS cnt FROM ((SELECT * FROM dept1) UNION (SELECT * FROM dept2)) AS a; \gset
+	\if :cnt = 3
+		UPDATE results SET success_tx = success_tx + 1;
+	\endif
+});
+my $transactions = 1000;
+my $pgb_handle1 = $node1->pgbench_async(-n, -c => 1, -t => $transactions, -f => $q1, 'postgres' );
+my $pgb_handle2 = $node2->pgbench_async(-n, -c => 20, -t => $transactions, -f => $q2, 'postgres' );
+$node1->pgbench_await($pgb_handle1);
+$node2->pgbench_await($pgb_handle2);
+
+$count2 = $node2->safe_psql('postgres', "SELECT success_tx FROM results");
+note("$count2");
+is( $count2, 20*$transactions, 'Correct READ COMMITTED updates');
+
+$node1->stop();
+$node2->stop();
diff --git a/src/test/modules/csnsnapshot/t/005_basic_visibility.pl b/src/test/modules/csnsnapshot/t/005_basic_visibility.pl
new file mode 100644
index 0000000000..93fa348e7b
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/005_basic_visibility.pl
@@ -0,0 +1,181 @@
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More tests => 3;
+
+my $node1 = PostgreSQL::Test::Cluster->new('csn1');
+$node1->init;
+$node1->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 30
+					csn_time_shift = 0
+					shared_preload_libraries = 'postgres_fdw'
+					postgres_fdw.use_csn_snapshots = true
+					log_statement = none
+					default_transaction_isolation = 'REPEATABLE READ'
+					log_min_messages = LOG
+					});
+
+my $node2 = PostgreSQL::Test::Cluster->new('csn2');
+$node2->init;
+$node2->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 30
+					csn_time_shift = 0
+					shared_preload_libraries = 'postgres_fdw'
+					postgres_fdw.use_csn_snapshots = true
+					log_statement = none
+					default_transaction_isolation = 'REPEATABLE READ'
+					log_min_messages = LOG
+					});
+$node1->start;
+$node2->start;
+
+$node1->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node2->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+	CREATE TABLE test(key int, x bigint) PARTITION BY LIST (key);
+	CREATE TABLE t1 PARTITION OF test FOR VALUES IN (1);
+	CREATE FOREIGN TABLE t2 PARTITION OF test FOR VALUES IN (2) SERVER remote;
+");
+$node2->safe_psql('postgres', "
+	CREATE EXTENSION postgres_fdw;
+	CREATE SERVER remote FOREIGN DATA WRAPPER postgres_fdw	OPTIONS (port '".$node1->port."');
+	CREATE USER MAPPING FOR PUBLIC SERVER remote;
+	CREATE TABLE test(key int, x bigint) PARTITION BY LIST (key);
+	CREATE FOREIGN TABLE t1 PARTITION OF test FOR VALUES IN (1) SERVER remote;
+	CREATE TABLE t2 PARTITION OF test FOR VALUES IN (2);
+");
+$node1->safe_psql('postgres', "
+	INSERT INTO test (key, x) VALUES (1, -1);
+	INSERT INTO test (key, x) VALUES (2, 1);
+");
+
+$node1->safe_psql('postgres', "VACUUM FULL");
+$node2->safe_psql('postgres', "VACUUM FULL");
+
+# ##############################################################################
+#
+# Tests
+#
+# ##############################################################################
+
+my $updates = File::Temp->new();
+append_to_file($updates, q{
+	BEGIN;
+		UPDATE test SET x = x + 1;
+		UPDATE test SET x = x - 1;
+	END;
+});
+
+my $local_update1 = File::Temp->new();
+append_to_file($local_update1, q{
+	BEGIN;
+		UPDATE t1 SET x = x + 1;
+		UPDATE t1 SET x = x - 1;
+	END;
+});
+my $local_update2 = File::Temp->new();
+append_to_file($local_update2, q{
+	BEGIN;
+		UPDATE t2 SET x = x + 1;
+		UPDATE t2 SET x = x - 1;
+	END;
+});
+
+my ($pgb_handle1, $pgb_handle2, $pgb_handle3, $sum1, $sum2, $errors, $selects, $started);
+my $test_time = 30;
+my $result;
+
+# ##############################################################################
+#
+# Concurrent local UPDATE and global SELECT
+#
+# ##############################################################################
+$errors = 0;
+$selects = 0;
+$started = time();
+$pgb_handle1 = $node1->pgbench_async(-n, -c => 5, -T => $test_time, -f => $local_update1, 'postgres' );
+while (time() - $started < $test_time)
+{
+	$result = $node2->safe_psql('postgres', "
+		SELECT 'sum=' || sum(x), (SELECT x FROM t1), (SELECT x FROM t2)
+		FROM test;");
+
+	if ( index($result, "sum=0") < 0 )
+	{
+		diag("[$selects] Isolation error. result = [ $result ]");
+		$errors++;
+		$node1->stop();
+		$node2->stop();
+		exit(1);
+	}
+	$selects++;
+}
+$node1->pgbench_await($pgb_handle1);
+note("TOTAL: selects = $selects, errors = $errors");
+is($errors == 0, 1, 'Local updates');
+#exit(1);
+
+# ##############################################################################
+#
+# Global UPDATE and global SELECT
+#
+# ##############################################################################
+$errors = 0;
+$selects = 0;
+$started = time();
+$pgb_handle1 = $node1->pgbench_async(-n, -c => 5, -T => $test_time, -f => $updates, 'postgres' );
+while (time() - $started < $test_time)
+{
+	$result = $node2->safe_psql('postgres', "
+		SELECT 'sum=' || sum(x), (SELECT x FROM t1), (SELECT x FROM t2)
+		FROM test;");
+
+	if ( index($result, "sum=0") < 0 )
+	{
+		diag("[$selects] Isolation error. result = [ $result ]");
+		$errors++;
+	}
+	$selects++;
+}
+$node1->pgbench_await($pgb_handle1);
+note("TOTAL: selects = $selects, errors = $errors");
+is($errors == 0, 1, 'Distributed updates');
+
+# ##############################################################################
+#
+# Local UPDATEs, global UPDATE and global SELECT
+#
+# ##############################################################################
+$errors = 0;
+$selects = 0;
+$started = time();
+$pgb_handle1 = $node1->pgbench_async(-n, -c => 2, -T => $test_time, -f => $updates, 'postgres' );
+$pgb_handle2 = $node2->pgbench_async(-n, -c => 2, -T => $test_time, -f => $local_update2, 'postgres' );
+$pgb_handle3 = $node1->pgbench_async(-n, -c => 2, -T => $test_time, -f => $local_update1, 'postgres' );
+while (time() - $started < $test_time)
+{
+	$sum1 = $node1->safe_psql('postgres', "SELECT sum(x) FROM test;");
+	$sum2 = $node2->safe_psql('postgres', "SELECT sum(x) FROM test;");
+
+	if ( ($sum1 ne 0) or ($sum2 ne 0) )
+	{
+		diag("[$selects] Isolation error. Sums = [ $sum1, $sum2 ]");
+		$errors++;
+	}
+	$selects++;
+}
+$node1->pgbench_await($pgb_handle1);
+$node1->pgbench_await($pgb_handle3);
+$node2->pgbench_await($pgb_handle2);
+note("TOTAL: selects = $selects, errors = $errors");
+is($errors == 0, 1, 'Mix of local and distributed updates');
+
+$node1->stop();
+$node2->stop();
diff --git a/src/test/modules/snapshot_too_old/sto.conf b/src/test/modules/snapshot_too_old/sto.conf
index 7eeaeeb0dc..3177cc0e15 100644
--- a/src/test/modules/snapshot_too_old/sto.conf
+++ b/src/test/modules/snapshot_too_old/sto.conf
@@ -1,2 +1,3 @@
 autovacuum = off
 old_snapshot_threshold = 0
+enable_csn_snapshot = false
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 2088857615..010b3b3144 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -104,6 +104,8 @@ select name, setting from pg_settings where name like 'enable%';
 --------------------------------+---------
  enable_async_append            | on
  enable_bitmapscan              | on
+ enable_csn_snapshot            | on
+ enable_csn_wal                 | on
  enable_gathermerge             | on
  enable_hashagg                 | on
  enable_hashjoin                | on
@@ -122,7 +124,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(20 rows)
+(22 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
-- 
2.25.1

#56

Andrey V. Lepikhov

a.lepikhov@postgrespro.ru

about 4 years ago

In reply to: tsunakawa.takay@fujitsu.com (#54)

1 attachment(s)

Patch in the previous letter is full of faulties. Please, use new version.
Also, here we fixed the problem with loosing CSN value in a parallel
worker (TAP test 003_parallel_safe.pl). Thanks for a.pyhalov for the
problem detection and a bugfix.

--
regards,
Andrey Lepikhov
Postgres Professional

Attachments:

v2-0001-Add-Commit-Sequence-Number-CSN-machinery-into-MVCC.patchtext/x-patch; charset=UTF-8; name=v2-0001-Add-Commit-Sequence-Number-CSN-machinery-into-MVCC.patchDownload

From 7aa57724fc42b8ca7054f9b6edfa33c0cffb24bf Mon Sep 17 00:00:00 2001
From: Andrey Lepikhov <a.lepikhov@postgrespro.ru>
Date: Wed, 17 Nov 2021 11:13:37 +0500
Subject: [PATCH] Add Commit Sequence Number (CSN) machinery into MVCC
 implementation for a timestamp-based resolving of visibility conflicts.

It allows to achieve proper snapshot isolation semantics in the case
of distributed transactions involving more than one Postgres instance.

Authors: K.Knizhnik, S.Kelvich, A.Sher, A.Lepikhov, M.Usama.

Discussion:
(2020/05/21 -)
https://www.postgresql.org/message-id/flat/CA%2Bfd4k6HE8xLGEvqWzABEg8kkju5MxU%2Bif7bf-md0_2pjzXp9Q%40mail.gmail.com#ed1359340871688bed2e643921f73365
(2018/05/01 - 2019/04/21)
https://www.postgresql.org/message-id/flat/21BC916B-80A1-43BF-8650-3363CCDAE09C%40postgrespro.ru
---
 doc/src/sgml/config.sgml                      |  50 +-
 src/backend/access/rmgrdesc/Makefile          |   1 +
 src/backend/access/rmgrdesc/csnlogdesc.c      |  95 +++
 src/backend/access/rmgrdesc/xlogdesc.c        |   6 +-
 src/backend/access/transam/Makefile           |   2 +
 src/backend/access/transam/csn_log.c          | 748 ++++++++++++++++++
 src/backend/access/transam/csn_snapshot.c     | 687 ++++++++++++++++
 src/backend/access/transam/rmgr.c             |   1 +
 src/backend/access/transam/twophase.c         | 154 ++++
 src/backend/access/transam/varsup.c           |   2 +
 src/backend/access/transam/xact.c             |  32 +
 src/backend/access/transam/xlog.c             |  23 +-
 src/backend/access/transam/xloginsert.c       |   2 +
 src/backend/commands/vacuum.c                 |   3 +-
 src/backend/replication/logical/snapbuild.c   |   4 +
 src/backend/storage/ipc/ipci.c                |   6 +
 src/backend/storage/ipc/procarray.c           |  85 ++
 src/backend/storage/lmgr/lwlock.c             |   2 +
 src/backend/storage/lmgr/lwlocknames.txt      |   2 +
 src/backend/storage/lmgr/proc.c               |   6 +
 src/backend/storage/sync/sync.c               |   5 +
 src/backend/utils/misc/guc.c                  |  37 +
 src/backend/utils/probes.d                    |   2 +
 src/backend/utils/time/snapmgr.c              | 183 ++++-
 src/bin/initdb/initdb.c                       |   3 +-
 src/bin/pg_controldata/pg_controldata.c       |   2 +
 src/bin/pg_upgrade/pg_upgrade.c               |   5 +
 src/bin/pg_upgrade/pg_upgrade.h               |   2 +
 src/bin/pg_waldump/rmgrdesc.c                 |   1 +
 src/include/access/csn_log.h                  |  98 +++
 src/include/access/csn_snapshot.h             |  54 ++
 src/include/access/rmgrlist.h                 |   1 +
 src/include/access/xlog_internal.h            |   2 +
 src/include/catalog/pg_control.h              |   1 +
 src/include/catalog/pg_proc.dat               |  17 +
 src/include/datatype/timestamp.h              |   3 +
 src/include/fmgr.h                            |   1 +
 src/include/portability/instr_time.h          |  10 +
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |  14 +
 src/include/storage/procarray.h               |   7 +
 src/include/storage/sync.h                    |   1 +
 src/include/utils/snapmgr.h                   |   7 +-
 src/include/utils/snapshot.h                  |  11 +
 src/test/modules/Makefile                     |   1 +
 src/test/modules/csnsnapshot/Makefile         |  22 +
 .../csnsnapshot/expected/csnsnapshot.out      |   1 +
 src/test/modules/csnsnapshot/t/001_base.pl    | 100 +++
 src/test/modules/csnsnapshot/t/002_standby.pl |  68 ++
 .../csnsnapshot/t/003_parallel_safe.pl        |  67 ++
 src/test/modules/snapshot_too_old/sto.conf    |   1 +
 src/test/perl/PostgreSQL/Test/Cluster.pm      |  28 +
 src/test/regress/expected/sysviews.out        |   4 +-
 53 files changed, 2660 insertions(+), 11 deletions(-)
 create mode 100644 src/backend/access/rmgrdesc/csnlogdesc.c
 create mode 100644 src/backend/access/transam/csn_log.c
 create mode 100644 src/backend/access/transam/csn_snapshot.c
 create mode 100644 src/include/access/csn_log.h
 create mode 100644 src/include/access/csn_snapshot.h
 create mode 100644 src/test/modules/csnsnapshot/Makefile
 create mode 100644 src/test/modules/csnsnapshot/expected/csnsnapshot.out
 create mode 100644 src/test/modules/csnsnapshot/t/001_base.pl
 create mode 100644 src/test/modules/csnsnapshot/t/002_standby.pl
 create mode 100644 src/test/modules/csnsnapshot/t/003_parallel_safe.pl

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 3f806740d5..f4f6c83fd0 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9682,8 +9682,56 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
      </varlistentry>
 
      </variablelist>
-   </sect1>
 
+    <sect2 id="runtime-config-CSN-based-snapshot">
+     <title>CSN Based Snapshot</title>
+      <para>
+       By default, snapshots in <productname>PostgreSQL</productname> contains a
+       XID (TransactionID) that allows to identify the status of a transaction
+       and make arbitrary visibility calculations.
+      </para>
+
+      <para>
+       <productname>PostgreSQL</productname> also provides a CSN (Commit
+        Sequence Number) based machinery as an additional tool for visibility
+        calculations. It may be used within distributed transactions when a xid of
+        a local transaction can't correctly identify order of the distributed one.
+      </para>
+
+     <variablelist>
+      <varlistentry id="guc-enable-csn-snapshot" xreflabel="enable_csn_snapshot">
+       <term><varname>enable_csn_snapshot</varname> (<type>boolean</type>)
+        <indexterm>
+         <primary><varname>enable_csn_snapshot</varname> configuration parameter</primary>
+        </indexterm>
+       </term>
+       <listitem>
+
+        <para>
+         Enable/disable the CSN tracking for the snapshot.
+        </para>
+
+        <para>
+        <productname>PostgreSQL</productname> uses a physical clock timestamp as
+        a CSN, so enabling the CSN based snapshots can be useful for implementing
+        cross-instance snapshots and visibility of distributed transaction.
+        </para>
+
+        <para>
+         when enabled <productname>PostgreSQL</productname> creates
+         <filename>pg_csn</filename> directory under <envar>PGDATA</envar> to keep
+         the track of CSN and XID mappings.
+        </para>
+
+        <para>
+         The default value is on.
+        </para>
+       </listitem>
+      </varlistentry>
+
+     </variablelist>
+    </sect2>
+   </sect1>
    <sect1 id="runtime-config-compatible">
     <title>Version and Platform Compatibility</title>
 
diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile
index f88d72fd86..15fc36f7b4 100644
--- a/src/backend/access/rmgrdesc/Makefile
+++ b/src/backend/access/rmgrdesc/Makefile
@@ -11,6 +11,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	brindesc.o \
 	clogdesc.o \
+	csnlogdesc.o \
 	committsdesc.o \
 	dbasedesc.o \
 	genericdesc.o \
diff --git a/src/backend/access/rmgrdesc/csnlogdesc.c b/src/backend/access/rmgrdesc/csnlogdesc.c
new file mode 100644
index 0000000000..f8c644e906
--- /dev/null
+++ b/src/backend/access/rmgrdesc/csnlogdesc.c
@@ -0,0 +1,95 @@
+/*-------------------------------------------------------------------------
+ *
+ * clogdesc.c
+ *	  rmgr descriptor routines for access/transam/csn_log.c
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/rmgrdesc/csnlogdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_log.h"
+
+
+void
+csnlog_desc(StringInfo buf, XLogReaderState *record)
+{
+	char	   *rec = XLogRecGetData(record);
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	if (info == XLOG_CSN_ZEROPAGE)
+	{
+		int pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		appendStringInfo(buf, "pageno %d", pageno);
+	}
+	else if (info == XLOG_CSN_TRUNCATE)
+	{
+		int pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		appendStringInfo(buf, "pageno %d", pageno);
+	}
+	else if (info == XLOG_CSN_ASSIGNMENT)
+	{
+		CSN csn;
+
+		memcpy(&csn, XLogRecGetData(record), sizeof(CSN));
+		appendStringInfo(buf, "assign "INT64_FORMAT"", csn);
+	}
+	else if (info == XLOG_CSN_SETCSN)
+	{
+		xl_csn_set *xlrec = (xl_csn_set *) rec;
+		int			  nsubxids;
+
+		appendStringInfo(buf, "set "INT64_FORMAT" for: %u",
+						 xlrec->csn,
+						 xlrec->xtop);
+		nsubxids = ((XLogRecGetDataLen(record) - MinSizeOfCSNSet) /
+					sizeof(TransactionId));
+		if (nsubxids > 0)
+		{
+			int			i;
+			TransactionId *subxids;
+
+			subxids = palloc(sizeof(TransactionId) * nsubxids);
+			memcpy(subxids,
+				   XLogRecGetData(record) + MinSizeOfCSNSet,
+				   sizeof(TransactionId) * nsubxids);
+			for (i = 0; i < nsubxids; i++)
+				appendStringInfo(buf, ", %u", subxids[i]);
+			pfree(subxids);
+		}
+	}
+}
+
+const char *
+csnlog_identify(uint8 info)
+{
+	const char *id = NULL;
+
+	switch (info & ~XLR_INFO_MASK)
+	{
+		case XLOG_CSN_ASSIGNMENT:
+			id = "ASSIGNMENT";
+			break;
+		case XLOG_CSN_SETCSN:
+			id = "SETCSN";
+			break;
+		case XLOG_CSN_ZEROPAGE:
+			id = "ZEROPAGE";
+			break;
+		case XLOG_CSN_TRUNCATE:
+			id = "TRUNCATE";
+			break;
+	}
+
+	return id;
+}
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 5bf2346dd9..ea433046cf 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -113,7 +113,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		appendStringInfo(buf, "max_connections=%d max_worker_processes=%d "
 						 "max_wal_senders=%d max_prepared_xacts=%d "
 						 "max_locks_per_xact=%d wal_level=%s "
-						 "wal_log_hints=%s track_commit_timestamp=%s",
+						 "wal_log_hints=%s track_commit_timestamp=%s "
+						 "enable_csn_snapshot=%s",
 						 xlrec.MaxConnections,
 						 xlrec.max_worker_processes,
 						 xlrec.max_wal_senders,
@@ -121,7 +122,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 						 xlrec.max_locks_per_xact,
 						 wal_level_str,
 						 xlrec.wal_log_hints ? "on" : "off",
-						 xlrec.track_commit_timestamp ? "on" : "off");
+						 xlrec.track_commit_timestamp ? "on" : "off",
+						 xlrec.enable_csn_snapshot ? "on" : "off");
 	}
 	else if (info == XLOG_FPW_CHANGE)
 	{
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 595e02de72..fc0321ee6b 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -15,6 +15,8 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	clog.o \
 	commit_ts.o \
+	csn_log.o \
+	csn_snapshot.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/csn_log.c b/src/backend/access/transam/csn_log.c
new file mode 100644
index 0000000000..33517271ed
--- /dev/null
+++ b/src/backend/access/transam/csn_log.c
@@ -0,0 +1,748 @@
+/*-----------------------------------------------------------------------------
+ *
+ * csn_log.c
+ *		Track commit sequence numbers of finished transactions
+ *
+ * This module provides SLRU to store CSN for each transaction.  This
+ * mapping need to be kept only for xid's greater then oldestXid, but
+ * that can require arbitrary large amounts of memory in case of long-lived
+ * transactions.  Because of same lifetime and persistancy requirements
+ * this module is quite similar to subtrans.c
+ *
+ * If we switch database from CSN-base snapshot to xid-base snapshot then,
+ * nothing wrong. But if we switch xid-base snapshot to CSN-base snapshot
+ * it should decide a new xid which begin csn-base check. It can not be
+ * oldestActiveXID because of prepared transaction.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csn_log.c
+ *
+ *-----------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_log.h"
+#include "access/slru.h"
+#include "access/csn_snapshot.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "portability/instr_time.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/snapmgr.h"
+#include "access/xlog_internal.h"
+
+/*
+ * We use csnSnapshotActive to judge if csn snapshot enabled instead of by
+ * enable_csn_snapshot, this design is similar to 'track_commit_timestamp'.
+ *
+ * Because in process of replication if master changes 'enable_csn_snapshot'
+ * in a database restart, standby should apply wal record for GUC changed,
+ * then it's difficult to notice all backends about that. So they can get
+ * the message by 'csnSnapshotActive' which in shared buffer. It will not
+ * acquire a lock, so without performance issue.
+ * last_max_csn - Record the max csn till now.
+ * last_csn_log_wal - for interval we log the assign csn to wal
+ * oldestXmin - first sensible Xmin on the first existed page in the CSN Log
+ */
+typedef struct CSNShared
+{
+	bool				csnSnapshotActive;
+	pg_atomic_uint32	oldestXmin;
+	CSN					last_max_csn;
+	CSN					last_csn_log_wal;
+	volatile slock_t	lock;
+} CSNShared;
+
+CSNShared *csnShared;
+
+/*
+ * Defines for CSNLog page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * CSNLog page numbering also wraps around at
+ * 0xFFFFFFFF/CSN_LOG_XACTS_PER_PAGE, and CSNLog segment numbering at
+ * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateCSNLog (see CSNLogPagePrecedes).
+ */
+
+/* We store the commit CSN for each xid */
+#define CSN_LOG_XACTS_PER_PAGE (BLCKSZ / sizeof(CSN))
+
+#define TransactionIdToPage(xid)	((xid) / (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+
+/*
+ * Link to shared-memory data structures for CLOG control
+ */
+static SlruCtlData CSNLogCtlData;
+#define CsnlogCtl (&CSNLogCtlData)
+
+static int	ZeroCSNLogPage(int pageno, bool write_xlog);
+static void ZeroTruncateCSNLogPage(int pageno, bool write_xlog);
+static bool CSNLogPagePrecedes(int page1, int page2);
+static void CSNLogSetPageStatus(TransactionId xid, int nsubxids,
+									  TransactionId *subxids,
+									  CSN csn, int pageno);
+static void CSNLogSetCSNInSlot(TransactionId xid, CSN csn, int slotno);
+
+static void WriteCSNXlogRec(TransactionId xid, int nsubxids,
+							TransactionId *subxids, CSN csn);
+static void WriteZeroCSNPageXlogRec(int pageno);
+static void WriteTruncateCSNXlogRec(int pageno);
+static void set_oldest_xmin(TransactionId xid);
+
+
+/*
+ * Number of shared CSNLog buffers.
+ */
+static Size
+CSNLogShmemBuffers(void)
+{
+	return Min(32, Max(4, NBuffers / 512));
+}
+
+/*
+ * Reserve shared memory for CsnlogCtl.
+ */
+Size
+CSNLogShmemSize(void)
+{
+	return SimpleLruShmemSize(CSNLogShmemBuffers(), 0);
+}
+
+/*
+ * Initialization of shared memory for CSNLog.
+ */
+void
+CSNLogShmemInit(void)
+{
+	bool		found;
+
+	CsnlogCtl->PagePrecedes = CSNLogPagePrecedes;
+	SimpleLruInit(CsnlogCtl, "CSNLog Ctl", CSNLogShmemBuffers(), 0,
+				  CSNLogSLRULock, "pg_csn", LWTRANCHE_CSN_LOG_BUFFERS,
+				  SYNC_HANDLER_CSN);
+
+	csnShared = ShmemInitStruct("CSNlog shared",
+									 sizeof(CSNShared),
+									 &found);
+	if (!found)
+	{
+		csnShared->csnSnapshotActive = false;
+		pg_atomic_init_u32(&csnShared->oldestXmin, InvalidTransactionId);
+		csnShared->last_max_csn = InvalidCSN;
+		csnShared->last_csn_log_wal = InvalidCSN;
+		SpinLockInit(&csnShared->lock);
+	}
+}
+
+/*
+ * CSNLogSetCSN
+ *
+ * Record CSN of transaction and its subtransaction tree.
+ *
+ * xid is a single xid to set status for. This will typically be the top level
+ * transactionid for a top level commit or abort. It can also be a
+ * subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * csn is the commit sequence number of the transaction. It should be
+ * AbortedCSN for abort cases.
+ */
+void
+CSNLogSetCSN(TransactionId xid, int nsubxids, TransactionId *subxids, CSN csn,
+			 bool write_xlog)
+{
+	int pageno;
+	int i = 0;
+	int offset = 0;
+
+	Assert(TransactionIdIsValid(xid));
+
+	pageno = TransactionIdToPage(xid);		/* get page of parent */
+
+	if(write_xlog)
+		WriteCSNXlogRec(xid, nsubxids, subxids, csn);
+
+	for (;;)
+	{
+		int num_on_page = 0;
+
+		/* Form subtransactions bucket that can be written on the same page */
+		while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno)
+		{
+			num_on_page++;
+			i++;
+		}
+
+		CSNLogSetPageStatus(xid,
+							num_on_page, subxids + offset,
+							csn, pageno);
+		if (i >= nsubxids)
+			break;
+
+		offset = i;
+		pageno = TransactionIdToPage(subxids[offset]);
+		xid = InvalidTransactionId;
+	}
+}
+
+/*
+ * Record the final state of transaction entries in the csn log for
+ * all entries on a single page.  Atomic only on this page.
+ *
+ * Otherwise API is same as TransactionIdSetTreeStatus()
+ */
+static void
+CSNLogSetPageStatus(TransactionId xid, int nsubxids, TransactionId *subxids,
+					CSN csn, int pageno)
+{
+	int slotno;
+	int i;
+
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+
+	slotno = SimpleLruReadPage(CsnlogCtl, pageno, true, xid);
+
+	/* Subtransactions first, if needed ... */
+	for (i = 0; i < nsubxids; i++)
+	{
+		Assert(CsnlogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+		CSNLogSetCSNInSlot(subxids[i], csn, slotno);
+	}
+
+	/* ... then the main transaction */
+	if (TransactionIdIsValid(xid))
+		CSNLogSetCSNInSlot(xid, csn, slotno);
+
+	CsnlogCtl->shared->page_dirty[slotno] = true;
+
+	LWLockRelease(CSNLogSLRULock);
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ */
+static void
+CSNLogSetCSNInSlot(TransactionId xid, CSN csn, int slotno)
+{
+	int entryno = TransactionIdToPgIndex(xid);
+	CSN *ptr;
+
+	Assert(LWLockHeldByMe(CSNLogSLRULock));
+
+	ptr = (CSN *) (CsnlogCtl->shared->page_buffer[slotno] +
+														entryno * sizeof(CSN));
+	*ptr = csn;
+}
+
+/*
+ * Interrogate the state of a transaction in the log.
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; TransactionIdGetCSN() in csn_snapshot.c is the
+ * intended caller.
+ */
+CSN
+CSNLogGetCSNByXid(TransactionId xid)
+{
+	int pageno = TransactionIdToPage(xid);
+	int entryno = TransactionIdToPgIndex(xid);
+	int slotno;
+	CSN csn;
+
+	/* lock is acquired by SimpleLruReadPage_ReadOnly */
+	slotno = SimpleLruReadPage_ReadOnly(CsnlogCtl, pageno, xid);
+	csn = *(CSN *) (CsnlogCtl->shared->page_buffer[slotno] +
+														entryno * sizeof(CSN));
+	LWLockRelease(CSNLogSLRULock);
+
+	return csn;
+}
+
+/*
+ * Initialize (or reinitialize) a page of CSNLog to zeroes.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroCSNLogPage(int pageno, bool write_xlog)
+{
+	Assert(LWLockHeldByMe(CSNLogSLRULock));
+	if(write_xlog)
+		WriteZeroCSNPageXlogRec(pageno);
+	return SimpleLruZeroPage(CsnlogCtl, pageno);
+}
+
+static void
+ZeroTruncateCSNLogPage(int pageno, bool write_xlog)
+{
+	if(write_xlog)
+		WriteTruncateCSNXlogRec(pageno);
+	SimpleLruTruncate(CsnlogCtl, pageno);
+}
+
+void
+ActivateCSNlog(void)
+{
+	int				pageno;
+	TransactionId	nextXid = InvalidTransactionId;
+	TransactionId	oldest_xid = InvalidTransactionId;
+
+	if (csnShared->csnSnapshotActive)
+		return;
+
+	nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+	pageno = TransactionIdToPage(nextXid);
+
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+
+	/*
+	 * Create the current segment file, if necessary.
+	 * This means that
+	 */
+	if (!SimpleLruDoesPhysicalPageExist(CsnlogCtl, pageno))
+	{
+		int slotno;
+		TransactionId curxid = nextXid;
+
+		slotno = ZeroCSNLogPage(pageno, false);
+		SimpleLruWritePage(CsnlogCtl, slotno);
+
+		elog(LOG, "Create SLRU page=%d, slotno=%d for xid %u on a CSN log activation",
+			 pageno, slotno, nextXid);
+
+		/*
+		 * nextXid isn't first xid on the page. It is the first page in the CSN
+		 * log. Set UnclearCSN value into all previous slots on this page.
+		 * This xid value can be used as an oldest xid in the CSN log.
+		 */
+		if (TransactionIdToPgIndex(nextXid) > 0)
+		{
+			/* Cleaning procedure. Can be optimized. */
+			do
+			{
+				curxid--;
+				CSNLogSetCSNInSlot(curxid, UnclearCSN, slotno);
+			} while (TransactionIdToPgIndex(curxid) > 0);
+
+			elog(LOG,
+				 "Set UnclearCSN values for %d xids in the range [%u,%u]",
+				 nextXid - curxid, curxid, nextXid-1);
+
+			/* Oldest XID found on this page */
+			oldest_xid = nextXid;
+		}
+	}
+	LWLockRelease(CSNLogSLRULock);
+
+	if (!TransactionIdIsValid(oldest_xid))
+	{
+		TransactionId curxid;
+
+		elog(LOG, "Search for the oldest xid across previous pages");
+
+		/* Need to scan previous pages for an oldest xid. */
+		while (pageno > 0 && SimpleLruDoesPhysicalPageExist(CsnlogCtl, pageno - 1))
+			pageno--;
+
+		/* look up for the first clear xid value. */
+		curxid = pageno * (TransactionId) CSN_LOG_XACTS_PER_PAGE;
+		while(CSNLogGetCSNByXid(curxid) == UnclearCSN)
+			curxid++;
+		oldest_xid = curxid;
+	}
+
+	set_oldest_xmin(oldest_xid);
+	csnShared->csnSnapshotActive = true;
+}
+
+bool
+get_csnlog_status(void)
+{
+	return csnShared->csnSnapshotActive;
+}
+
+void
+DeactivateCSNlog(void)
+{
+	csnShared->csnSnapshotActive = false;
+	set_oldest_xmin(InvalidTransactionId);
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+	(void) SlruScanDirectory(CsnlogCtl, SlruScanDirCbDeleteAll, NULL);
+	LWLockRelease(CSNLogSLRULock);
+	elog(LOG, "CSN log has deactivated");
+}
+
+void
+StartupCSN(void)
+{
+	ActivateCSNlog();
+}
+
+void
+CompleteCSNInitialization(void)
+{
+	/*
+	 * If the feature is not enabled, turn it off for good.  This also removes
+	 * any leftover data.
+	 *
+	 * Conversely, we activate the module if the feature is enabled.  This is
+	 * necessary for primary and standby as the activation depends on the
+	 * control file contents at the beginning of recovery or when a
+	 * XLOG_PARAMETER_CHANGE is replayed.
+	 */
+	if (!enable_csn_snapshot)
+		DeactivateCSNlog();
+	else
+		ActivateCSNlog();
+}
+
+void
+CSNlogParameterChange(bool newvalue, bool oldvalue)
+{
+	if (newvalue)
+	{
+		if (!csnShared->csnSnapshotActive)
+			ActivateCSNlog();
+	}
+	else if (csnShared->csnSnapshotActive)
+		DeactivateCSNlog();
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointCSNLog(void)
+{
+	if (!get_csnlog_status())
+		return;
+
+	/*
+	 * Flush dirty CSNLog pages to disk.
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely to improve the odds that writing of dirty pages is done by
+	 * the checkpoint process and not by backends.
+	 */
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(true);
+	SimpleLruWriteAll(CsnlogCtl, true);
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(true);
+}
+
+/*
+ * Make sure that CSNLog has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty clog or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendCSNLog(TransactionId newestXact)
+{
+	int			pageno;
+
+	if (!get_csnlog_status())
+		return;
+
+	/*
+	 * No work except at first XID of a page.  But beware: just after
+	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
+	 */
+	if (TransactionIdToPgIndex(newestXact) != 0 &&
+		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
+		return;
+
+	pageno = TransactionIdToPage(newestXact);
+
+	LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+
+	/* Zero the page and make an XLOG entry about it */
+	ZeroCSNLogPage(pageno, !InRecovery);
+
+	LWLockRelease(CSNLogSLRULock);
+}
+
+/*
+ * Remove all CSNLog segments before the one holding the passed
+ * transaction ID.
+ *
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest TransactionXmin of any running transaction.
+ */
+void
+TruncateCSNLog(TransactionId oldestXact)
+{
+	int				cutoffPage;
+	TransactionId	oldestXmin;
+
+	/* Can't do truncation because WAL messages isn't allowed during recovery */
+	if (RecoveryInProgress() || !get_csnlog_status())
+		return;
+
+	/*
+	 * The cutoff point is the start of the segment containing oldestXact. We
+	 * pass the *page* containing oldestXact to SimpleLruTruncate. We step
+	 * back one transaction to avoid passing a cutoff page that hasn't been
+	 * created yet in the rare case that oldestXact would be the first item on
+	 * a page and oldestXact == next XID.  In that case, if we didn't subtract
+	 * one, we'd trigger SimpleLruTruncate's wraparound detection.
+	 */
+	TransactionIdRetreat(oldestXact);
+	cutoffPage = TransactionIdToPage(oldestXact);
+
+	/* Detect, that we really need to cut CSN log. */
+	oldestXmin = pg_atomic_read_u32(&csnShared->oldestXmin);
+
+	if (TransactionIdToPage(oldestXmin) < cutoffPage)
+	{
+		/* OldestXact is located in the same page as oldestXmin. No actions needed. */
+		return;
+	}
+
+	/*
+	 * Shift oldestXmin to the start of new first page. Use first position
+	 * on the page because all transactions on this page is created with enabled
+	 * CSN snapshot machinery.
+	 */
+	pg_atomic_write_u32(&csnShared->oldestXmin,
+						oldestXact - TransactionIdToPgIndex(oldestXact));
+
+	SpinLockRelease(&csnShared->lock);
+	ZeroTruncateCSNLogPage(cutoffPage, true);
+}
+
+/*
+ * Decide which of two CSNLog page numbers is "older" for truncation
+ * purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */
+static bool
+CSNLogPagePrecedes(int page1, int page2)
+{
+	TransactionId xid1;
+	TransactionId xid2;
+
+	xid1 = ((TransactionId) page1) * CSN_LOG_XACTS_PER_PAGE;
+	xid1 += FirstNormalTransactionId;
+	xid2 = ((TransactionId) page2) * CSN_LOG_XACTS_PER_PAGE;
+	xid2 += FirstNormalTransactionId;
+
+	return TransactionIdPrecedes(xid1, xid2);
+}
+
+void
+WriteAssignCSNXlogRec(CSN csn)
+{
+	Assert(enable_csn_wal && csn <= csnShared->last_csn_log_wal);
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&csn), sizeof(CSN));
+	XLogInsert(RM_CSNLOG_ID, XLOG_CSN_ASSIGNMENT);
+}
+
+static void
+WriteCSNXlogRec(TransactionId xid, int nsubxids,
+				TransactionId *subxids, CSN csn)
+{
+	xl_csn_set xlrec;
+
+	if(!enable_csn_wal)
+		return;
+
+	xlrec.xtop = xid;
+	xlrec.nsubxacts = nsubxids;
+	xlrec.csn = csn;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, MinSizeOfCSNSet);
+	XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId));
+	XLogInsert(RM_CSNLOG_ID, XLOG_CSN_SETCSN);
+}
+
+/*
+ * Write a ZEROPAGE xlog record
+ */
+static void
+WriteZeroCSNPageXlogRec(int pageno)
+{
+	if(!enable_csn_wal)
+	{
+		return;
+	}
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&pageno), sizeof(int));
+	(void) XLogInsert(RM_CSNLOG_ID, XLOG_CSN_ZEROPAGE);
+}
+
+/*
+ * Write a TRUNCATE xlog record
+ */
+static void
+WriteTruncateCSNXlogRec(int pageno)
+{
+	if(!enable_csn_wal)
+	{
+		return;
+	}
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&pageno), sizeof(int));
+	XLogInsert(RM_CSNLOG_ID, XLOG_CSN_TRUNCATE);
+}
+
+
+void
+csnlog_redo(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	/* Backup blocks are not used in csnlog records */
+	Assert(!XLogRecHasAnyBlockRefs(record));
+
+	if (info == XLOG_CSN_ASSIGNMENT)
+	{
+		CSN csn;
+
+		memcpy(&csn, XLogRecGetData(record), sizeof(CSN));
+		/* XXX: Do we really not needed to acquire the lock here? */
+		csnShared->last_max_csn = csn;
+	}
+	else if (info == XLOG_CSN_SETCSN)
+	{
+		xl_csn_set *xlrec = (xl_csn_set *) XLogRecGetData(record);
+		CSNLogSetCSN(xlrec->xtop, xlrec->nsubxacts, xlrec->xsub, xlrec->csn, false);
+	}
+	else if (info == XLOG_CSN_ZEROPAGE)
+	{
+		int			pageno;
+		int			slotno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		LWLockAcquire(CSNLogSLRULock, LW_EXCLUSIVE);
+		slotno = ZeroCSNLogPage(pageno, false);
+		SimpleLruWritePage(CsnlogCtl, slotno);
+		LWLockRelease(CSNLogSLRULock);
+		Assert(!CsnlogCtl->shared->page_dirty[slotno]);
+
+	}
+	else if (info == XLOG_CSN_TRUNCATE)
+	{
+		int			pageno;
+
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		CsnlogCtl->shared->latest_page_number = pageno;
+		ZeroTruncateCSNLogPage(pageno, false);
+	}
+	else
+		elog(PANIC, "csnlog_redo: unknown op code %u", info);
+}
+
+/*
+ * Entrypoint for sync.c to sync members files.
+ */
+int
+csnsyncfiletag(const FileTag *ftag, char *path)
+{
+	return SlruSyncFileTag(&CSNLogCtlData, ftag, path);
+}
+
+/*
+ * GenerateCSN
+ *
+ * Generate CSN which is actually a local time. Also we are forcing
+ * this time to be always increasing. Since now it is not uncommon to have
+ * millions of read transactions per second we are trying to use nanoseconds
+ * if such time resolution is available.
+ */
+CSN
+GenerateCSN(bool locked, CSN assign)
+{
+	instr_time	current_time;
+	CSN	csn;
+	CSN log_csn = InvalidCSN;
+
+	Assert(get_csnlog_status() || csn_snapshot_defer_time > 0);
+
+	/* TODO: create some macro that add small random shift to current time. */
+	INSTR_TIME_SET_CURRENT(current_time);
+	csn = (CSN) INSTR_TIME_GET_NANOSEC(current_time) + (int64) (csn_time_shift * 1E9);
+
+	if(assign != InvalidCSN && csn < assign)
+		csn = assign;
+
+	/* TODO: change to atomics? */
+	if (!locked)
+		SpinLockAcquire(&csnShared->lock);
+
+	if (csn <= csnShared->last_max_csn)
+		csn = csnShared->last_max_csn + 1;
+	csnShared->last_max_csn = csn;
+
+	if (enable_csn_wal && csn > csnShared->last_csn_log_wal)
+	{
+		/*
+		 * We log the CSN 5s greater than generated, you can see comments on
+		 * the CSN_ASSIGN_TIME_INTERVAL.
+		 */
+		log_csn = CSNAddByNanosec(csn, CSN_ASSIGN_TIME_INTERVAL);
+		csnShared->last_csn_log_wal = log_csn;
+	}
+
+	if (!locked)
+		SpinLockRelease(&csnShared->lock);
+
+	if (log_csn != InvalidCSN)
+		WriteAssignCSNXlogRec(csn);
+
+	return csn;
+}
+
+CSN
+GetLastGeneratedCSN(void)
+{
+	CSN csn;
+
+	SpinLockAcquire(&csnShared->lock);
+	csn = csnShared->last_max_csn;
+	SpinLockRelease(&csnShared->lock);
+	return csn;
+}
+
+/*
+ * Mostly for debug purposes.
+ */
+static void
+set_oldest_xmin(TransactionId xid)
+{
+	elog(LOG, "Oldest Xmin for CSN will be changed from %u to %u",
+		 pg_atomic_read_u32(&csnShared->oldestXmin), xid);
+
+	pg_atomic_write_u32(&csnShared->oldestXmin, xid);
+}
+
+TransactionId
+GetOldestXmin(void)
+{
+	Assert(get_csnlog_status());
+	return pg_atomic_read_u32(&csnShared->oldestXmin);
+}
diff --git a/src/backend/access/transam/csn_snapshot.c b/src/backend/access/transam/csn_snapshot.c
new file mode 100644
index 0000000000..a381d219ea
--- /dev/null
+++ b/src/backend/access/transam/csn_snapshot.c
@@ -0,0 +1,687 @@
+/*-------------------------------------------------------------------------
+ *
+ * csn_snapshot.c
+ *		Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csn_snapshot.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csn_snapshot.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "portability/instr_time.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/snapmgr.h"
+#include "miscadmin.h"
+
+/* Raise a warning if imported snapshot_csn exceeds ours by this value. */
+#define SNAP_DESYNC_COMPLAIN (1*NSECS_PER_SEC) /* 1 second */
+
+static TransactionId xmin_for_csn = InvalidTransactionId;
+
+
+/*
+ * GUC to delay advance of oldestXid for this amount of time. Also determines
+ * the size CSNSnapshotXidMap circular buffer.
+ */
+int csn_snapshot_defer_time;
+
+int csn_time_shift;
+
+/*
+ * CSNSnapshotXidMap
+ *
+ * To be able to install csn snapshot that points to past we need to keep
+ * old versions of tuples and therefore delay advance of oldestXid.  Here we
+ * keep track of correspondence between snapshot's snapshot_csn and oldestXid
+ * that was set at the time when the snapshot was taken.  Much like the
+ * snapshot too old's OldSnapshotControlData does, but with finer granularity
+ * to seconds.
+ *
+ * Different strategies can be employed to hold oldestXid (e.g. we can track
+ * oldest csn-based snapshot among cluster nodes and map it oldestXid
+ * on each node).
+ *
+ * On each snapshot acquisition CSNSnapshotMapXmin() is called and stores
+ * correspondence between current snapshot_csn and oldestXmin in a sparse way:
+ * snapshot_csn is rounded to seconds (and here we use the fact that snapshot_csn
+ * is just a timestamp) and oldestXmin is stored in the circular buffer where
+ * rounded snapshot_csn acts as an offset from current circular buffer head.
+ * Size of the circular buffer is controlled by csn_snapshot_defer_time GUC.
+ *
+ * When csn snapshot arrives we check that its
+ * snapshot_csn is still in our map, otherwise we'll error out with "snapshot too
+ * old" message.  If snapshot_csn is successfully mapped to oldestXid we move
+ * backend's pgxact->xmin to proc->originalXmin and fill pgxact->xmin to
+ * mapped oldestXid.  That way GetOldestXmin() can take into account backends
+ * with imported csn snapshot and old tuple versions will be preserved.
+ *
+ * Also while calculating oldestXmin for our map in presence of imported
+ * csn snapshots we should use proc->originalXmin instead of pgxact->xmin
+ * that was set during import.  Otherwise, we can create a feedback loop:
+ * xmin's of imported csn snapshots were calculated using our map and new
+ * entries in map going to be calculated based on that xmin's, and there is
+ * a risk to stuck forever with one non-increasing oldestXmin.  All other
+ * callers of GetOldestXmin() are using pgxact->xmin so the old tuple versions
+ * are preserved.
+ */
+typedef struct CSNSnapshotXidMap
+{
+	int				 head;				/* offset of current freshest value */
+	int				 size;				/* total size of circular buffer */
+	CSN_atomic		 last_csn_seconds;	/* last rounded csn that changed
+										 * xmin_by_second[] */
+	TransactionId   *xmin_by_second;	/* circular buffer of oldestXmin's */
+}
+CSNSnapshotXidMap;
+
+static CSNSnapshotXidMap *csnXidMap;
+
+
+/* Estimate shared memory space needed */
+Size
+CSNSnapshotShmemSize(void)
+{
+	Size	size = 0;
+
+	if (csn_snapshot_defer_time > 0)
+	{
+		size += sizeof(CSNSnapshotXidMap);
+		size += csn_snapshot_defer_time*sizeof(TransactionId);
+		size = MAXALIGN(size);
+	}
+
+	return size;
+}
+
+/* Init shared memory structures */
+void
+CSNSnapshotShmemInit()
+{
+	bool found;
+
+	if (csn_snapshot_defer_time > 0)
+	{
+		csnXidMap = ShmemInitStruct("csnXidMap",
+								   sizeof(CSNSnapshotXidMap),
+								   &found);
+		if (!found)
+		{
+			int i;
+
+			pg_atomic_init_u64(&csnXidMap->last_csn_seconds, 0);
+			csnXidMap->head = 0;
+			csnXidMap->size = csn_snapshot_defer_time;
+			csnXidMap->xmin_by_second =
+							ShmemAlloc(sizeof(TransactionId)*csnXidMap->size);
+
+			for (i = 0; i < csnXidMap->size; i++)
+				csnXidMap->xmin_by_second[i] = InvalidTransactionId;
+		}
+	}
+}
+
+/*
+ * CSNSnapshotStartup
+ *
+ * Set csnXidMap entries to oldestActiveXID during startup.
+ */
+void
+CSNSnapshotStartup(TransactionId oldestActiveXID)
+{
+	/*
+	 * Run only if we have initialized shared memory and csnXidMap
+	 * is enabled.
+	 */
+	if (IsNormalProcessingMode() &&
+		enable_csn_snapshot && csn_snapshot_defer_time > 0)
+	{
+		int i;
+
+		Assert(TransactionIdIsValid(oldestActiveXID));
+		for (i = 0; i < csnXidMap->size; i++)
+			csnXidMap->xmin_by_second[i] = oldestActiveXID;
+		ProcArraySetCSNSnapshotXmin(oldestActiveXID);
+
+		elog(LOG, "CSN map initialized with oldest active xid %u", oldestActiveXID);
+	}
+}
+
+/*
+ * CSNSnapshotMapXmin
+ *
+ * Maintain circular buffer of oldestXmins for several seconds in past. This
+ * buffer allows to shift oldestXmin in the past when backend is importing
+ * CSN snapshot. Otherwise old versions of tuples that were needed for
+ * this transaction can be recycled by other processes (vacuum, HOT, etc).
+ *
+ * Locking here is not trivial. Called upon each snapshot creation after
+ * ProcArrayLock is released. Such usage creates several race conditions. It
+ * is possible that backend who got csn called CSNSnapshotMapXmin()
+ * only after other backends managed to get snapshot and complete
+ * CSNSnapshotMapXmin() call, or even committed. This is safe because
+ *
+ *		* We already hold our xmin in MyPgXact, so our snapshot will not be
+ *		  harmed even though ProcArrayLock is released.
+ *
+ *		* snapshot_csn is always pessmistically rounded up to the next
+ *		  second.
+ *
+ *		* For performance reasons, xmin value for particular second is filled
+ *		  only once. Because of that instead of writing to buffer just our
+ *		  xmin (which is enough for our snapshot), we bump oldestXmin there --
+ *		  it mitigates the possibility of damaging someone else's snapshot by
+ *		  writing to the buffer too advanced value in case of slowness of
+ *		  another backend who generated csn earlier, but didn't manage to
+ *		  insert it before us.
+ *
+ *		* if CSNSnapshotMapXmin() founds a gap in several seconds between
+ *		  current call and latest completed call then it should fill that gap
+ *		  with latest known values instead of new one. Otherwise it is
+ *		  possible (however highly unlikely) that this gap also happend
+ *		  between taking snapshot and call to CSNSnapshotMapXmin() for some
+ *		  backend. And we are at risk to fill circullar buffer with
+ *		  oldestXmin's that are bigger then they actually were.
+ */
+void
+CSNSnapshotMapXmin(SnapshotCSN snapshot_csn)
+{
+	int offset, gap, i;
+	SnapshotCSN csn_seconds;
+	SnapshotCSN last_csn_seconds;
+	volatile TransactionId oldest_deferred_xmin;
+	TransactionId current_oldest_xmin, previous_oldest_xmin;
+	TransactionId ImportedXmin;
+
+	/* Callers should check config values */
+	Assert(csn_snapshot_defer_time > 0);
+	Assert(csnXidMap != NULL);
+	/*
+	 * Round up snapshot_csn to the next second -- pessimistically and safely.
+	 */
+	csn_seconds = (snapshot_csn / NSECS_PER_SEC + 1);
+
+	/*
+	 * Fast-path check. Avoid taking exclusive CSNSnapshotXidMapLock lock
+	 * if oldestXid was already written to xmin_by_second[] for this rounded
+	 * snapshot_csn.
+	 */
+	if (pg_atomic_read_u64(&csnXidMap->last_csn_seconds) >= csn_seconds)
+		return;
+
+	/* Ok, we have new entry (or entries) */
+	LWLockAcquire(CSNSnapshotXidMapLock, LW_EXCLUSIVE);
+
+	/* Re-check last_csn_seconds under lock */
+	last_csn_seconds = pg_atomic_read_u64(&csnXidMap->last_csn_seconds);
+	if (last_csn_seconds >= csn_seconds)
+	{
+		LWLockRelease(CSNSnapshotXidMapLock);
+		return;
+	}
+	pg_atomic_write_u64(&csnXidMap->last_csn_seconds, csn_seconds);
+
+	/*
+	 * Count oldest_xmin.
+	 *
+	 * It was possible to calculate oldest_xmin during corresponding snapshot
+	 * creation, but GetSnapshotData() intentionally reads only PgXact, but not
+	 * PgProc. And we need info about originalXmin (see comment to csnXidMap)
+	 * which is stored in PgProc because of threats in comments around PgXact
+	 * about extending it with new fields. So just calculate oldest_xmin again,
+	 * that anyway happens quite rarely.
+	 */
+
+	/*
+	 * Don't afraid here because csn_snapshot_xmin will hold border of
+	 * minimal non-removable from vacuuming.
+	 */
+	ImportedXmin = MyProc->xmin;
+	MyProc->xmin = MyProc->originalXmin;
+	current_oldest_xmin = GetOldestNonRemovableTransactionId(NULL);
+	MyProc->xmin = ImportedXmin;
+	Assert(TransactionIdIsNormal(current_oldest_xmin));
+
+	previous_oldest_xmin = csnXidMap->xmin_by_second[csnXidMap->head];
+	Assert(TransactionIdIsNormal(previous_oldest_xmin) || !enable_csn_snapshot);
+
+	gap = csn_seconds - last_csn_seconds;
+	offset = csn_seconds % csnXidMap->size;
+
+	/* Sanity check before we update head and gap */
+	Assert( gap >= 1 );
+	Assert( (csnXidMap->head + gap) % csnXidMap->size == offset );
+
+	gap = gap > csnXidMap->size ? csnXidMap->size : gap;
+	csnXidMap->head = offset;
+
+	/* Fill new entry with current_oldest_xmin */
+	csnXidMap->xmin_by_second[offset] = current_oldest_xmin;
+
+	/*
+	 * If we have gap then fill it with previous_oldest_xmin for reasons
+	 * outlined in comment above this function.
+	 */
+	for (i = 1; i < gap; i++)
+	{
+		offset = (offset + csnXidMap->size - 1) % csnXidMap->size;
+		csnXidMap->xmin_by_second[offset] = previous_oldest_xmin;
+	}
+
+	oldest_deferred_xmin =
+		csnXidMap->xmin_by_second[ (csnXidMap->head + 1) % csnXidMap->size ];
+
+	LWLockRelease(CSNSnapshotXidMapLock);
+
+	elog(DEBUG5, "Advance xmin for CSN. Oldest deferred xmin = %u",
+		 oldest_deferred_xmin);
+
+	/*
+	 * Advance procArray->csn_snapshot_xmin after we released
+	 * CSNSnapshotXidMapLock. Since we gather not xmin but oldestXmin, it
+	 * never goes backwards regardless of how slow we can do that.
+	 */
+	/*Assert(TransactionIdFollowsOrEquals(oldest_deferred_xmin,
+										ProcArrayGetCSNSnapshotXmin()));*/
+	ProcArraySetCSNSnapshotXmin(oldest_deferred_xmin);
+}
+
+
+/*
+ * CSNSnapshotToXmin
+ *
+ * Get oldestXmin that took place when snapshot_csn was taken.
+ */
+TransactionId
+CSNSnapshotToXmin(SnapshotCSN snapshot_csn)
+{
+	TransactionId xmin;
+	SnapshotCSN csn_seconds;
+	volatile SnapshotCSN last_csn_seconds;
+
+	/* Callers should check config values */
+	Assert(csn_snapshot_defer_time > 0);
+	Assert(csnXidMap != NULL);
+
+	/* Round down to get conservative estimates */
+	csn_seconds = (snapshot_csn / NSECS_PER_SEC);
+
+	LWLockAcquire(CSNSnapshotXidMapLock, LW_SHARED);
+	last_csn_seconds = pg_atomic_read_u64(&csnXidMap->last_csn_seconds);
+	if (csn_seconds > last_csn_seconds)
+	{
+		/* we don't have entry for this snapshot_csn yet, return latest known */
+		xmin = csnXidMap->xmin_by_second[csnXidMap->head];
+	}
+	else if (last_csn_seconds - csn_seconds < csnXidMap->size)
+	{
+		/* we are good, retrieve value from our map */
+		Assert(last_csn_seconds % csnXidMap->size == csnXidMap->head);
+		xmin = csnXidMap->xmin_by_second[csn_seconds % csnXidMap->size];
+	}
+	else
+	{
+		/* requested snapshot_csn is too old, let caller know */
+		xmin = InvalidTransactionId;
+	}
+	LWLockRelease(CSNSnapshotXidMapLock);
+
+	return xmin;
+}
+
+/*
+ * CSNSnapshotPrepareCurrent
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+SnapshotCSN
+CSNSnapshotPrepareCurrent(void)
+{
+	TransactionId xid = GetCurrentTransactionIdIfAny();
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (TransactionIdIsValid(xid))
+	{
+		TransactionId *subxids;
+		int nsubxids = xactGetCommittedChildren(&subxids);
+		CSNLogSetCSN(xid, nsubxids, subxids, InDoubtCSN, true);
+	}
+
+	/* Nothing to write if we don't have xid */
+
+	return GenerateCSN(false, InvalidCSN);
+}
+
+
+/*
+ * CSNSnapshotAssignCurrent
+ *
+ * Assign SnapshotCSN to the currently active transaction. SnapshotCSN is supposedly
+ * maximal among of values returned by CSNSnapshotPrepareCurrent and
+ * pg_csn_snapshot_prepare.
+ */
+void
+CSNSnapshotAssignCurrent(SnapshotCSN snapshot_csn)
+{
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (!CSNIsNormal(snapshot_csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_csn_snapshot_assign expects normal snapshot_csn")));
+
+	Assert(snapshot_csn != InvalidCSN);
+	/* We do not care the Generate result, we just want to make sure max
+	 * csnShared->last_max_csn value.
+	 */
+	GenerateCSN(false, snapshot_csn);
+
+	/* Set csn and defuse ProcArrayEndTransaction from assigning one */
+	pg_atomic_write_u64(&MyProc->assignedCSN, snapshot_csn);
+}
+
+/*
+ * CSNSnapshotSync
+ *
+ * Due to time desynchronization on different nodes we can receive snapshot_csn
+ * which is greater than snapshot_csn on this node. To preserve proper isolation
+ * this node needs to wait when such snapshot_csn comes on local clock.
+ *
+ * This should happend relatively rare if nodes have running NTP/PTP/etc.
+ * Complain if wait time is more than SNAP_SYNC_COMPLAIN.
+ */
+void
+CSNSnapshotSync(SnapshotCSN remote_csn)
+{
+	SnapshotCSN	local_csn;
+	SnapshotCSN	delta;
+
+	Assert(enable_csn_snapshot);
+
+	for(;;)
+	{
+		if (GetLastGeneratedCSN() > remote_csn)
+			return;
+
+		local_csn = GenerateCSN(true, InvalidCSN);
+
+		if (local_csn >= remote_csn)
+			/*
+			 * Everything is fine too, but last_max_csn wasn't updated for
+			 * some time.
+			 */
+			return;
+
+		/* Okay we need to sleep now */
+		delta = remote_csn - local_csn;
+		if (delta > SNAP_DESYNC_COMPLAIN)
+			ereport(WARNING,
+				(errmsg("remote global snapshot exceeds ours by more than a second"),
+				 errhint("Consider running NTPd on servers participating in global transaction")));
+
+		/* TODO: report this sleeptime somewhere? */
+		pg_usleep((long) (delta/NSECS_PER_USEC));
+
+		/*
+		 * Loop that checks to ensure that we actually slept for specified
+		 * amount of time.
+		 */
+	}
+
+	Assert(false); /* Should not happend */
+	return;
+}
+
+/*
+ * TransactionIdGetCSN
+ *
+ * Get CSN for specified TransactionId taking care about special xids,
+ * xids beyond TransactionXmin and InDoubt states.
+ */
+CSN
+TransactionIdGetCSN(TransactionId xid)
+{
+	CSN csn;
+
+	/* Handle permanent TransactionId's for which we don't have mapping */
+	if (!TransactionIdIsNormal(xid))
+	{
+		if (xid == InvalidTransactionId)
+			return AbortedCSN;
+		if (xid == FrozenTransactionId || xid == BootstrapTransactionId)
+			return FrozenCSN;
+		Assert(false); /* Should not happend */
+	}
+
+	/*
+	 * If we just switch a xid-snapsot to a csn_snapshot, we should handle a start
+	 * xid for csn base check. Just in case we have prepared transaction which
+	 * hold the TransactionXmin but without CSN.
+	 */
+	xmin_for_csn = GetOldestXmin();
+
+	/*
+	 * For the xid with 'xid >= TransactionXmin and xid < xmin_for_csn',
+	 * it defined as unclear csn which follow xid-snapshot result.
+	 */
+	if(!TransactionIdPrecedes(xid, TransactionXmin) &&
+							TransactionIdPrecedes(xid, xmin_for_csn))
+	{
+		elog(LOG, "UnclearCSN was returned. xid=%u, TransactionXmin=%u, xmin_for_csn=%u",
+			xid, TransactionXmin, xmin_for_csn);
+		return UnclearCSN;
+	}
+	/*
+	 * For xids which less then TransactionXmin CSNLog can be already
+	 * trimmed but we know that such transaction is definitely not concurrently
+	 * running according to any snapshot including timetravel ones. Callers
+	 * should check TransactionDidCommit after.
+	 */
+	if (TransactionIdPrecedes(xid, TransactionXmin))
+		return FrozenCSN;
+
+	/* Read CSN from SLRU */
+	csn = CSNLogGetCSNByXid(xid);
+
+	/*
+	 * If we faced InDoubt state then transaction is being committed and we
+	 * should wait until CSN will be assigned so that visibility check
+	 * could decide whether tuple is in snapshot. See also comments in
+	 * CSNSnapshotPrecommit().
+	 */
+	if (CSNIsInDoubt(csn))
+	{
+		XactLockTableWait(SubTransGetTopmostTransaction(xid), NULL, NULL, XLTW_None);
+		csn = CSNLogGetCSNByXid(xid);
+		Assert(CSNIsNormal(csn) || CSNIsAborted(csn));
+	}
+
+	Assert(CSNIsNormal(csn) || CSNIsInProgress(csn) || CSNIsAborted(csn));
+	return csn;
+}
+
+/*
+ * XidInCSNSnapshot
+ *
+ * Version of XidInMVCCSnapshot for transactions. For non-imported
+ * csn snapshots this should give same results as XidInLocalMVCCSnapshot
+ * (except that aborts will be shown as invisible without going to clog) and to
+ * ensure such behaviour XidInMVCCSnapshot is coated with asserts that checks
+ * identicalness of XidInCSNSnapshot/XidInLocalMVCCSnapshot in
+ * case of ordinary snapshot.
+ */
+bool
+XidInCSNSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	CSN csn;
+
+	csn = TransactionIdGetCSN(xid);
+
+	if (CSNIsNormal(csn))
+		return (csn >= snapshot->snapshot_csn);
+	else if (CSNIsFrozen(csn))
+	{
+		/* It is bootstrap or frozen transaction */
+		return false;
+	}
+	else if(CSNIsUnclear(csn))
+	{
+		/*
+		 * Some xid can not figure out csn because of snapshot switch,
+		 * and we can follow xid-base result.
+		 */
+		return true;
+	}
+	else
+	{
+		/* It is aborted or in-progress */
+		Assert(CSNIsAborted(csn) || CSNIsInProgress(csn));
+		if (CSNIsAborted(csn))
+			Assert(TransactionIdDidAbort(xid));
+		return true;
+	}
+}
+
+
+/*****************************************************************************
+ * Functions to handle transactions commit.
+ *
+ * For local transactions CSNSnapshotPrecommit sets InDoubt state before
+ * ProcArrayEndTransaction is called and transaction data potetntially becomes
+ * visible to other backends. ProcArrayEndTransaction (or ProcArrayRemove in
+ * twophase case) then acquires csn under ProcArray lock and stores it
+ * in proc->assignedCSN. It's important that csn for commit is
+ * generated under ProcArray lock, otherwise snapshots won't
+ * be equivalent. Consequent call to CSNSnapshotCommit will write
+ * proc->assignedCSN to CSNLog.
+ *
+ *
+ * CSNSnapshotAbort is slightly different comparing to commit because abort
+ * can skip InDoubt phase and can be called for transaction subtree.
+ *****************************************************************************/
+
+
+/*
+ * CSNSnapshotAbort
+ *
+ * Abort transaction in CsnLog. We can skip InDoubt state for aborts
+ * since no concurrent transactions allowed to see aborted data anyway.
+ */
+void
+CSNSnapshotAbort(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	if (!get_csnlog_status())
+		return;
+
+	CSNLogSetCSN(xid, nsubxids, subxids, AbortedCSN, true);
+
+	/*
+	 * Clean assignedCSN anyway, as it was possibly set in
+	 * XidSnapshotAssignCsnCurrent.
+	 */
+	pg_atomic_write_u64(&proc->assignedCSN, InProgressCSN);
+}
+
+/*
+ * CSNSnapshotPrecommit
+ *
+ * Set InDoubt status for local transaction that we are going to commit.
+ * This step is needed to achieve consistency between local snapshots and
+ * csn-based snapshots. We don't hold ProcArray lock while writing
+ * csn for transaction in SLRU but instead we set InDoubt status before
+ * transaction is deleted from ProcArray so the readers who will read csn
+ * in the gap between ProcArray removal and CSN assignment can wait
+ * until CSN is finally assigned. See also TransactionIdGetCSN().
+ *
+ * This should be called only from parallel group leader before backend is
+ * deleted from ProcArray.
+ */
+void
+CSNSnapshotPrecommit(PGPROC *proc, TransactionId xid,
+					int nsubxids, TransactionId *subxids)
+{
+	CSN oldassignedCSN = InProgressCSN;
+	bool in_progress;
+
+	if (!get_csnlog_status())
+		return;
+
+	/* Set InDoubt status if it is local transaction */
+	in_progress = pg_atomic_compare_exchange_u64(&proc->assignedCSN,
+												 &oldassignedCSN,
+												 InDoubtCSN);
+	if (in_progress)
+	{
+		Assert(CSNIsInProgress(oldassignedCSN));
+		CSNLogSetCSN(xid, nsubxids, subxids, InDoubtCSN, true);
+	}
+	else
+	{
+		/* Otherwise we should have valid CSN by this time */
+		Assert(CSNIsNormal(oldassignedCSN));
+		Assert(CSNIsInDoubt(CSNLogGetCSNByXid(xid)));
+	}
+}
+
+/*
+ * CSNSnapshotCommit
+ *
+ * Write CSN that were acquired earlier to CsnLog. Should be
+ * preceded by CSNSnapshotPrecommit() so readers can wait until we finally
+ * finished writing to SLRU.
+ *
+ * Should be called after ProcArrayEndTransaction, but before releasing
+ * transaction locks, so that TransactionIdGetCSN can wait on this
+ * lock for CSN.
+ */
+void
+CSNSnapshotCommit(PGPROC *proc, TransactionId xid,
+				  int nsubxids, TransactionId *subxids)
+{
+	volatile CSN assignedCSN;
+
+	if (!get_csnlog_status())
+		return;
+
+	if (!TransactionIdIsValid(xid))
+	{
+		assignedCSN = pg_atomic_read_u64(&proc->assignedCSN);
+		Assert(CSNIsInProgress(assignedCSN));
+		return;
+	}
+
+	/* Finally write resulting CSN in SLRU */
+	assignedCSN = pg_atomic_read_u64(&proc->assignedCSN);
+	Assert(CSNIsNormal(assignedCSN));
+	CSNLogSetCSN(xid, nsubxids, subxids, assignedCSN, true);
+
+	/* Reset for next transaction */
+	pg_atomic_write_u64(&proc->assignedCSN, InProgressCSN);
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 58091f6b52..b86c172e46 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -10,6 +10,7 @@
 #include "access/brin_xlog.h"
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/generic_xlog.h"
 #include "access/ginxlog.h"
 #include "access/gistxlog.h"
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 28b153abc3..7bc6aae9a4 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_snapshot.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
@@ -1536,8 +1537,34 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 									   hdr->nabortrels, abortrels,
 									   gid);
 
+	/*
+	 * CSNSnapshot callbacks that should be called right before we are
+	 * going to become visible. Details in comments to this functions.
+	 */
+	if (isCommit)
+		CSNSnapshotPrecommit(proc, xid, hdr->nsubxacts, children);
+	else
+		CSNSnapshotAbort(proc, xid, hdr->nsubxacts, children);
+
+
 	ProcArrayRemove(proc, latestXid);
 
+	/*
+	 * Stamp our transaction with CSN in CSNLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks, since TransactionIdGetCSN relies on
+	 * XactLockTableWait to await csn.
+	 */
+	if (isCommit)
+	{
+		CSNSnapshotCommit(proc, xid, hdr->nsubxacts, children);
+	}
+	else
+	{
+		Assert(CSNIsInProgress(
+				   pg_atomic_read_u64(&proc->assignedCSN)));
+	}
+
 	/*
 	 * In case we fail while running the callbacks, mark the gxact invalid so
 	 * no one else will try to commit/rollback, and so it will be recycled if
@@ -2583,3 +2610,130 @@ LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn,
 	LWLockRelease(TwoPhaseStateLock);
 	return found;
 }
+
+/*
+ * CSNSnapshotPrepareTwophase
+ *
+ * Set InDoubt state for currently active transaction and return commit's
+ * global snapshot.
+ */
+static SnapshotCSN
+CSNSnapshotPrepareTwophase(const char *gid)
+{
+	GlobalTransaction	gxact;
+	PGPROC				*proc;
+	char				*buf;
+	TransactionId		xid;
+	xl_xact_parsed_prepare parsed;
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+	xid = proc->xid;
+
+	if (gxact->ondisk)
+		buf = ReadTwoPhaseFile(xid, true);
+	else
+		XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
+
+	ParsePrepareRecord(0, (xl_xact_prepare *)buf, &parsed);
+
+	CSNLogSetCSN(xid, parsed.nsubxacts,
+					parsed.subxacts, InDoubtCSN, true);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+
+	pfree(buf);
+	return GenerateCSN(false, InvalidCSN);
+}
+
+/*
+ * CSNSnapshotAssignTwoPhase
+ *
+ * Asign SnapshotCSN for currently active transaction. SnapshotCSN is supposedly
+ * maximal among of values returned by CSNSnapshotPrepareCurrent and
+ * pg_csn_snapshot_prepare.
+ *
+ * This function is a counterpart of CSNSnapshotAssignCurrent() for
+ * twophase transactions.
+ */
+static void
+CSNSnapshotAssignTwoPhase(const char *gid, SnapshotCSN csn)
+{
+	GlobalTransaction gxact;
+	PGPROC	   *proc;
+
+	if (!enable_csn_snapshot)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				errmsg("could not prepare transaction for global commit"),
+				errhint("Make sure the configuration parameter \"%s\" is enabled.",
+						"enable_csn_snapshot")));
+
+	if (!CSNIsNormal(csn))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pg_csn_snapshot_assign expects normal snapshot_csn")));
+
+	/*
+	 * Validate the GID, and lock the GXACT to ensure that two backends do not
+	 * try to access the same GID at once.
+	 */
+	gxact = LockGXact(gid, GetUserId());
+	proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+	Assert(csn != InvalidCSN);
+	/* We do not care the Generate result, we just want to make sure max
+	 * csnShared->last_max_csn value.
+	 */
+	GenerateCSN(false, csn);
+	/* Set snapshot_csn and defuse ProcArrayRemove from assigning one. */
+	pg_atomic_write_u64(&proc->assignedCSN, csn);
+
+	/* Unlock our GXACT */
+	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+	gxact->locking_backend = InvalidBackendId;
+	LWLockRelease(TwoPhaseStateLock);
+}
+
+/*
+ * SQL interface to CSNSnapshotPrepareTwophase()
+ *
+ * TODO: Rewrite this as PREPARE TRANSACTION 'gid' RETURNING SNAPSHOT
+ */
+Datum
+pg_csn_snapshot_prepare(PG_FUNCTION_ARGS)
+{
+	const char 	*gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	SnapshotCSN	csn = CSNSnapshotPrepareTwophase(gid);
+
+	PG_RETURN_INT64(csn);
+}
+
+/*
+ * SQL interface to CSNSnapshotAssignTwoPhase()
+ *
+ * TODO: Rewrite this as COMMIT PREPARED 'gid' SNAPSHOT 'csn'
+ */
+Datum
+pg_csn_snapshot_assign(PG_FUNCTION_ARGS)
+{
+	const char *gid = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	SnapshotCSN	csn = PG_GETARG_INT64(1);
+
+	CSNSnapshotAssignTwoPhase(gid, csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index a6e98e71bd..8e1d074806 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -15,6 +15,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -175,6 +176,7 @@ GetNewTransactionId(bool isSubXact)
 	 * Extend pg_subtrans and pg_commit_ts too.
 	 */
 	ExtendCLOG(xid);
+	ExtendCSNLog(xid);
 	ExtendCommitTs(xid);
 	ExtendSUBTRANS(xid);
 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 8e35c432f5..e6baf880d9 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_snapshot.h"
 #include "access/multixact.h"
 #include "access/parallel.h"
 #include "access/subtrans.h"
@@ -1418,6 +1419,12 @@ RecordTransactionCommit(void)
 		TransactionTreeSetCommitTsData(xid, nchildren, children,
 									   replorigin_session_origin_timestamp,
 									   replorigin_session_origin);
+
+		/*
+		 * Mark our transaction as InDoubt in CsnLog and get ready for
+		 * commit.
+		 */
+		CSNSnapshotPrecommit(MyProc, xid, nchildren, children);
 	}
 
 	/*
@@ -1772,6 +1779,9 @@ RecordTransactionAbort(bool isSubXact)
 	 */
 	TransactionIdAbortTree(xid, nchildren, children);
 
+	/* Mark our transaction as Aborted in CSN Log. */
+	CSNSnapshotAbort(MyProc, xid, nchildren, children);
+
 	END_CRIT_SECTION();
 
 	/* Compute latestXid while we have the child XIDs handy */
@@ -2114,6 +2124,13 @@ StartTransaction(void)
 	ShowTransactionState("StartTransaction");
 }
 
+Datum
+pg_current_csn(PG_FUNCTION_ARGS)
+{
+	SnapshotCSN	csn = GenerateCSN(false, InvalidCSN);
+
+	PG_RETURN_INT64(csn);
+}
 
 /*
  *	CommitTransaction
@@ -2262,6 +2279,21 @@ CommitTransaction(void)
 	 */
 	ProcArrayEndTransaction(MyProc, latestXid);
 
+	/*
+	 * Stamp our transaction with CSN in CsnLog.
+	 * Should be called after ProcArrayEndTransaction, but before releasing
+	 * transaction locks.
+	 */
+	if (!is_parallel_worker)
+	{
+		TransactionId xid = GetTopTransactionIdIfAny();
+		TransactionId *subxids;
+		int nsubxids;
+
+		nsubxids = xactGetCommittedChildren(&subxids);
+		CSNSnapshotCommit(MyProc, xid, nsubxids, subxids);
+	}
+
 	/*
 	 * This is all post-commit cleanup.  Note that if an error is raised here,
 	 * it's too late to abort the transaction.  This should be just
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1616448368..2a8de10038 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -24,6 +24,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/heaptoast.h"
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
@@ -4747,6 +4748,7 @@ InitControlFile(uint64 sysidentifier)
 	ControlFile->wal_level = wal_level;
 	ControlFile->wal_log_hints = wal_log_hints;
 	ControlFile->track_commit_timestamp = track_commit_timestamp;
+	ControlFile->enable_csn_snapshot = enable_csn_snapshot;
 	ControlFile->data_checksum_version = bootstrap_data_checksum_version;
 }
 
@@ -7181,6 +7183,9 @@ StartupXLOG(void)
 	if (ControlFile->track_commit_timestamp)
 		StartupCommitTs();
 
+	if(ControlFile->enable_csn_snapshot)
+		StartupCSN();
+
 	/*
 	 * Recover knowledge about replay progress of known replication partners.
 	 */
@@ -7448,6 +7453,8 @@ StartupXLOG(void)
 			 */
 			StartupSUBTRANS(oldestActiveXID);
 
+			CSNSnapshotStartup(oldestActiveXID);
+
 			/*
 			 * If we're beginning at a shutdown checkpoint, we know that
 			 * nothing was running on the primary at this point. So fake-up an
@@ -8117,7 +8124,10 @@ StartupXLOG(void)
 	 * timestamps are started below, if necessary.)
 	 */
 	if (standbyState == STANDBY_DISABLED)
+	{
 		StartupSUBTRANS(oldestActiveXID);
+		CSNSnapshotStartup(oldestActiveXID);
+	}
 
 	/*
 	 * Perform end of recovery actions for any SLRUs that need it.
@@ -8183,6 +8193,7 @@ StartupXLOG(void)
 	 * commit timestamp.
 	 */
 	CompleteCommitTsInitialization();
+	CompleteCSNInitialization();
 
 	/*
 	 * All done with end-of-recovery actions.
@@ -9616,6 +9627,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
 	CheckPointCLOG();
+	CheckPointCSNLog();
 	CheckPointCommitTs();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
@@ -9894,7 +9906,10 @@ CreateRestartPoint(int flags)
 	 * this because StartupSUBTRANS hasn't been called yet.
 	 */
 	if (EnableHotStandby)
+	{
 		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
+		TruncateCSNLog(GetOldestTransactionIdConsideredRunning());
+	}
 
 	/* Real work is done; log and update stats. */
 	LogCheckpointEnd(true);
@@ -10172,7 +10187,8 @@ XLogReportParameters(void)
 		max_wal_senders != ControlFile->max_wal_senders ||
 		max_prepared_xacts != ControlFile->max_prepared_xacts ||
 		max_locks_per_xact != ControlFile->max_locks_per_xact ||
-		track_commit_timestamp != ControlFile->track_commit_timestamp)
+		track_commit_timestamp != ControlFile->track_commit_timestamp ||
+		enable_csn_snapshot != ControlFile->enable_csn_snapshot)
 	{
 		/*
 		 * The change in number of backend slots doesn't need to be WAL-logged
@@ -10194,6 +10210,7 @@ XLogReportParameters(void)
 			xlrec.wal_level = wal_level;
 			xlrec.wal_log_hints = wal_log_hints;
 			xlrec.track_commit_timestamp = track_commit_timestamp;
+			xlrec.enable_csn_snapshot = enable_csn_snapshot;
 
 			XLogBeginInsert();
 			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
@@ -10212,6 +10229,7 @@ XLogReportParameters(void)
 		ControlFile->wal_level = wal_level;
 		ControlFile->wal_log_hints = wal_log_hints;
 		ControlFile->track_commit_timestamp = track_commit_timestamp;
+		ControlFile->enable_csn_snapshot = enable_csn_snapshot;
 		UpdateControlFile();
 
 		LWLockRelease(ControlFileLock);
@@ -10665,6 +10683,9 @@ xlog_redo(XLogReaderState *record)
 		CommitTsParameterChange(xlrec.track_commit_timestamp,
 								ControlFile->track_commit_timestamp);
 		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
+		CSNlogParameterChange(xlrec.enable_csn_snapshot,
+								ControlFile->enable_csn_snapshot);
+		ControlFile->enable_csn_snapshot = xlrec.enable_csn_snapshot;
 
 		UpdateControlFile();
 		LWLockRelease(ControlFileLock);
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 689384a411..e6585a94ba 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -73,6 +73,8 @@ typedef struct
 	char		compressed_page[COMPRESS_BUFSIZE];
 } registered_buffer;
 
+bool enable_csn_wal = true;
+
 static registered_buffer *registered_buffers;
 static int	max_registered_buffers; /* allocated size */
 static int	max_registered_block_id = 0;	/* highest block_id + 1 currently
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 5c4bc15b44..e64ada86c7 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -53,7 +53,7 @@
 #include "utils/memutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
-
+#include "access/csn_log.h"
 
 /*
  * GUC parameters
@@ -1760,6 +1760,7 @@ vac_truncate_clog(TransactionId frozenXID,
 	 */
 	TruncateCLOG(frozenXID, oldestxid_datoid);
 	TruncateCommitTs(frozenXID);
+	TruncateCSNLog(frozenXID);
 	TruncateMultiXact(minMulti, minmulti_datoid);
 
 	/*
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index dbdc172a2b..214921dc9f 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -120,6 +120,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/csn_log.h"
 #include "access/heapam_xlog.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -489,6 +490,9 @@ SnapBuildBuildSnapshot(SnapBuild *builder)
 	snapshot->xmin = builder->xmin;
 	snapshot->xmax = builder->xmax;
 
+	snapshot->snapshot_csn = FrozenCSN;
+	snapshot->imported_csn = false;
+
 	/* store all transactions to be treated as committed by this snapshot */
 	snapshot->xip =
 		(TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 9fa3e0631e..2a7e184da9 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -16,6 +16,8 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
+#include "access/csn_snapshot.h"
 #include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -120,6 +122,8 @@ CalculateShmemSize(int *num_semaphores)
 	size = add_size(size, ProcGlobalShmemSize());
 	size = add_size(size, XLOGShmemSize());
 	size = add_size(size, CLOGShmemSize());
+	size = add_size(size, CSNLogShmemSize());
+	size = add_size(size, CSNSnapshotShmemSize());
 	size = add_size(size, CommitTsShmemSize());
 	size = add_size(size, SUBTRANSShmemSize());
 	size = add_size(size, TwoPhaseShmemSize());
@@ -242,6 +246,8 @@ CreateSharedMemoryAndSemaphores(void)
 	 */
 	XLOGShmemInit();
 	CLOGShmemInit();
+	CSNLogShmemInit();
+	CSNSnapshotShmemInit();
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 892f0f6799..5bc7370c73 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -48,6 +48,7 @@
 #include <signal.h>
 
 #include "access/clog.h"
+#include "access/csn_snapshot.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -96,6 +97,8 @@ typedef struct ProcArrayStruct
 	TransactionId replication_slot_xmin;
 	/* oldest catalog xmin of any replication slot */
 	TransactionId replication_slot_catalog_xmin;
+	/* xmin of oldest active csn snapshot */
+	TransactionId csn_snapshot_xmin;
 
 	/* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */
 	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
@@ -429,6 +432,7 @@ CreateSharedProcArray(void)
 		procArray->lastOverflowedXid = InvalidTransactionId;
 		procArray->replication_slot_xmin = InvalidTransactionId;
 		procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+		procArray->csn_snapshot_xmin = InvalidTransactionId;
 		ShmemVariableCache->xactCompletionCount = 1;
 	}
 
@@ -577,6 +581,14 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 		/* Advance global latestCompletedXid while holding the lock */
 		MaintainLatestCompletedXid(latestXid);
 
+		/*
+		 * Assign xid csn while holding ProcArrayLock for non-distributed
+		 * COMMIT PREPARED. After lock is released consequent
+		 * CSNSnapshotCommit() will write this value to CsnLog.
+		 */
+		if (CSNIsInDoubt(pg_atomic_read_u64(&proc->assignedCSN)))
+			pg_atomic_write_u64(&proc->assignedCSN, GenerateCSN(false, InvalidCSN));
+
 		/* Same with xactCompletionCount  */
 		ShmemVariableCache->xactCompletionCount++;
 
@@ -691,6 +703,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 		proc->xmin = InvalidTransactionId;
 		proc->delayChkpt = false;	/* be sure this is cleared in abort */
 		proc->recoveryConflictPending = false;
+		proc->originalXmin = InvalidTransactionId;
 
 		/* must be cleared with xid/xmin: */
 		/* avoid unnecessarily dirtying shared cachelines */
@@ -730,6 +743,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	proc->xmin = InvalidTransactionId;
 	proc->delayChkpt = false;	/* be sure this is cleared in abort */
 	proc->recoveryConflictPending = false;
+	proc->originalXmin = InvalidTransactionId;
 
 	/* must be cleared with xid/xmin: */
 	/* avoid unnecessarily dirtying shared cachelines */
@@ -753,6 +767,16 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
 	/* Also advance global latestCompletedXid while holding the lock */
 	MaintainLatestCompletedXid(latestXid);
 
+	/*
+	 * Assign xid csn while holding ProcArrayLock for
+	 * COMMIT.
+	 *
+	 * TODO: in case of group commit we can generate one CSNSnapshot for
+	 * whole group to save time on timestamp aquisition.
+	 */
+	if (CSNIsInDoubt(pg_atomic_read_u64(&proc->assignedCSN)))
+		pg_atomic_write_u64(&proc->assignedCSN, GenerateCSN(false, InvalidCSN));
+
 	/* Same with xactCompletionCount  */
 	ShmemVariableCache->xactCompletionCount++;
 }
@@ -912,6 +936,7 @@ ProcArrayClearTransaction(PGPROC *proc)
 	proc->lxid = InvalidLocalTransactionId;
 	proc->xmin = InvalidTransactionId;
 	proc->recoveryConflictPending = false;
+	proc->originalXmin = InvalidTransactionId;
 
 	Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK));
 	Assert(!proc->delayChkpt);
@@ -1204,6 +1229,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
 	{
 		ExtendSUBTRANS(latestObservedXid);
+		ExtendCSNLog(latestObservedXid);
 		TransactionIdAdvance(latestObservedXid);
 	}
 	TransactionIdRetreat(latestObservedXid);	/* = running->nextXid - 1 */
@@ -1704,6 +1730,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	TransactionId kaxmin;
 	bool		in_recovery = RecoveryInProgress();
 	TransactionId *other_xids = ProcGlobal->xids;
+	TransactionId csn_snapshot_xmin = InvalidTransactionId;
 
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
@@ -1843,6 +1870,10 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	if (in_recovery)
 		kaxmin = KnownAssignedXidsGetOldestXmin();
 
+	/* Get value of xmin, delayed by a CSN snapshot settings. */
+	if (get_csnlog_status() && csn_snapshot_defer_time > 0 && IsUnderPostmaster)
+		csn_snapshot_xmin = ProcArrayGetCSNSnapshotXmin();
+
 	/*
 	 * No other information from shared state is needed, release the lock
 	 * immediately. The rest of the computations can be done without a lock.
@@ -1899,6 +1930,15 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	h->data_oldest_nonremovable =
 		TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin);
 
+	/*
+	 * Hold non-removable border because distributed transactions
+	 * can wish to see old data.
+	 */
+	h->shared_oldest_nonremovable =
+		TransactionIdOlder(h->shared_oldest_nonremovable, csn_snapshot_xmin);
+	h->data_oldest_nonremovable =
+		TransactionIdOlder(h->data_oldest_nonremovable, csn_snapshot_xmin);
+
 	/*
 	 * The only difference between catalog / data horizons is that the slot's
 	 * catalog xmin is applied to the catalog one (so catalogs can be accessed
@@ -2133,6 +2173,9 @@ GetSnapshotDataReuse(Snapshot snapshot)
 	if (curXactCompletionCount != snapshot->snapXactCompletionCount)
 		return false;
 
+	if (get_csnlog_status())
+		return false;
+
 	/*
 	 * If the current xactCompletionCount is still the same as it was at the
 	 * time the snapshot was built, we can be sure that rebuilding the
@@ -2212,6 +2255,8 @@ GetSnapshotData(Snapshot snapshot)
 	int			count = 0;
 	int			subcount = 0;
 	bool		suboverflowed = false;
+	CSN			csn = FrozenCSN;
+	TransactionId csn_snapshot_xmin = InvalidTransactionId;
 	FullTransactionId latest_completed;
 	TransactionId oldestxid;
 	int			mypgxactoff;
@@ -2444,6 +2489,20 @@ GetSnapshotData(Snapshot snapshot)
 	if (!TransactionIdIsValid(MyProc->xmin))
 		MyProc->xmin = TransactionXmin = xmin;
 
+	/* Take CSN under ProcArrayLock so the snapshot stays synchronized. */
+	if (!snapshot->takenDuringRecovery && get_csnlog_status())
+		csn = GenerateCSN(false, InvalidCSN);
+
+	if (get_csnlog_status() && csn_snapshot_defer_time > 0 && IsUnderPostmaster)
+	{
+		CSNSnapshotMapXmin(snapshot->snapshot_csn);
+
+		/* Get value of xmin, delayed by a CSN snapshot settings. */
+		csn_snapshot_xmin = ProcArrayGetCSNSnapshotXmin();
+		/* Adjust an oldest xid value with a xmin, delayed by CSN options. */
+		oldestxid = TransactionIdOlder(oldestxid, csn_snapshot_xmin);
+	}
+
 	LWLockRelease(ProcArrayLock);
 
 	/* maintain state for GlobalVis* */
@@ -2469,6 +2528,10 @@ GetSnapshotData(Snapshot snapshot)
 		def_vis_xid_data =
 			TransactionIdOlder(def_vis_xid_data, replication_slot_xmin);
 
+		/* The csn-related settings can require an older xmin. */
+		def_vis_xid_data =
+			TransactionIdOlder(def_vis_xid_data, csn_snapshot_xmin);
+
 		/*
 		 * Rows in non-shared, non-catalog tables possibly could be vacuumed
 		 * if older than this xid.
@@ -2549,6 +2612,8 @@ GetSnapshotData(Snapshot snapshot)
 	snapshot->active_count = 0;
 	snapshot->regd_count = 0;
 	snapshot->copied = false;
+	snapshot->imported_csn = false;
+	snapshot->snapshot_csn = csn;
 
 	GetSnapshotDataInitOldSnapshot(snapshot);
 
@@ -3901,6 +3966,25 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 	LWLockRelease(ProcArrayLock);
 }
 
+/*
+ * ProcArraySetCSNSnapshotXmin
+ */
+void
+ProcArraySetCSNSnapshotXmin(TransactionId xmin)
+{
+	/* We rely on atomic fetch/store of xid */
+	procArray->csn_snapshot_xmin = xmin;
+}
+
+/*
+ * ProcArrayGetCSNSnapshotXmin
+ */
+TransactionId
+ProcArrayGetCSNSnapshotXmin(void)
+{
+	return procArray->csn_snapshot_xmin;
+}
+
 /*
  * XidCacheRemoveRunningXids
  *
@@ -4383,6 +4467,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
 		while (TransactionIdPrecedes(next_expected_xid, xid))
 		{
 			TransactionIdAdvance(next_expected_xid);
+			ExtendCSNLog(next_expected_xid);
 			ExtendSUBTRANS(next_expected_xid);
 		}
 		Assert(next_expected_xid == xid);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 862097352b..1f78161d9a 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -135,6 +135,8 @@ static const char *const BuiltinTrancheNames[] = {
 	"CommitTSBuffer",
 	/* LWTRANCHE_SUBTRANS_BUFFER: */
 	"SubtransBuffer",
+	/* LWTRANCHE_CSN_LOG_BUFFERS */
+	"CSNLogBuffer",
 	/* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
 	"MultiXactOffsetBuffer",
 	/* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c295..e8ca393611 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,5 @@ XactTruncationLock					44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
+CSNLogSLRULock				    	48
+CSNSnapshotXidMapLock			    49
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index b7d9da0aa9..88f4f42456 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -35,9 +35,11 @@
 #include <unistd.h>
 #include <sys/time.h>
 
+#include "access/csn_snapshot.h"
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xlogutils.h"
+#include "access/xact.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
@@ -440,6 +442,9 @@ InitProcess(void)
 	MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
 	Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
 
+	MyProc->originalXmin = InvalidTransactionId;
+	pg_atomic_init_u64(&MyProc->assignedCSN, InProgressCSN);
+
 	/*
 	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
 	 * on it.  That allows us to repoint the process latch, which so far
@@ -585,6 +590,7 @@ InitAuxiliaryProcess(void)
 	MyProc->lwWaitMode = 0;
 	MyProc->waitLock = NULL;
 	MyProc->waitProcLock = NULL;
+	MyProc->originalXmin = InvalidTransactionId;
 	pg_atomic_write_u64(&MyProc->waitStart, 0);
 #ifdef USE_ASSERT_CHECKING
 	{
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index d4083e8a56..383f1e4566 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -20,6 +20,7 @@
 
 #include "access/commit_ts.h"
 #include "access/clog.h"
+#include "access/csn_log.h"
 #include "access/multixact.h"
 #include "access/xlog.h"
 #include "access/xlogutils.h"
@@ -119,6 +120,10 @@ static const SyncOps syncsw[] = {
 	/* pg_multixact/members */
 	[SYNC_HANDLER_MULTIXACT_MEMBER] = {
 		.sync_syncfiletag = multixactmemberssyncfiletag
+	},
+	/* pg_multixact/members */
+	[SYNC_HANDLER_CSN] = {
+		.sync_syncfiletag = csnsyncfiletag
 	}
 };
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index e91d5a3cfd..4d9833fb5f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -34,6 +34,7 @@
 
 #include "access/commit_ts.h"
 #include "access/gin.h"
+#include "access/csn_snapshot.h"
 #include "access/rmgr.h"
 #include "access/tableam.h"
 #include "access/toast_compression.h"
@@ -1212,6 +1213,24 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_csn_snapshot", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable csn-base snapshot."),
+			gettext_noop("Used to achieve REPEATABLE READ isolation level for postgres_fdw transactions.")
+		},
+		&enable_csn_snapshot,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"enable_csn_wal", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Enable csn-wal record."),
+			gettext_noop("Used to enable csn-wal record")
+		},
+		&enable_csn_wal,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"ssl", PGC_SIGHUP, CONN_AUTH_SSL,
 			gettext_noop("Enables SSL connections."),
@@ -3195,6 +3214,24 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"csn_snapshot_defer_time", PGC_POSTMASTER, REPLICATION_PRIMARY,
+			gettext_noop("Minimal age of records which allowed to be vacuumed, in seconds."),
+			NULL
+		},
+		&csn_snapshot_defer_time,
+		0, 0, INT_MAX,
+		NULL, NULL, NULL
+	},
+	{
+		{"csn_time_shift", PGC_USERSET, RESOURCES_MEM,
+			gettext_noop("Do the time shift in the CSN generator."),
+			gettext_noop("Used for debug purposes.")
+		},
+		&csn_time_shift,
+		0, INT_MIN, INT_MAX,
+		NULL, NULL, NULL
+	},
 	{
 		{"block_size", PGC_INTERNAL, PRESET_OPTIONS,
 			gettext_noop("Shows the size of a disk block."),
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d
index b0c50a3c7f..3fcd0f4ccf 100644
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -77,6 +77,8 @@ provider postgresql {
 	probe clog__checkpoint__done(bool);
 	probe subtrans__checkpoint__start(bool);
 	probe subtrans__checkpoint__done(bool);
+	probe csnlog__checkpoint__start(bool);
+	probe csnlog__checkpoint__done(bool);
 	probe multixact__checkpoint__start(bool);
 	probe multixact__checkpoint__done(bool);
 	probe twophase__checkpoint__start();
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 5001efdf7a..5a52fce2ed 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -48,6 +48,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -77,6 +78,8 @@
  */
 int			old_snapshot_threshold; /* number of minutes, -1 disables */
 
+bool		enable_csn_snapshot;
+
 volatile OldSnapshotControlData *oldSnapshotControl;
 
 
@@ -173,6 +176,7 @@ static TimestampTz AlignTimestampToMinuteBoundary(TimestampTz ts);
 static Snapshot CopySnapshot(Snapshot snapshot);
 static void FreeSnapshot(Snapshot snapshot);
 static void SnapshotResetXmin(void);
+static bool XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot);
 
 /*
  * Snapshot fields to be serialized.
@@ -191,6 +195,8 @@ typedef struct SerializedSnapshotData
 	CommandId	curcid;
 	TimestampTz whenTaken;
 	XLogRecPtr	lsn;
+	CSN			csn;
+	bool		imported_csn;
 } SerializedSnapshotData;
 
 Size
@@ -544,6 +550,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid,
 		   sourcesnap->subxcnt * sizeof(TransactionId));
 	CurrentSnapshot->suboverflowed = sourcesnap->suboverflowed;
 	CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery;
+	CurrentSnapshot->snapshot_csn = sourcesnap->snapshot_csn;
+	CurrentSnapshot->imported_csn = sourcesnap->imported_csn;
 	/* NB: curcid should NOT be copied, it's a local matter */
 
 	CurrentSnapshot->snapXactCompletionCount = 0;
@@ -1209,6 +1217,10 @@ ExportSnapshot(Snapshot snapshot)
 	appendStringInfo(&buf, "xmin:%u\n", snapshot->xmin);
 	appendStringInfo(&buf, "xmax:%u\n", snapshot->xmax);
 
+	appendStringInfo(&buf, "snapshot_csn:"UINT64_FORMAT"\n",
+					 snapshot->snapshot_csn);
+	appendStringInfo(&buf, "imported_csn:%u\n", snapshot->imported_csn);
+
 	/*
 	 * We must include our own top transaction ID in the top-xid data, since
 	 * by definition we will still be running when the importing transaction
@@ -1333,6 +1345,31 @@ parseIntFromText(const char *prefix, char **s, const char *filename)
 	return val;
 }
 
+static CSN
+parseCSNFromText(const char *prefix, char **s, const char *filename)
+{
+	char	   *ptr = *s;
+	int			prefixlen = strlen(prefix);
+	uint64		val;
+
+	if (strncmp(ptr, prefix, prefixlen) != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid snapshot data in file \"%s\"", filename)));
+	ptr += prefixlen;
+	if (sscanf(ptr, UINT64_FORMAT, &val) != 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid snapshot data in file \"%s\"", filename)));
+	ptr = strchr(ptr, '\n');
+	if (!ptr)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid snapshot data in file \"%s\"", filename)));
+	*s = ptr + 1;
+	return val;
+}
+
 static TransactionId
 parseXidFromText(const char *prefix, char **s, const char *filename)
 {
@@ -1474,6 +1511,9 @@ ImportSnapshot(const char *idstr)
 	snapshot.xmin = parseXidFromText("xmin:", &filebuf, path);
 	snapshot.xmax = parseXidFromText("xmax:", &filebuf, path);
 
+	snapshot.snapshot_csn = parseCSNFromText("snapshot_csn:", &filebuf, path);
+	snapshot.imported_csn = parseIntFromText("imported_csn:", &filebuf, path);
+
 	snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path);
 
 	/* sanity-check the xid count before palloc */
@@ -2130,6 +2170,8 @@ SerializeSnapshot(Snapshot snapshot, char *start_address)
 	serialized_snapshot.curcid = snapshot->curcid;
 	serialized_snapshot.whenTaken = snapshot->whenTaken;
 	serialized_snapshot.lsn = snapshot->lsn;
+	serialized_snapshot.csn = snapshot->snapshot_csn;
+	serialized_snapshot.imported_csn = snapshot->imported_csn;
 
 	/*
 	 * Ignore the SubXID array if it has overflowed, unless the snapshot was
@@ -2204,6 +2246,8 @@ RestoreSnapshot(char *start_address)
 	snapshot->curcid = serialized_snapshot.curcid;
 	snapshot->whenTaken = serialized_snapshot.whenTaken;
 	snapshot->lsn = serialized_snapshot.lsn;
+	snapshot->snapshot_csn = serialized_snapshot.csn;
+	snapshot->imported_csn = serialized_snapshot.imported_csn;
 	snapshot->snapXactCompletionCount = 0;
 
 	/* Copy XIDs, if present. */
@@ -2245,6 +2289,44 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
 
 /*
  * XidInMVCCSnapshot
+ *
+ * Check whether this xid is in snapshot. When enable_csn_snapshot is
+ * switched off just call XidInLocalMVCCSnapshot().
+ */
+bool
+XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+{
+	bool in_snapshot;
+
+	if (snapshot->imported_csn)
+	{
+		Assert(enable_csn_snapshot);
+		/* No point to using snapshot info except CSN */
+		return XidInCSNSnapshot(xid, snapshot);
+	}
+
+	in_snapshot = XidInLocalMVCCSnapshot(xid, snapshot);
+
+	if (!get_csnlog_status())
+	{
+		Assert(CSNIsFrozen(snapshot->snapshot_csn));
+		return in_snapshot;
+	}
+
+	if (in_snapshot)
+	{
+		/*
+		 * This xid may be already in unknown state and in that case
+		 * we must wait and recheck.
+		 */
+		return XidInCSNSnapshot(xid, snapshot);
+	}
+	else
+		return false;
+}
+
+/*
+ * XidInLocalMVCCSnapshot
  *		Is the given XID still-in-progress according to the snapshot?
  *
  * Note: GetSnapshotData never stores either top xid or subxids of our own
@@ -2253,8 +2335,8 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
  * TransactionIdIsCurrentTransactionId first, except when it's known the
  * XID could not be ours anyway.
  */
-bool
-XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+static bool
+XidInLocalMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
 	uint32		i;
 
@@ -2364,3 +2446,100 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 
 	return false;
 }
+
+
+/*
+ * ExportCSNSnapshot
+ *
+ * Export snapshot_csn so that caller can expand this transaction to other
+ * nodes.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+SnapshotCSN
+ExportCSNSnapshot()
+{
+	if (!get_csnlog_status())
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not export csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "enable_csn_snapshot")));
+
+	elog(DEBUG5, "Export CSN Snapshot: csn = %lu",
+		 CurrentSnapshot->snapshot_csn);
+	return CurrentSnapshot->snapshot_csn;
+}
+
+/* SQL accessor to ExportCSNSnapshot() */
+Datum
+pg_csn_snapshot_export(PG_FUNCTION_ARGS)
+{
+	SnapshotCSN csn = ExportCSNSnapshot();
+
+	PG_RETURN_UINT64(csn);
+}
+
+/*
+ * ImportCSNSnapshot
+ *
+ * Import csn and retract this backends xmin to the value that was
+ * actual when we had such csn.
+ *
+ * TODO: it's better to do this through EXPORT/IMPORT SNAPSHOT syntax and
+ * add some additional checks that transaction did not yet acquired xid, but
+ * for current iteration of this patch I don't want to hack on parser.
+ */
+void
+ImportCSNSnapshot(SnapshotCSN snapshot_csn)
+{
+	volatile TransactionId xmin;
+
+	if (!get_csnlog_status())
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is enabled.",
+					 "enable_csn_snapshot")));
+
+	if (csn_snapshot_defer_time <= 0)
+		ereport(ERROR,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("could not import csn snapshot"),
+			 errhint("Make sure the configuration parameter \"%s\" is positive.",
+					 "csn_snapshot_defer_time")));
+
+	/*
+	 * Call CSNSnapshotToXmin under ProcArrayLock to avoid situation that
+	 * resulting xmin will be evicted from map before we will set it into our
+	 * backend's xmin.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	xmin = CSNSnapshotToXmin(snapshot_csn);
+	if (!TransactionIdIsValid(xmin))
+	{
+		LWLockRelease(ProcArrayLock);
+		elog(ERROR, "CSNSnapshotToXmin: csn snapshot too old");
+	}
+
+	MyProc->originalXmin = MyProc->xmin;
+	MyProc->xmin = TransactionXmin = xmin;
+	LWLockRelease(ProcArrayLock);
+
+	CurrentSnapshot->xmin = xmin; /* defuse SnapshotResetXmin() */
+	CurrentSnapshot->snapshot_csn = snapshot_csn;
+	CurrentSnapshot->imported_csn = true;
+	CSNSnapshotSync(snapshot_csn);
+}
+
+/* SQL accessor to ImportCSNSnapshot() */
+Datum
+pg_csn_snapshot_import(PG_FUNCTION_ARGS)
+{
+	SnapshotCSN csn = PG_GETARG_UINT64(0);
+
+	ImportCSNSnapshot(csn);
+	PG_RETURN_VOID();
+}
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 31839c1a19..1864952bd2 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -227,7 +227,8 @@ static const char *const subdirs[] = {
 	"pg_xact",
 	"pg_logical",
 	"pg_logical/snapshots",
-	"pg_logical/mappings"
+	"pg_logical/mappings",
+	"pg_csn"
 };
 
 
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index f911f98d94..325e6a0e2b 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -300,6 +300,8 @@ main(int argc, char *argv[])
 		   ControlFile->max_locks_per_xact);
 	printf(_("track_commit_timestamp setting:       %s\n"),
 		   ControlFile->track_commit_timestamp ? _("on") : _("off"));
+	printf(_("enable_csn_snapshot setting:    	    %s\n"),
+		   ControlFile->enable_csn_snapshot ? 	 _("on") : _("off"));
 	printf(_("Maximum data alignment:               %u\n"),
 		   ControlFile->maxAlign);
 	/* we don't print floatFormat since can't say much useful about it */
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 3628bd74a7..18cf9197cc 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -548,6 +548,11 @@ copy_xact_xlog_xid(void)
 		check_ok();
 	}
 
+	if(old_cluster.controldata.cat_ver > CSN_BASE_SNAPSHOT_ADD_VER)
+	{
+		copy_subdir_files("pg_csn", "pg_csn");
+	}
+
 	/* now reset the wal archives in the new cluster */
 	prep_status("Resetting WAL archives");
 	exec_prog(UTILITY_LOG_FILE, NULL, true, true,
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index ca0795f68f..54f2984387 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -124,6 +124,8 @@ extern char *output_files[];
  */
 #define JSONB_FORMAT_CHANGE_CAT_VER 201409291
 
+#define	CSN_BASE_SNAPSHOT_ADD_VER	202002010
+
 
 /*
  * Each relation is represented by a relinfo structure.
diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c
index 852d8ca4b1..2d280ce940 100644
--- a/src/bin/pg_waldump/rmgrdesc.c
+++ b/src/bin/pg_waldump/rmgrdesc.c
@@ -11,6 +11,7 @@
 #include "access/brin_xlog.h"
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/generic_xlog.h"
 #include "access/ginxlog.h"
 #include "access/gistxlog.h"
diff --git a/src/include/access/csn_log.h b/src/include/access/csn_log.h
new file mode 100644
index 0000000000..12df028bf4
--- /dev/null
+++ b/src/include/access/csn_log.h
@@ -0,0 +1,98 @@
+/*
+ * csn_log.h
+ *
+ * Commit-Sequence-Number log.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csn_log.h
+ */
+#ifndef CSNLOG_H
+#define CSNLOG_H
+
+#include "access/xlog.h"
+#include "utils/snapshot.h"
+#include "storage/sync.h"
+
+
+#define InProgressCSN	 	UINT64CONST(0x0)
+#define AbortedCSN	 		UINT64CONST(0x1)
+#define FrozenCSN		 	UINT64CONST(0x2)
+#define InDoubtCSN	 		UINT64CONST(0x3)
+#define UnclearCSN	 		UINT64CONST(0x4)
+#define FirstNormalCSN 		UINT64CONST(0x5)
+
+#define CSNIsInProgress(csn)	((csn) == InProgressCSN)
+#define CSNIsAborted(csn)		((csn) == AbortedCSN)
+#define CSNIsFrozen(csn)		((csn) == FrozenCSN)
+#define CSNIsInDoubt(csn)		((csn) == InDoubtCSN)
+#define CSNIsUnclear(csn)		((csn) == UnclearCSN)
+#define CSNIsNormal(csn)		((csn) >= FirstNormalCSN)
+
+/* XLOG stuff */
+#define XLOG_CSN_ASSIGNMENT			0x00
+#define XLOG_CSN_SETCSN				0x10
+#define XLOG_CSN_ZEROPAGE			0x20
+#define XLOG_CSN_TRUNCATE			0x30
+
+/*
+ * We should log MAX generated CSN to wal, so that database will not generate
+ * a historical CSN after database restart. This may appear when system time
+ * turned back.
+ *
+ * However we can not log the MAX CSN every time it generated, if so it will
+ * cause too many wal expend, so we log it 5s more in the future.
+ *
+ * As a trade off, when this database restart, there will be 5s bad performance
+ * for time synchronization among sharding nodes.
+ *
+ * It looks like we can redefine this as a configure parameter, and the user
+ * can decide which way they prefer.
+ *
+ */
+#define	CSN_ASSIGN_TIME_INTERVAL	5
+
+typedef struct xl_csn_set
+{
+	CSN				csn;
+	TransactionId	xtop;			/* XID's top-level XID */
+	int				nsubxacts;		/* number of subtransaction XIDs */
+	TransactionId	xsub[FLEXIBLE_ARRAY_MEMBER];	/* assigned subxids */
+} xl_csn_set;
+
+#define MinSizeOfCSNSet offsetof(xl_csn_set, xsub)
+#define	CSNAddByNanosec(csn,second) (csn + second * 1000000000L)
+
+/* Main functions */
+extern void CSNLogSetCSN(TransactionId xid, int nsubxids,
+							   TransactionId *subxids, CSN csn, bool write_xlog);
+extern CSN CSNLogGetCSNByXid(TransactionId xid);
+
+/* Infrastructure functions */
+extern Size CSNLogShmemSize(void);
+extern void CSNLogShmemInit(void);
+extern void ActivateCSNlog(void);
+extern void ExtendCSNLog(TransactionId newestXact);
+extern void DeactivateCSNlog(void);
+
+extern void CheckPointCSNLog(void);
+extern void TruncateCSNLog(TransactionId oldestXact);
+
+extern void csnlog_redo(XLogReaderState *record);
+extern void csnlog_desc(StringInfo buf, XLogReaderState *record);
+extern const char *csnlog_identify(uint8 info);
+extern void WriteAssignCSNXlogRec(CSN csn);
+extern void CatchCSNLog(void);
+extern void StartupCSN(void);
+extern void CompleteCSNInitialization(void);
+extern void CSNlogParameterChange(bool newvalue, bool oldvalue);
+extern bool get_csnlog_status(void);
+extern int csnsyncfiletag(const FileTag *ftag, char *path);
+
+extern CSN GenerateCSN(bool locked, CSN assign);
+extern CSN GetLastGeneratedCSN(void);
+
+extern TransactionId GetOldestXmin(void);
+
+#endif   /* CSNLOG_H */
\ No newline at end of file
diff --git a/src/include/access/csn_snapshot.h b/src/include/access/csn_snapshot.h
new file mode 100644
index 0000000000..916603af0c
--- /dev/null
+++ b/src/include/access/csn_snapshot.h
@@ -0,0 +1,54 @@
+/*-------------------------------------------------------------------------
+ *
+ * csn_snapshot.h
+ *	  Support for cross-node snapshot isolation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csn_snapshot.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CSN_SNAPSHOT_H
+#define CSN_SNAPSHOT_H
+
+#include "access/csn_log.h"
+#include "port/atomics.h"
+#include "storage/lock.h"
+#include "utils/snapshot.h"
+#include "utils/guc.h"
+
+/*
+ * snapshot.h is used in frontend code so atomic variant of SnapshotCSN type
+ * is defined here.
+ */
+typedef pg_atomic_uint64 CSN_atomic;
+
+
+extern int csn_snapshot_defer_time;
+extern int csn_time_shift;
+
+
+extern Size CSNSnapshotShmemSize(void);
+extern void CSNSnapshotShmemInit(void);
+extern void CSNSnapshotStartup(TransactionId oldestActiveXID);
+
+extern void CSNSnapshotMapXmin(SnapshotCSN snapshot_csn);
+extern TransactionId CSNSnapshotToXmin(SnapshotCSN snapshot_csn);
+
+extern bool XidInCSNSnapshot(TransactionId xid, Snapshot snapshot);
+
+extern CSN TransactionIdGetCSN(TransactionId xid);
+
+extern void CSNSnapshotAbort(PGPROC *proc, TransactionId xid, int nsubxids,
+								TransactionId *subxids);
+extern void CSNSnapshotPrecommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void CSNSnapshotCommit(PGPROC *proc, TransactionId xid, int nsubxids,
+									TransactionId *subxids);
+extern void CSNSnapshotAssignCurrent(SnapshotCSN snapshot_csn);
+extern SnapshotCSN CSNSnapshotPrepareCurrent(void);
+extern void CSNSnapshotSync(SnapshotCSN remote_csn);
+
+#endif							/* CSN_SNAPSHOT_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index f582cf535f..3cf0775176 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i
 PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
 PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
 PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
+PG_RMGR(RM_CSNLOG_ID, "CSN", csnlog_redo, csnlog_desc, csnlog_identify, NULL, NULL, NULL)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index c0da76cab4..2ee489dcad 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -242,6 +242,7 @@ typedef struct xl_parameter_change
 	int			wal_level;
 	bool		wal_log_hints;
 	bool		track_commit_timestamp;
+	bool		enable_csn_snapshot;
 } xl_parameter_change;
 
 /* logs restore point */
@@ -332,5 +333,6 @@ extern bool ArchiveRecoveryRequested;
 extern bool InArchiveRecovery;
 extern bool StandbyMode;
 extern char *recoveryRestoreCommand;
+extern bool enable_csn_wal;
 
 #endif							/* XLOG_INTERNAL_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 749bce0cc6..a7da532f3a 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -183,6 +183,7 @@ typedef struct ControlFileData
 	int			max_prepared_xacts;
 	int			max_locks_per_xact;
 	bool		track_commit_timestamp;
+	bool		enable_csn_snapshot;
 
 	/*
 	 * This data is used to check for hardware-architecture compatibility of
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d068d6532e..d578aceb40 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11689,4 +11689,21 @@
   prorettype => 'bytea', proargtypes => 'pg_brin_minmax_multi_summary',
   prosrc => 'brin_minmax_multi_summary_send' },
 
+# csn shnapshot handling
+{ oid => '10001', descr => 'export csn snapshot',
+  proname => 'pg_csn_snapshot_export', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_csn_snapshot_export' },
+{ oid => '10002', descr => 'import csn snapshot',
+  proname => 'pg_csn_snapshot_import', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'int8', prosrc => 'pg_csn_snapshot_import' },
+{ oid => '10003', descr => 'prepare distributed transaction for commit, get csn',
+  proname => 'pg_csn_snapshot_prepare', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => 'text', prosrc => 'pg_csn_snapshot_prepare' },
+{ oid => '10004', descr => 'assign csn to distributed transaction',
+  proname => 'pg_csn_snapshot_assign', provolatile => 'v', proparallel => 'u',
+  prorettype => 'void', proargtypes => 'text int8', prosrc => 'pg_csn_snapshot_assign' },
+{ oid => '10005', descr => 'get current CSN',
+  proname => 'pg_current_csn', provolatile => 'v', proparallel => 'u',
+  prorettype => 'int8', proargtypes => '', prosrc => 'pg_current_csn' },
+
 ]
diff --git a/src/include/datatype/timestamp.h b/src/include/datatype/timestamp.h
index 99873497a6..8d1ced7430 100644
--- a/src/include/datatype/timestamp.h
+++ b/src/include/datatype/timestamp.h
@@ -93,6 +93,9 @@ typedef struct
 #define USECS_PER_MINUTE INT64CONST(60000000)
 #define USECS_PER_SEC	INT64CONST(1000000)
 
+#define NSECS_PER_SEC	INT64CONST(1000000000)
+#define NSECS_PER_USEC	INT64CONST(1000)
+
 /*
  * We allow numeric timezone offsets up to 15:59:59 either way from Greenwich.
  * Currently, the record holders for wackiest offsets in actual use are zones
diff --git a/src/include/fmgr.h b/src/include/fmgr.h
index ab7b85c86e..f08999740b 100644
--- a/src/include/fmgr.h
+++ b/src/include/fmgr.h
@@ -281,6 +281,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum);
 #define PG_GETARG_FLOAT4(n)  DatumGetFloat4(PG_GETARG_DATUM(n))
 #define PG_GETARG_FLOAT8(n)  DatumGetFloat8(PG_GETARG_DATUM(n))
 #define PG_GETARG_INT64(n)	 DatumGetInt64(PG_GETARG_DATUM(n))
+#define PG_GETARG_UINT64(n)	 DatumGetUInt64(PG_GETARG_DATUM(n))
 /* use this if you want the raw, possibly-toasted input datum: */
 #define PG_GETARG_RAW_VARLENA_P(n)	((struct varlena *) PG_GETARG_POINTER(n))
 /* use this if you want the input datum de-toasted: */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 39a4f0600e..a78f0d284b 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -141,6 +141,9 @@ typedef struct timespec instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec))
+
 #else							/* !HAVE_CLOCK_GETTIME */
 
 /* Use gettimeofday() */
@@ -205,6 +208,10 @@ typedef struct timeval instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) (t).tv_usec)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(((uint64) (t).tv_sec * (uint64) 1000000000) + \
+		(uint64) (t).tv_usec * (uint64) 1000)
+
 #endif							/* HAVE_CLOCK_GETTIME */
 
 #else							/* WIN32 */
@@ -237,6 +244,9 @@ typedef LARGE_INTEGER instr_time;
 #define INSTR_TIME_GET_MICROSEC(t) \
 	((uint64) (((double) (t).QuadPart * 1000000.0) / GetTimerFrequency()))
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	((uint64) (((double) (t).QuadPart * 1000000000.0) / GetTimerFrequency()))
+
 static inline double
 GetTimerFrequency(void)
 {
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index a8f052e484..65d1e49fb2 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -168,6 +168,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS,
 	LWTRANCHE_COMMITTS_BUFFER,
 	LWTRANCHE_SUBTRANS_BUFFER,
+	LWTRANCHE_CSN_LOG_BUFFERS,
 	LWTRANCHE_MULTIXACTOFFSET_BUFFER,
 	LWTRANCHE_MULTIXACTMEMBER_BUFFER,
 	LWTRANCHE_NOTIFY_BUFFER,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index be67d8a861..ade5d8e169 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -15,12 +15,14 @@
 #define _PROC_H_
 
 #include "access/clog.h"
+#include "access/csn_snapshot.h"
 #include "access/xlogdefs.h"
 #include "lib/ilist.h"
 #include "storage/latch.h"
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "utils/snapshot.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
@@ -251,6 +253,18 @@ struct PGPROC
 	PGPROC	   *lockGroupLeader;	/* lock group leader, if I'm a member */
 	dlist_head	lockGroupMembers;	/* list of members, if I'm a leader */
 	dlist_node	lockGroupLink;	/* my member link, if I'm a member */
+
+	/*
+	 * assignedCSN holds CSN for this transaction.  It is generated
+	 * under a ProcArray lock and later is written to a CSNLog.  This
+	 * variable defined as atomic only for case of group commit, in all other
+	 * scenarios only backend responsible for this proc entry is working with
+	 * this variable.
+	 */
+	CSN_atomic assignedCSN;
+
+	/* Original xmin of this backend before csn snapshot was imported */
+	TransactionId originalXmin;
 };
 
 /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index b01fa52139..ba580435f9 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -20,6 +20,10 @@
 #include "utils/snapshot.h"
 
 
+#define		PROCARRAY_NON_IMPORTED_XMIN		0x80	/* use originalXmin instead
+													 * of xmin to properly
+													 * maintain csnXidMap */
+
 extern Size ProcArrayShmemSize(void);
 extern void CreateSharedProcArray(void);
 extern void ProcArrayAdd(PGPROC *proc);
@@ -94,4 +98,7 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
 extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 											TransactionId *catalog_xmin);
 
+extern void ProcArraySetCSNSnapshotXmin(TransactionId xmin);
+
+extern TransactionId ProcArrayGetCSNSnapshotXmin(void);
 #endif							/* PROCARRAY_H */
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index 6fd50cfa7b..eb1d52673a 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -39,6 +39,7 @@ typedef enum SyncRequestHandler
 	SYNC_HANDLER_COMMIT_TS,
 	SYNC_HANDLER_MULTIXACT_OFFSET,
 	SYNC_HANDLER_MULTIXACT_MEMBER,
+	SYNC_HANDLER_CSN,
 	SYNC_HANDLER_NONE
 } SyncRequestHandler;
 
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index c6a176cc95..122eea20ba 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -41,10 +41,11 @@
   && !RelationIsAccessibleInLogicalDecoding(rel) \
 )
 
-#define EarlyPruningEnabled(rel) (old_snapshot_threshold >= 0 && RelationAllowsEarlyPruning(rel))
+#define EarlyPruningEnabled(rel) (old_snapshot_threshold >= 0 && !enable_csn_snapshot && RelationAllowsEarlyPruning(rel))
 
 /* GUC variables */
 extern PGDLLIMPORT int old_snapshot_threshold;
+extern PGDLLIMPORT bool enable_csn_snapshot;
 
 
 extern Size SnapMgrShmemSize(void);
@@ -100,7 +101,7 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData;
 static inline bool
 OldSnapshotThresholdActive(void)
 {
-	return old_snapshot_threshold >= 0;
+	return (old_snapshot_threshold >= 0) && (!enable_csn_snapshot);
 }
 
 extern Snapshot GetTransactionSnapshot(void);
@@ -130,6 +131,8 @@ extern void AtSubCommit_Snapshot(int level);
 extern void AtSubAbort_Snapshot(int level);
 extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin);
 
+extern SnapshotCSN ExportCSNSnapshot(void);
+extern void ImportCSNSnapshot(SnapshotCSN snapshot_csn);
 extern void ImportSnapshot(const char *idstr);
 extern bool XactHasExportedSnapshots(void);
 extern void DeleteAllExportedSnapshotFiles(void);
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 6b60755c53..3580a94c43 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -121,6 +121,9 @@ typedef enum SnapshotType
 typedef struct SnapshotData *Snapshot;
 
 #define InvalidSnapshot		((Snapshot) NULL)
+#define InvalidCSN			((CSN) 0)
+typedef uint64 CSN;
+typedef uint64 SnapshotCSN;
 
 /*
  * Struct representing all kind of possible snapshots.
@@ -214,6 +217,14 @@ typedef struct SnapshotData
 	 * transactions completed since the last GetSnapshotData().
 	 */
 	uint64		snapXactCompletionCount;
+
+	/*
+	 * SnapshotCSN for snapshot isolation support.
+	 * Will be used only if enable_csn_snapshot is enabled.
+	 */
+	SnapshotCSN	snapshot_csn;
+	/* Did we have our own snapshot_csn or imported one from different node */
+	bool		imported_csn;
 } SnapshotData;
 
 #endif							/* SNAPSHOT_H */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index dffc79b2d9..16bb65e7e1 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
 SUBDIRS = \
 		  brin \
 		  commit_ts \
+		  csnsnapshot \
 		  delay_execution \
 		  dummy_index_am \
 		  dummy_seclabel \
diff --git a/src/test/modules/csnsnapshot/Makefile b/src/test/modules/csnsnapshot/Makefile
new file mode 100644
index 0000000000..15a07f8846
--- /dev/null
+++ b/src/test/modules/csnsnapshot/Makefile
@@ -0,0 +1,22 @@
+# src/test/modules/csnsnapshot/Makefile
+
+NO_INSTALLCHECK = 1
+
+TAP_TESTS = 1
+
+# Doesn't support full consistency of distributed commit in READ COMMITTED
+# transactions.
+PROVE_TESTS =	t/001_base.pl \
+				t/002_standby.pl \
+				t/003_parallel_safe.pl
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/csnsnapshot
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/csnsnapshot/expected/csnsnapshot.out b/src/test/modules/csnsnapshot/expected/csnsnapshot.out
new file mode 100644
index 0000000000..ac28e417b6
--- /dev/null
+++ b/src/test/modules/csnsnapshot/expected/csnsnapshot.out
@@ -0,0 +1 @@
+create table t1(i int, j int, k varchar);
diff --git a/src/test/modules/csnsnapshot/t/001_base.pl b/src/test/modules/csnsnapshot/t/001_base.pl
new file mode 100644
index 0000000000..b81419512e
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/001_base.pl
@@ -0,0 +1,100 @@
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More tests => 5;
+
+my ($node, $test_snapshot, $count1, $count2);
+$node = PostgreSQL::Test::Cluster->new('csntest');
+$node->init;
+$node->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					max_prepared_transactions = 10
+					});
+$node->start;
+
+# Create a table
+$node->safe_psql('postgres', 'create table t1(i int, j int)');
+
+# insert test record
+$node->safe_psql('postgres', 'insert into t1 values(1,1)');
+# export csn snapshot
+$test_snapshot = $node->safe_psql('postgres', 'select pg_csn_snapshot_export()');
+# insert test record
+$node->safe_psql('postgres', 'insert into t1 values(2,1)');
+
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '2', 'Get right number in normal query');
+$count2 = $node->safe_psql('postgres', "
+			begin transaction isolation level repeatable read;
+			select pg_csn_snapshot_import($test_snapshot);
+			select count(*) from t1;
+			commit;"
+			);
+
+is($count2, '
+1', 'Get right number in csn import query');
+
+#prepare transaction test
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(3,1);
+						insert into t1 values(3,2);
+						prepare	transaction 'pt3';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(4,1);
+						insert into t1 values(4,2);
+						prepare	transaction 'pt4';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(5,1);
+						insert into t1 values(5,2);
+						prepare	transaction 'pt5';
+						");
+$node->safe_psql('postgres', "
+						begin;
+						insert into t1 values(6,1);
+						insert into t1 values(6,2);
+						prepare	transaction 'pt6';
+						");
+$node->safe_psql('postgres', "commit prepared 'pt4';");
+
+# restart with enable_csn_snapshot off
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = off");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(7,1);
+						insert into t1 values(7,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt3';");
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '8', 'Get right number in normal query');
+
+
+# restart with enable_csn_snapshot on
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = on");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(8,1);
+						insert into t1 values(8,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt5';");
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '12', 'Get right number in normal query');
+
+# restart with enable_csn_snapshot off
+$node->append_conf('postgresql.conf', "enable_csn_snapshot = on");
+$node->restart;
+$node->safe_psql('postgres', "
+						insert into t1 values(9,1);
+						insert into t1 values(9,2);
+						");
+$node->safe_psql('postgres', "commit prepared 'pt6';");
+
+$count1 = $node->safe_psql('postgres', "select count(*) from t1");
+is($count1, '16', 'Get right number in normal query');
diff --git a/src/test/modules/csnsnapshot/t/002_standby.pl b/src/test/modules/csnsnapshot/t/002_standby.pl
new file mode 100644
index 0000000000..27fcbb8f8a
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/002_standby.pl
@@ -0,0 +1,68 @@
+# Test simple scenario involving a standby
+
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More tests => 6;
+
+my ($master, $bkplabel, $standby, $guc_on_master, $guc_on_standby);
+
+$bkplabel = 'backup';
+$master = PostgreSQL::Test::Cluster->new('master');
+$master->init(allows_streaming => 1);
+
+$master->append_conf(
+	'postgresql.conf', qq{
+	enable_csn_snapshot = on
+	max_wal_senders = 5
+	});
+$master->start;
+$master->backup($bkplabel);
+
+$standby = PostgreSQL::Test::Cluster->new('standby');
+$standby->init_from_backup($master, $bkplabel, has_streaming => 1);
+$standby->start;
+
+$master->safe_psql('postgres', "create table t1(i int, j int)");
+
+$guc_on_master = $master->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_master, 'on', "GUC on master");
+
+$guc_on_standby = $standby->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_standby, 'on', "GUC on standby");
+
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = off');
+$master->restart;
+
+$guc_on_master = $master->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_master, 'off', "GUC off master");
+
+$guc_on_standby = $standby->safe_psql('postgres', 'show enable_csn_snapshot');
+is($guc_on_standby, 'on', "GUC on standby");
+
+# We consume a large number of transaction,for skip page
+for my $i (1 .. 4096) #4096
+{
+	$master->safe_psql('postgres', "insert into t1 values(1,$i)");
+}
+$master->safe_psql('postgres', "select pg_sleep(2)");
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = on');
+$master->restart;
+
+my $count_standby = $standby->safe_psql('postgres', 'select count(*) from t1');
+is($count_standby, '4096', "Ok for siwtch xid-base > csn-base"); #4096
+
+# We consume a large number of transaction,for skip page
+for my $i (1 .. 4096) #4096
+{
+	$master->safe_psql('postgres', "insert into t1 values(1,$i)");
+}
+$master->safe_psql('postgres', "select pg_sleep(2)");
+
+$master->append_conf('postgresql.conf', 'enable_csn_snapshot = off');
+$master->restart;
+
+$count_standby = $standby->safe_psql('postgres', 'select count(*) from t1');
+is($count_standby, '8192', "Ok for switch csn-base > xid-base"); #8192
\ No newline at end of file
diff --git a/src/test/modules/csnsnapshot/t/003_parallel_safe.pl b/src/test/modules/csnsnapshot/t/003_parallel_safe.pl
new file mode 100644
index 0000000000..e303e3f1a6
--- /dev/null
+++ b/src/test/modules/csnsnapshot/t/003_parallel_safe.pl
@@ -0,0 +1,67 @@
+# Check safety of CSN machinery for parallel mode.
+
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More tests => 2;
+
+my ($node, $updScr, $selScr, $started, $pgb_handle1, $result, $errors);
+
+$node = PostgreSQL::Test::Cluster->new('csntest');
+$node->init;
+$node->append_conf('postgresql.conf', qq{
+					enable_csn_snapshot = on
+					csn_snapshot_defer_time = 10
+					default_transaction_isolation = 'REPEATABLE READ'
+
+					# force parallel mode.
+					max_worker_processes = 64
+					max_parallel_workers_per_gather = 16
+					max_parallel_workers = 32
+					parallel_setup_cost = 1
+					parallel_tuple_cost = 0.05
+					min_parallel_table_scan_size = 0
+});
+$node->start;
+
+$node->command_ok([ 'pgbench', '-i', '-s', '1' ], "pgbench initialization ok");
+$node->safe_psql('postgres', qq{
+	CREATE OR REPLACE FUNCTION cnt() RETURNS integer AS '
+		SELECT sum(abalance) FROM pgbench_accounts;
+	' LANGUAGE SQL PARALLEL SAFE COST 100000.;
+});
+
+
+$updScr = File::Temp->new();
+append_to_file($updScr, q{
+	UPDATE pgbench_accounts SET abalance = abalance + 1 WHERE aid = 1;
+});
+
+$selScr = '
+	SELECT count(*) AS res FROM (
+		SELECT cnt() AS y FROM pgbench_accounts WHERE aid < 20
+		GROUP BY (y)
+	) AS q;
+';
+
+# Launch updates
+$pgb_handle1 = $node->pgbench_async(-n, -T => 10, -f => $updScr, 'postgres' );
+
+$errors = 0;
+$started = time();
+while (time() - $started < 10)
+{
+	# Check that each worker returns the same sum on balance column.
+	$result = $node->safe_psql('postgres', $selScr);
+	if ($result ne 1)
+	{
+		$errors++;
+		diag("Workers returned different sums: $result");
+	}
+}
+is($errors, 0, 'isolation between UPDATE and concurrent SELECT workers.');
+
+$node->pgbench_await($pgb_handle1);
+$node->stop();
\ No newline at end of file
diff --git a/src/test/modules/snapshot_too_old/sto.conf b/src/test/modules/snapshot_too_old/sto.conf
index 7eeaeeb0dc..3177cc0e15 100644
--- a/src/test/modules/snapshot_too_old/sto.conf
+++ b/src/test/modules/snapshot_too_old/sto.conf
@@ -1,2 +1,3 @@
 autovacuum = off
 old_snapshot_threshold = 0
+enable_csn_snapshot = false
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 9467a199c8..9d38a4922d 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -2068,6 +2068,34 @@ sub pgbench
 	$self->command_checks_all(\@cmd, $stat, $out, $err, $name);
 }
 
+sub pgbench_async()
+{
+	my ($self, @args) = @_;
+
+	my ($in, $out, $err, $rc);
+	$in = '';
+	$out = '';
+
+	my @pgbench_command = (
+		'pgbench',
+		-h => $self->host,
+		-p => $self->port,
+		@args
+	);
+	my $handle = IPC::Run::start(\@pgbench_command, $in, $out);
+	return $handle;
+}
+
+sub pgbench_await()
+{
+	my ($self, $pgbench_handle) = @_;
+
+	# During run some pgbench threads can exit (for example due to
+	# serialization error). That will set non-zero returning code.
+	# So don't check return code here and leave it to a caller.
+	my $rc = IPC::Run::finish($pgbench_handle);
+}
+
 =pod
 
 =item $node->connect_ok($connstr, $test_name, %params)
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 2088857615..010b3b3144 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -104,6 +104,8 @@ select name, setting from pg_settings where name like 'enable%';
 --------------------------------+---------
  enable_async_append            | on
  enable_bitmapscan              | on
+ enable_csn_snapshot            | on
+ enable_csn_wal                 | on
  enable_gathermerge             | on
  enable_hashagg                 | on
  enable_hashjoin                | on
@@ -122,7 +124,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(20 rows)
+(22 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
-- 
2.25.1