From 0a15a118d9b88a3e327cf76dfe297c17bf17fb01 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 19 Aug 2013 13:24:30 +0200
Subject: [PATCH 4/8] wal_decoding: Introduce wal decoding via catalog
 timetravel

This introduces several things:
* 'reorderbuffer' module which reassembles transactions from a stream of interspersed changes
* 'snapbuilder' which builds catalog snapshots so that tuples from wal can be understood
* logging more data into wal to facilitate logical decoding
* wal decoding into a reorderbuffer
* shared library output plugins with 5 callbacks (sketched below)
 * init
 * begin
 * change
 * commit
* walsender infrastructure to stream out changes and to keep the global xmin low enough (example session below)
 * INIT_LOGICAL_REPLICATION $plugin; waits till a consistent snapshot is built and returns
   * initial LSN
   * replication slot identifier
   * id of a pg_export() style snapshot
 * START_LOGICAL_REPLICATION $id $lsn; streams out changes
 * uses named output plugins for output specification
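
For orientation, here is a minimal sketch of what such an output plugin could
look like. The callback names and signatures below are illustrative
assumptions only; the authoritative prototypes live in
src/include/replication/output_plugin.h:

  #include "postgres.h"
  #include "fmgr.h"
  #include "access/xlogdefs.h"
  #include "lib/stringinfo.h"
  #include "replication/output_plugin.h"
  #include "replication/reorderbuffer.h"
  #include "utils/rel.h"

  PG_MODULE_MAGIC;

  /* NB: signatures are assumptions; see output_plugin.h for the real ones */

  void
  pg_decode_init(void **private_data)
  {
      /* allocate per-slot state, parse plugin options, ... */
  }

  void
  pg_decode_begin_txn(void *private_data, StringInfo out, ReorderBufferTXN *txn)
  {
      appendStringInfo(out, "BEGIN %u", txn->xid);
  }

  void
  pg_decode_change(void *private_data, StringInfo out, ReorderBufferTXN *txn,
                   Relation relation, ReorderBufferChange *change)
  {
      /* render the INSERT/UPDATE/DELETE described by 'change' as text */
  }

  void
  pg_decode_commit_txn(void *private_data, StringInfo out, ReorderBufferTXN *txn,
                       XLogRecPtr commit_lsn)
  {
      appendStringInfo(out, "COMMIT %u", txn->xid);
  }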
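
A hypothetical walsender session using the new commands (the plugin name, slot
identifier, LSN and snapshot id below are made-up placeholders; the exact
argument syntax is defined by the grammar in repl_gram.y):

  INIT_LOGICAL_REPLICATION 'my_plugin'
  -- blocks until a consistent snapshot has been assembled, then returns the
  -- initial LSN (e.g. 0/16B21F8), a replication slot identifier (e.g. 'id-0')
  -- and a snapshot id; the snapshot can presumably be imported in a normal
  -- backend via SET TRANSACTION SNAPSHOT to copy over pre-existing data

  START_LOGICAL_REPLICATION 'id-0' 0/16B21F8
  -- streams out changes from that LSN onwards, formatted by 'my_plugin'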

Todo:
* better integrated testing infrastructure
* more docs about the internals

Lowlevel:
* resource owner handling is suboptimal
* invalidations from uninteresting transactions (e.g. from other databases, old ones)
  need to be processed anyway
* error handling in walsender is suboptimal
* pg_receivellog needs to send a reply immediately when postgres is shutting down

Input, Testing and Review by:
Heikki Linnakangas
Kevin Grittner
Michael Paquier
Abhijit Menon-Sen
Peter Geoghegan
Robert Haas
Simon Riggs
Steve Singer

Code By:
Andres Freund

With code contributions by:
Abhijit Menon-Sen
Craig Ringer
Alvaro Herrera

Conflicts:
	src/backend/replication/repl_gram.y
---
 src/backend/access/common/reloptions.c          |   10 +
 src/backend/access/heap/heapam.c                |  465 ++++-
 src/backend/access/heap/pruneheap.c             |    2 +
 src/backend/access/index/indexam.c              |   14 +-
 src/backend/access/rmgrdesc/heapdesc.c          |    9 +
 src/backend/access/rmgrdesc/xlogdesc.c          |    1 +
 src/backend/access/transam/twophase.c           |    4 +-
 src/backend/access/transam/xact.c               |   48 +-
 src/backend/access/transam/xlog.c               |   14 +-
 src/backend/catalog/catalog.c                   |   14 +-
 src/backend/catalog/index.c                     |   15 +-
 src/backend/catalog/system_views.sql            |   10 +
 src/backend/commands/analyze.c                  |    2 +-
 src/backend/commands/cluster.c                  |    2 +
 src/backend/commands/trigger.c                  |    3 +-
 src/backend/commands/vacuum.c                   |    5 +-
 src/backend/commands/vacuumlazy.c               |    3 +
 src/backend/postmaster/postmaster.c             |    2 +-
 src/backend/replication/Makefile                |    2 +
 src/backend/replication/logical/Makefile        |   19 +
 src/backend/replication/logical/decode.c        |  687 ++++++
 src/backend/replication/logical/logical.c       | 1046 ++++++++++
 src/backend/replication/logical/logicalfuncs.c  |  361 ++++
 src/backend/replication/logical/reorderbuffer.c | 2548 +++++++++++++++++++++++
 src/backend/replication/logical/snapbuild.c     | 1581 ++++++++++++++
 src/backend/replication/repl_gram.y             |   75 +-
 src/backend/replication/repl_scanner.l          |   55 +-
 src/backend/replication/walreceiver.c           |    2 +-
 src/backend/replication/walsender.c             |  733 ++++++-
 src/backend/storage/ipc/ipci.c                  |    3 +
 src/backend/storage/ipc/procarray.c             |   72 +-
 src/backend/storage/ipc/standby.c               |   15 +
 src/backend/utils/cache/inval.c                 |    4 +-
 src/backend/utils/cache/relcache.c              |  113 +-
 src/backend/utils/misc/guc.c                    |   12 +
 src/backend/utils/misc/postgresql.conf.sample   |   11 +-
 src/backend/utils/time/snapmgr.c                |    7 +-
 src/backend/utils/time/tqual.c                  |  270 ++-
 src/bin/initdb/initdb.c                         |    4 +-
 src/bin/pg_controldata/pg_controldata.c         |    2 +
 src/include/access/heapam_xlog.h                |   59 +-
 src/include/access/transam.h                    |    5 +
 src/include/access/xact.h                       |    1 +
 src/include/access/xlog.h                       |    8 +-
 src/include/access/xlogreader.h                 |   13 +-
 src/include/catalog/catalog.h                   |    1 +
 src/include/catalog/pg_proc.h                   |    6 +
 src/include/commands/vacuum.h                   |    2 +-
 src/include/nodes/nodes.h                       |    3 +
 src/include/nodes/replnodes.h                   |   35 +
 src/include/replication/decode.h                |   20 +
 src/include/replication/logical.h               |  198 ++
 src/include/replication/logicalfuncs.h          |   21 +
 src/include/replication/output_plugin.h         |   70 +
 src/include/replication/reorderbuffer.h         |  342 +++
 src/include/replication/snapbuild.h             |   81 +
 src/include/replication/walsender_private.h     |    6 +-
 src/include/storage/itemptr.h                   |    3 +
 src/include/storage/lwlock.h                    |    1 +
 src/include/storage/procarray.h                 |    2 +-
 src/include/storage/sinval.h                    |    2 +
 src/include/utils/inval.h                       |    1 +
 src/include/utils/rel.h                         |   30 +-
 src/include/utils/relcache.h                    |   11 +-
 src/include/utils/snapmgr.h                     |    3 +
 src/include/utils/tqual.h                       |   21 +-
 src/test/regress/expected/rules.out             |    9 +-
 src/tools/pgindent/typedefs.list                |   40 +
 68 files changed, 9033 insertions(+), 206 deletions(-)
 create mode 100644 src/backend/replication/logical/Makefile
 create mode 100644 src/backend/replication/logical/decode.c
 create mode 100644 src/backend/replication/logical/logical.c
 create mode 100644 src/backend/replication/logical/logicalfuncs.c
 create mode 100644 src/backend/replication/logical/reorderbuffer.c
 create mode 100644 src/backend/replication/logical/snapbuild.c
 create mode 100644 src/include/replication/decode.h
 create mode 100644 src/include/replication/logical.h
 create mode 100644 src/include/replication/logicalfuncs.h
 create mode 100644 src/include/replication/output_plugin.h
 create mode 100644 src/include/replication/reorderbuffer.h
 create mode 100644 src/include/replication/snapbuild.h

diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index b5fd30a..e1e5040 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -63,6 +63,14 @@ static relopt_bool boolRelOpts[] =
 	},
 	{
 		{
+			"treat_as_catalog_table",
+			"Treat table as a catalog table for the purpose of logical replication",
+			RELOPT_KIND_HEAP
+		},
+		false
+	},
+	{
+		{
 			"fastupdate",
 			"Enables \"fast update\" feature for this GIN index",
 			RELOPT_KIND_GIN
@@ -1166,6 +1174,8 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
 		offsetof(StdRdOptions, security_barrier)},
 		{"check_option", RELOPT_TYPE_STRING,
 		offsetof(StdRdOptions, check_option_offset)},
+		{"treat_as_catalog_table", RELOPT_TYPE_BOOL,
+		 offsetof(StdRdOptions, treat_as_catalog_table)}
 	};
 
 	options = parseRelOptions(reloptions, validate, kind, &numoptions);
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index ead3d69..1a7281f 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -85,12 +85,14 @@ static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
 					TransactionId xid, CommandId cid, int options);
 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
 				Buffer newbuf, HeapTuple oldtup,
-				HeapTuple newtup, bool all_visible_cleared,
-				bool new_all_visible_cleared);
+				HeapTuple newtup, HeapTuple old_idx_tup,
+				bool all_visible_cleared, bool new_all_visible_cleared);
 static void HeapSatisfiesHOTandKeyUpdate(Relation relation,
-							 Bitmapset *hot_attrs, Bitmapset *key_attrs,
-							 bool *satisfies_hot, bool *satisfies_key,
-							 HeapTuple oldtup, HeapTuple newtup);
+						  Bitmapset *hot_attrs,
+						  Bitmapset *key_attrs, Bitmapset *ckey_attrs,
+						  bool *satisfies_hot, bool *satisfies_key,
+						  bool *satisfies_ckey,
+						  HeapTuple oldtup, HeapTuple newtup);
 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
 						  uint16 old_infomask2, TransactionId add_to_xmax,
 						  LockTupleMode mode, bool is_update,
@@ -108,6 +110,8 @@ static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 static bool ConditionalMultiXactIdWait(MultiXactId multi,
 						   MultiXactStatus status, int *remaining,
 						   uint16 infomask);
+static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
+static HeapTuple ExtractKeyTuple(Relation rel, HeapTuple tup);
 
 
 /*
@@ -342,8 +346,10 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
 	/*
 	 * Prune and repair fragmentation for the whole page, if possible.
 	 */
-	Assert(TransactionIdIsValid(RecentGlobalXmin));
-	heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+	if (IsSystemRelation(scan->rs_rd) || RelationIsDoingTimetravel(scan->rs_rd))
+		heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+	else
+		heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalDataXmin);
 
 	/*
 	 * We must hold share lock on the buffer content while examining tuple
@@ -1743,10 +1749,16 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 		 */
 		if (!skip)
 		{
+			/* set up the redirected t_self for the benefit of timetravel access */
+			ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
+
 			/* If it's visible per the snapshot, we must return it */
 			valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
 			CheckForSerializableConflictOut(valid, relation, heapTuple,
 											buffer, snapshot);
+			/* reset original, non-redirected, tid */
+			heapTuple->t_self = *tid;
+
 			if (valid)
 			{
 				ItemPointerSetOffsetNumber(tid, offnum);
@@ -2101,11 +2113,24 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 		xl_heap_insert xlrec;
 		xl_heap_header xlhdr;
 		XLogRecPtr	recptr;
-		XLogRecData rdata[3];
+		XLogRecData rdata[4];
 		Page		page = BufferGetPage(buffer);
 		uint8		info = XLOG_HEAP_INSERT;
+		bool		need_tuple_data;
+
+		/*
+		 * For logical replication, we need the tuple even if we're doing a
+		 * full page write, so make sure to log it separately. (XXX We could
+		 * alternatively store a pointer into the FPW).
+		 *
+		 * Also, if this is a catalog, we need to transmit combocids to
+		 * properly decode, so log that as well.
+		 */
+		need_tuple_data = RelationIsLogicallyLogged(relation);
+		if (RelationIsDoingTimetravel(relation))
+			log_heap_new_cid(relation, heaptup);
 
-		xlrec.all_visible_cleared = all_visible_cleared;
+		xlrec.flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0;
 		xlrec.target.node = relation->rd_node;
 		xlrec.target.tid = heaptup->t_self;
 		rdata[0].data = (char *) &xlrec;
@@ -2124,18 +2149,35 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 		 */
 		rdata[1].data = (char *) &xlhdr;
 		rdata[1].len = SizeOfHeapHeader;
-		rdata[1].buffer = buffer;
+		rdata[1].buffer = need_tuple_data ? InvalidBuffer : buffer;
 		rdata[1].buffer_std = true;
 		rdata[1].next = &(rdata[2]);
 
 		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
 		rdata[2].data = (char *) heaptup->t_data + offsetof(HeapTupleHeaderData, t_bits);
 		rdata[2].len = heaptup->t_len - offsetof(HeapTupleHeaderData, t_bits);
-		rdata[2].buffer = buffer;
+		rdata[2].buffer = need_tuple_data ? InvalidBuffer : buffer;
 		rdata[2].buffer_std = true;
 		rdata[2].next = NULL;
 
 		/*
+		 * Add a record for the buffer without actual content; it is removed
+		 * if a full page write is done for that buffer.
+		 */
+		if (need_tuple_data)
+		{
+			rdata[2].next = &(rdata[3]);
+
+			rdata[3].data = NULL;
+			rdata[3].len = 0;
+			rdata[3].buffer = buffer;
+			rdata[3].buffer_std = true;
+			rdata[3].next = NULL;
+
+			xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE;
+		}
+
+		/*
 		 * If this is the single and first tuple on page, we can reinit the
 		 * page instead of restoring the whole thing.  Set flag, and hide
 		 * buffer references from XLogInsert.
@@ -2144,7 +2186,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 			PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
 		{
 			info |= XLOG_HEAP_INIT_PAGE;
-			rdata[1].buffer = rdata[2].buffer = InvalidBuffer;
+			rdata[1].buffer = rdata[2].buffer = rdata[3].buffer = InvalidBuffer;
 		}
 
 		recptr = XLogInsert(RM_HEAP_ID, info, rdata);
@@ -2270,6 +2312,8 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 	Page		page;
 	bool		needwal;
 	Size		saveFreeSpace;
+	bool        need_tuple_data = RelationIsLogicallyLogged(relation);
+	bool        need_cids = RelationIsDoingTimetravel(relation);
 
 	needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
@@ -2356,7 +2400,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 		{
 			XLogRecPtr	recptr;
 			xl_heap_multi_insert *xlrec;
-			XLogRecData rdata[2];
+			XLogRecData rdata[3];
 			uint8		info = XLOG_HEAP2_MULTI_INSERT;
 			char	   *tupledata;
 			int			totaldatalen;
@@ -2386,7 +2430,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 			/* the rest of the scratch space is used for tuple data */
 			tupledata = scratchptr;
 
-			xlrec->all_visible_cleared = all_visible_cleared;
+			xlrec->flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0;
 			xlrec->node = relation->rd_node;
 			xlrec->blkno = BufferGetBlockNumber(buffer);
 			xlrec->ntuples = nthispage;
@@ -2418,6 +2462,13 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 					   datalen);
 				tuphdr->datalen = datalen;
 				scratchptr += datalen;
+
+				/*
+				 * We don't use heap_multi_insert for catalog tuples yet, but
+				 * better be prepared...
+				 */
+				if (need_cids)
+					log_heap_new_cid(relation, heaptup);
 			}
 			totaldatalen = scratchptr - tupledata;
 			Assert((scratchptr - scratch) < BLCKSZ);
@@ -2429,17 +2480,33 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
 
 			rdata[1].data = tupledata;
 			rdata[1].len = totaldatalen;
-			rdata[1].buffer = buffer;
+			rdata[1].buffer = need_tuple_data ? InvalidBuffer : buffer;
 			rdata[1].buffer_std = true;
 			rdata[1].next = NULL;
 
 			/*
+			 * Add a record for the buffer without actual content; it is removed
+			 * if a full page write is done for that buffer.
+			 */
+			if (need_tuple_data)
+			{
+				rdata[1].next = &(rdata[2]);
+
+				rdata[2].data = NULL;
+				rdata[2].len = 0;
+				rdata[2].buffer = buffer;
+				rdata[2].buffer_std = true;
+				rdata[2].next = NULL;
+				xlrec->flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE;
+			}
+
+			/*
 			 * If we're going to reinitialize the whole page using the WAL
 			 * record, hide buffer reference from XLogInsert.
 			 */
 			if (init)
 			{
-				rdata[1].buffer = InvalidBuffer;
+				rdata[1].buffer = rdata[2].buffer = InvalidBuffer;
 				info |= XLOG_HEAP_INIT_PAGE;
 			}
 
@@ -2559,6 +2626,9 @@ heap_delete(Relation relation, ItemPointer tid,
 	bool		have_tuple_lock = false;
 	bool		iscombo;
 	bool		all_visible_cleared = false;
+	bool		need_tuple_data = RelationNeedsWAL(relation) &&
+		RelationIsLogicallyLogged(relation);
+	HeapTuple idx_tuple = NULL; /* primary key of the tuple */
 
 	Assert(ItemPointerIsValid(tid));
 
@@ -2732,6 +2802,15 @@ l1:
 	/* replace cid with a combo cid if necessary */
 	HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
 
+	/*
+	 * Compute primary key tuple before entering the critical section so we
+	 * don't PANIC upon a memory allocation failure.
+	 */
+	if (need_tuple_data)
+	{
+		idx_tuple = ExtractKeyTuple(relation, &tp);
+	}
+
 	START_CRIT_SECTION();
 
 	/*
@@ -2784,9 +2863,13 @@ l1:
 	{
 		xl_heap_delete xlrec;
 		XLogRecPtr	recptr;
-		XLogRecData rdata[2];
+		XLogRecData rdata[4];
+
+		/* For logical decoding we need combocids to properly decode the catalog */
+		if (RelationIsDoingTimetravel(relation))
+			log_heap_new_cid(relation, &tp);
 
-		xlrec.all_visible_cleared = all_visible_cleared;
+		xlrec.flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0;
 		xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
 											  tp.t_data->t_infomask2);
 		xlrec.target.node = relation->rd_node;
@@ -2803,6 +2886,34 @@ l1:
 		rdata[1].buffer_std = true;
 		rdata[1].next = NULL;
 
+		/*
+		 * Log primary key of the deleted tuple
+		 */
+		if (need_tuple_data && idx_tuple != NULL)
+		{
+			xl_heap_header xlhdr;
+
+			xlhdr.t_infomask2 = idx_tuple->t_data->t_infomask2;
+			xlhdr.t_infomask = idx_tuple->t_data->t_infomask;
+			xlhdr.t_hoff = idx_tuple->t_data->t_hoff;
+
+			rdata[1].next = &(rdata[2]);
+			rdata[2].data = (char*)&xlhdr;
+			rdata[2].len = SizeOfHeapHeader;
+			rdata[2].buffer = InvalidBuffer;
+			rdata[2].next = NULL;
+
+			rdata[2].next = &(rdata[3]);
+			rdata[3].data = (char *) idx_tuple->t_data
+				+ offsetof(HeapTupleHeaderData, t_bits);
+			rdata[3].len = idx_tuple->t_len
+				- offsetof(HeapTupleHeaderData, t_bits);
+			rdata[3].buffer = InvalidBuffer;
+			rdata[3].next = NULL;
+
+			xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY;
+		}
+
 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE, rdata);
 
 		PageSetLSN(page, recptr);
@@ -2932,9 +3043,11 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	TransactionId xid = GetCurrentTransactionId();
 	Bitmapset  *hot_attrs;
 	Bitmapset  *key_attrs;
+	Bitmapset  *ckey_attrs;
 	ItemId		lp;
 	HeapTupleData oldtup;
 	HeapTuple	heaptup;
+	HeapTuple	old_idx_tuple = NULL;
 	Page		page;
 	BlockNumber block;
 	MultiXactStatus mxact_status;
@@ -2950,6 +3063,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	bool		iscombo;
 	bool		satisfies_hot;
 	bool		satisfies_key;
+	bool		satisfies_ckey;
 	bool		use_hot_update = false;
 	bool		key_intact;
 	bool		all_visible_cleared = false;
@@ -2977,8 +3091,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	 * Note that we get a copy here, so we need not worry about relcache flush
 	 * happening midway through.
 	 */
-	hot_attrs = RelationGetIndexAttrBitmap(relation, false);
-	key_attrs = RelationGetIndexAttrBitmap(relation, true);
+	hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
+	key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
+	ckey_attrs = RelationGetIndexAttrBitmap(relation,
+										   INDEX_ATTR_BITMAP_CANDIDATE_KEY);
 
 	block = ItemPointerGetBlockNumber(otid);
 	buffer = ReadBuffer(relation, block);
@@ -3036,9 +3152,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	 * is updates that don't manipulate key columns, not those that
 	 * serendipitiously arrive at the same key values.
 	 */
-	HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs,
+	HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs, ckey_attrs,
 								 &satisfies_hot, &satisfies_key,
-								 &oldtup, newtup);
+								 &satisfies_ckey, &oldtup, newtup);
 	if (satisfies_key)
 	{
 		*lockmode = LockTupleNoKeyExclusive;
@@ -3508,6 +3624,12 @@ l2:
 		PageSetFull(page);
 	}
 
+	/* compute the old key tuple for logical logging */
+	if (!satisfies_ckey && RelationIsLogicallyLogged(relation))
+	{
+		old_idx_tuple = ExtractKeyTuple(relation, &oldtup);
+	}
+
 	/* NO EREPORT(ERROR) from here till changes are logged */
 	START_CRIT_SECTION();
 
@@ -3583,11 +3705,20 @@ l2:
 	/* XLOG stuff */
 	if (RelationNeedsWAL(relation))
 	{
-		XLogRecPtr	recptr = log_heap_update(relation, buffer,
-											 newbuf, &oldtup, heaptup,
-											 all_visible_cleared,
-											 all_visible_cleared_new);
+		XLogRecPtr	recptr;
+
+		/* For logical decoding we need combocids to properly decode the catalog */
+		if (RelationIsDoingTimetravel(relation))
+		{
+			log_heap_new_cid(relation, &oldtup);
+			log_heap_new_cid(relation, heaptup);
+		}
 
+		recptr = log_heap_update(relation, buffer,
+								 newbuf, &oldtup, heaptup,
+								 old_idx_tuple,
+								 all_visible_cleared,
+								 all_visible_cleared_new);
 		if (newbuf != buffer)
 		{
 			PageSetLSN(BufferGetPage(newbuf), recptr);
@@ -3739,18 +3870,23 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
  * modify columns used in the key.
  */
 static void
-HeapSatisfiesHOTandKeyUpdate(Relation relation,
-							 Bitmapset *hot_attrs, Bitmapset *key_attrs,
+HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs,
+							 Bitmapset *key_attrs, Bitmapset *ckey_attrs,
 							 bool *satisfies_hot, bool *satisfies_key,
+							 bool *satisfies_ckey,
 							 HeapTuple oldtup, HeapTuple newtup)
 {
 	int			next_hot_attnum;
 	int			next_key_attnum;
+	int			next_ckey_attnum;
 	bool		hot_result = true;
 	bool		key_result = true;
-	bool		key_done = false;
+	bool		ckey_result = true;
 	bool		hot_done = false;
 
+	Assert(bms_is_subset(ckey_attrs, key_attrs));
+	Assert(bms_is_subset(key_attrs, hot_attrs));
+
 	next_hot_attnum = bms_first_member(hot_attrs);
 	if (next_hot_attnum == -1)
 		hot_done = true;
@@ -3759,28 +3895,25 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation,
 		next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
 
 	next_key_attnum = bms_first_member(key_attrs);
-	if (next_key_attnum == -1)
-		key_done = true;
-	else
+	if (next_key_attnum != -1)
 		/* Adjust for system attributes */
 		next_key_attnum += FirstLowInvalidHeapAttributeNumber;
 
+	next_ckey_attnum = bms_first_member(ckey_attrs);
+	if (next_ckey_attnum != -1)
+		/* Adjust for system attributes */
+		next_ckey_attnum += FirstLowInvalidHeapAttributeNumber;
+
 	for (;;)
 	{
 		int			check_now;
 		bool		changed;
 
-		/* both bitmapsets are now empty */
-		if (key_done && hot_done)
+		/* bitmapsets are now empty, hot includes others */
+		if (hot_done)
 			break;
 
-		/* XXX there's probably an easier way ... */
-		if (hot_done)
-			check_now = next_key_attnum;
-		if (key_done)
-			check_now = next_hot_attnum;
-		else
-			check_now = Min(next_hot_attnum, next_key_attnum);
+		check_now = next_hot_attnum;
 
 		changed = !heap_tuple_attr_equals(RelationGetDescr(relation),
 										  check_now, oldtup, newtup);
@@ -3790,11 +3923,15 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation,
 				hot_result = false;
 			if (check_now == next_key_attnum)
 				key_result = false;
+			if (check_now == next_ckey_attnum)
+				ckey_result = false;
 		}
 
 		/* if both are false now, we can stop checking */
-		if (!hot_result && !key_result)
+		if (!hot_result && !key_result && !ckey_result)
+		{
 			break;
+		}
 
 		if (check_now == next_hot_attnum)
 		{
@@ -3808,16 +3945,22 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation,
 		if (check_now == next_key_attnum)
 		{
 			next_key_attnum = bms_first_member(key_attrs);
-			if (next_key_attnum == -1)
-				key_done = true;
-			else
+			if (next_key_attnum != -1)
 				/* Adjust for system attributes */
 				next_key_attnum += FirstLowInvalidHeapAttributeNumber;
 		}
+		if (check_now == next_ckey_attnum)
+		{
+			next_ckey_attnum = bms_first_member(ckey_attrs);
+			if (next_ckey_attnum != -1)
+				/* Adjust for system attributes */
+				next_ckey_attnum += FirstLowInvalidHeapAttributeNumber;
+		}
 	}
 
 	*satisfies_hot = hot_result;
 	*satisfies_key = key_result;
+	*satisfies_ckey = ckey_result;
 }
 
 /*
@@ -5839,15 +5982,22 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
 static XLogRecPtr
 log_heap_update(Relation reln, Buffer oldbuf,
 				Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
+				HeapTuple idx_tuple,
 				bool all_visible_cleared, bool new_all_visible_cleared)
 {
 	xl_heap_update xlrec;
-	xl_heap_header xlhdr;
+	xl_heap_header_len xlhdr;
+	xl_heap_header_len xlhdr_idx;
 	uint8		info;
 	XLogRecPtr	recptr;
-	XLogRecData rdata[4];
+	XLogRecData rdata[7];
 	Page		page = BufferGetPage(newbuf);
 
+	/*
+	 * Just as for XLOG_HEAP_INSERT, we need to log the tuple data itself.
+	 */
+	bool        need_tuple_data = RelationIsLogicallyLogged(reln);
+
 	/* Caller should not call me on a non-WAL-logged relation */
 	Assert(RelationNeedsWAL(reln));
 
@@ -5862,9 +6012,12 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
 											  oldtup->t_data->t_infomask2);
 	xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
-	xlrec.all_visible_cleared = all_visible_cleared;
+	xlrec.flags = 0;
+	if (all_visible_cleared)
+		xlrec.flags |= XLOG_HEAP_ALL_VISIBLE_CLEARED;
 	xlrec.newtid = newtup->t_self;
-	xlrec.new_all_visible_cleared = new_all_visible_cleared;
+	if (new_all_visible_cleared)
+		xlrec.flags |= XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED;
 
 	rdata[0].data = (char *) &xlrec;
 	rdata[0].len = SizeOfHeapUpdate;
@@ -5877,33 +6030,78 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	rdata[1].buffer_std = true;
 	rdata[1].next = &(rdata[2]);
 
-	xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
-	xlhdr.t_infomask = newtup->t_data->t_infomask;
-	xlhdr.t_hoff = newtup->t_data->t_hoff;
+	xlhdr.header.t_infomask2 = newtup->t_data->t_infomask2;
+	xlhdr.header.t_infomask = newtup->t_data->t_infomask;
+	xlhdr.header.t_hoff = newtup->t_data->t_hoff;
+	xlhdr.t_len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
 
-	/*
-	 * As with insert records, we need not store the rdata[2] segment if we
-	 * decide to store the whole buffer instead.
-	 */
 	rdata[2].data = (char *) &xlhdr;
-	rdata[2].len = SizeOfHeapHeader;
-	rdata[2].buffer = newbuf;
+	rdata[2].len = SizeOfHeapHeaderLen;
+	rdata[2].buffer = need_tuple_data ? InvalidBuffer : newbuf;
 	rdata[2].buffer_std = true;
 	rdata[2].next = &(rdata[3]);
 
 	/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
-	rdata[3].data = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits);
+	rdata[3].data = (char *) newtup->t_data
+		+ offsetof(HeapTupleHeaderData, t_bits);
 	rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
-	rdata[3].buffer = newbuf;
+	rdata[3].buffer = need_tuple_data ? InvalidBuffer : newbuf;
 	rdata[3].buffer_std = true;
 	rdata[3].next = NULL;
 
+	/*
+	 * Separate storage for the buffer reference of the new page in the
+	 * wal_level >= logical case.
+	 */
+	if (need_tuple_data)
+	{
+		rdata[3].next = &(rdata[4]);
+
+		rdata[4].data = NULL;
+		rdata[4].len = 0;
+		rdata[4].buffer = newbuf;
+		rdata[4].buffer_std = true;
+		rdata[4].next = NULL;
+		xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE;
+
+		/* candidate key changed and we have a candidate key */
+		if (idx_tuple)
+		{
+			/* don't really need this, but it's more convenient */
+			xlhdr_idx.header.t_infomask2 = idx_tuple->t_data->t_infomask2;
+			xlhdr_idx.header.t_infomask = idx_tuple->t_data->t_infomask;
+			xlhdr_idx.header.t_hoff = idx_tuple->t_data->t_hoff;
+			xlhdr_idx.t_len = idx_tuple->t_len;
+
+			rdata[4].next = &(rdata[5]);
+			rdata[5].data = (char *) &xlhdr_idx;
+			rdata[5].len = SizeOfHeapHeaderLen;
+			rdata[5].buffer = InvalidBuffer;
+			rdata[5].next = &(rdata[6]);
+
+			/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
+			rdata[6].data = (char *) idx_tuple->t_data
+				+ offsetof(HeapTupleHeaderData, t_bits);
+			rdata[6].len = idx_tuple->t_len
+				- offsetof(HeapTupleHeaderData, t_bits);
+			rdata[6].buffer = InvalidBuffer;
+			rdata[6].next = NULL;
+
+			xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY;
+		}
+	}
+
 	/* If new tuple is the single and first tuple on page... */
 	if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
 		PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
 	{
+		XLogRecData *rcur = &rdata[0];
 		info |= XLOG_HEAP_INIT_PAGE;
-		rdata[2].buffer = rdata[3].buffer = InvalidBuffer;
+		while (rcur != NULL)
+		{
+			rcur->buffer = InvalidBuffer;
+			rcur = rcur->next;
+		}
 	}
 
 	recptr = XLogInsert(RM_HEAP_ID, info, rdata);
@@ -6010,6 +6208,112 @@ log_newpage_buffer(Buffer buffer)
 }
 
 /*
+ * Perform XLogInsert of a XLOG_HEAP2_NEW_CID record
+ *
+ * This is only used in wal_level >= WAL_LEVEL_LOGICAL
+ */
+static XLogRecPtr
+log_heap_new_cid(Relation relation, HeapTuple tup)
+{
+	xl_heap_new_cid xlrec;
+
+	XLogRecPtr	recptr;
+	XLogRecData rdata[1];
+	HeapTupleHeader hdr = tup->t_data;
+
+	Assert(ItemPointerIsValid(&tup->t_self));
+	Assert(tup->t_tableOid != InvalidOid);
+
+	xlrec.top_xid = GetTopTransactionId();
+	xlrec.target.node = relation->rd_node;
+	xlrec.target.tid = tup->t_self;
+
+	/*
+	 * If the tuple got inserted & deleted in the same TX we definitely have a
+	 * combocid; set both cmin and cmax.
+	 */
+	if (hdr->t_infomask & HEAP_COMBOCID)
+	{
+		xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
+		xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
+		xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
+	}
+	/* No combocid, so only cmin or cmax can be set by this TX */
+	else
+	{
+		/* tuple inserted */
+		if (hdr->t_infomask & HEAP_XMAX_INVALID)
+		{
+			xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
+			xlrec.cmax = InvalidCommandId;
+		}
+		/* tuple inserted by a different tx, updated or deleted by this one */
+		else
+		{
+			xlrec.cmin = InvalidCommandId;
+			xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
+
+		}
+		xlrec.combocid = InvalidCommandId;
+	}
+
+	rdata[0].data = (char *) &xlrec;
+	rdata[0].len = SizeOfHeapNewCid;
+	rdata[0].buffer = InvalidBuffer;
+	rdata[0].next = NULL;
+
+	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID, rdata);
+
+	return recptr;
+}
+
+static HeapTuple
+ExtractKeyTuple(Relation relation, HeapTuple tp)
+{
+	HeapTuple idx_tuple = NULL;
+	TupleDesc desc = RelationGetDescr(relation);
+	Relation idx_rel;
+	TupleDesc idx_desc;
+	Datum idx_vals[INDEX_MAX_KEYS];
+	bool idx_isnull[INDEX_MAX_KEYS];
+	int natt;
+
+	/* make sure the index list (and thus rd_primary) has been computed */
+	if (relation->rd_indexvalid == 0)
+		RelationGetIndexList(relation);
+
+	if (!OidIsValid(relation->rd_primary))
+	{
+		elog(DEBUG1, "could not find primary key for table with oid %u",
+			 RelationGetRelid(relation));
+	}
+	else
+	{
+		idx_rel = RelationIdGetRelation(relation->rd_primary);
+		idx_desc = RelationGetDescr(idx_rel);
+
+		for (natt = 0; natt < idx_desc->natts; natt++)
+		{
+			int attno = idx_rel->rd_index->indkey.values[natt];
+			if (attno == ObjectIdAttributeNumber)
+			{
+				idx_vals[natt] = HeapTupleGetOid(tp);
+				idx_isnull[natt] = false;
+			}
+			else
+			{
+				idx_vals[natt] =
+					fastgetattr(tp, attno, desc, &idx_isnull[natt]);
+			}
+			Assert(!idx_isnull[natt]);
+		}
+		idx_tuple = heap_form_tuple(idx_desc, idx_vals, idx_isnull);
+		RelationClose(idx_rel);
+	}
+	return idx_tuple;
+}
+
+/*
  * Handles CLEANUP_INFO
  */
 static void
@@ -6370,7 +6674,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 	 * The visibility map may need to be fixed even if the heap page is
 	 * already up-to-date.
 	 */
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 	{
 		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);
 		Buffer		vmbuffer = InvalidBuffer;
@@ -6419,7 +6723,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 	/* Mark the page as a candidate for pruning */
 	PageSetPrunable(page, record->xl_xid);
 
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 		PageClearAllVisible(page);
 
 	/* Make sure there is no forward chain link in t_ctid */
@@ -6453,7 +6757,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
 	 * The visibility map may need to be fixed even if the heap page is
 	 * already up-to-date.
 	 */
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 	{
 		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);
 		Buffer		vmbuffer = InvalidBuffer;
@@ -6524,7 +6828,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
 
 	PageSetLSN(page, lsn);
 
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 		PageClearAllVisible(page);
 
 	MarkBufferDirty(buffer);
@@ -6587,7 +6891,7 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record)
 	 * The visibility map may need to be fixed even if the heap page is
 	 * already up-to-date.
 	 */
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 	{
 		Relation	reln = CreateFakeRelcacheEntry(xlrec->node);
 		Buffer		vmbuffer = InvalidBuffer;
@@ -6670,7 +6974,7 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record)
 
 	PageSetLSN(page, lsn);
 
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 		PageClearAllVisible(page);
 
 	MarkBufferDirty(buffer);
@@ -6709,7 +7013,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 		HeapTupleHeaderData hdr;
 		char		data[MaxHeapTupleSize];
 	}			tbuf;
-	xl_heap_header xlhdr;
+	xl_heap_header_len xlhdr;
 	int			hsize;
 	uint32		newlen;
 	Size		freespace;
@@ -6718,7 +7022,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 	 * The visibility map may need to be fixed even if the heap page is
 	 * already up-to-date.
 	 */
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 	{
 		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);
 		BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid);
@@ -6796,7 +7100,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 	/* Mark the page as a candidate for pruning */
 	PageSetPrunable(page, record->xl_xid);
 
-	if (xlrec->all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED)
 		PageClearAllVisible(page);
 
 	/*
@@ -6820,7 +7124,7 @@ newt:;
 	 * The visibility map may need to be fixed even if the heap page is
 	 * already up-to-date.
 	 */
-	if (xlrec->new_all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED)
 	{
 		Relation	reln = CreateFakeRelcacheEntry(xlrec->target.node);
 		BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid);
@@ -6878,13 +7182,13 @@ newsame:;
 	if (PageGetMaxOffsetNumber(page) + 1 < offnum)
 		elog(PANIC, "heap_update_redo: invalid max offset number");
 
-	hsize = SizeOfHeapUpdate + SizeOfHeapHeader;
+	hsize = SizeOfHeapUpdate + SizeOfHeapHeaderLen;
 
-	newlen = record->xl_len - hsize;
-	Assert(newlen <= MaxHeapTupleSize);
 	memcpy((char *) &xlhdr,
 		   (char *) xlrec + SizeOfHeapUpdate,
-		   SizeOfHeapHeader);
+		   SizeOfHeapHeaderLen);
+	newlen = xlhdr.t_len;
+	Assert(newlen <= MaxHeapTupleSize);
 	htup = &tbuf.hdr;
 	MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
 	/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
@@ -6892,9 +7196,9 @@ newsame:;
 		   (char *) xlrec + hsize,
 		   newlen);
 	newlen += offsetof(HeapTupleHeaderData, t_bits);
-	htup->t_infomask2 = xlhdr.t_infomask2;
-	htup->t_infomask = xlhdr.t_infomask;
-	htup->t_hoff = xlhdr.t_hoff;
+	htup->t_infomask2 = xlhdr.header.t_infomask2;
+	htup->t_infomask = xlhdr.header.t_infomask;
+	htup->t_hoff = xlhdr.header.t_hoff;
 
 	HeapTupleHeaderSetXmin(htup, record->xl_xid);
 	HeapTupleHeaderSetCmin(htup, FirstCommandId);
@@ -6906,7 +7210,7 @@ newsame:;
 	if (offnum == InvalidOffsetNumber)
 		elog(PANIC, "heap_update_redo: failed to add tuple");
 
-	if (xlrec->new_all_visible_cleared)
+	if (xlrec->flags & XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED)
 		PageClearAllVisible(page);
 
 	freespace = PageGetHeapFreeSpace(page);		/* needed to update FSM below */
@@ -7157,6 +7461,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
 		case XLOG_HEAP2_LOCK_UPDATED:
 			heap_xlog_lock_updated(lsn, record);
 			break;
+		case XLOG_HEAP2_NEW_CID:
+			/* nothing to do on a real replay, only during logical decoding */
+			break;
 		default:
 			elog(PANIC, "heap2_redo: unknown op code %u", info);
 	}
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 3ec10a0..7fe9f32 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -75,6 +75,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
 	Page		page = BufferGetPage(buffer);
 	Size		minfree;
 
+	Assert(TransactionIdIsValid(OldestXmin));
+
 	/*
 	 * Let's see if we really need pruning.
 	 *
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index b878155..3bac4a5 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -67,7 +67,10 @@
 
 #include "access/relscan.h"
 #include "access/transam.h"
+#include "access/xlog.h"
+
 #include "catalog/index.h"
+#include "catalog/catalog.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
@@ -520,8 +523,15 @@ index_fetch_heap(IndexScanDesc scan)
 		 * Prune page, but only if we weren't already on this page
 		 */
 		if (prev_buf != scan->xs_cbuf)
-			heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
-								RecentGlobalXmin);
+		{
+			if (IsSystemRelation(scan->heapRelation)
+				|| RelationIsDoingTimetravel(scan->heapRelation))
+				heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
+									RecentGlobalXmin);
+			else
+				heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
+									RecentGlobalDataXmin);
+		}
 	}
 
 	/* Obtain share-lock on the buffer so we can examine visibility */
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index bc8b985..c750fef 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -184,6 +184,15 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
 						 xlrec->infobits_set);
 		out_target(buf, &(xlrec->target));
 	}
+	else if (info == XLOG_HEAP2_NEW_CID)
+	{
+		xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec;
+
+		appendStringInfo(buf, "new_cid: ");
+		out_target(buf, &(xlrec->target));
+		appendStringInfo(buf, "; cmin: %u, cmax: %u, combo: %u",
+						 xlrec->cmin, xlrec->cmax, xlrec->combocid);
+	}
 	else
 		appendStringInfo(buf, "UNKNOWN");
 }
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 1b36f9a..e0900e2 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -28,6 +28,7 @@ const struct config_enum_entry wal_level_options[] = {
 	{"minimal", WAL_LEVEL_MINIMAL, false},
 	{"archive", WAL_LEVEL_ARCHIVE, false},
 	{"hot_standby", WAL_LEVEL_HOT_STANDBY, false},
+	{"logical", WAL_LEVEL_LOGICAL, false},
 	{NULL, 0, false}
 };
 
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index e975f8d..d46a50e 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -47,6 +47,7 @@
 #include "access/twophase.h"
 #include "access/twophase_rmgr.h"
 #include "access/xact.h"
+#include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "catalog/pg_type.h"
 #include "catalog/storage.h"
@@ -1920,7 +1921,8 @@ RecoverPreparedTransactions(void)
 			 * the prepared transaction generated xid assignment records. Test
 			 * here must match one used in AssignTransactionId().
 			 */
-			if (InHotStandby && hdr->nsubxacts >= PGPROC_MAX_CACHED_SUBXIDS)
+			if (InHotStandby && (hdr->nsubxacts >= PGPROC_MAX_CACHED_SUBXIDS ||
+			                     XLogLogicalInfoActive()))
 				overwriteOK = true;
 
 			/*
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 0591f3f..b937ffe 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -146,6 +146,7 @@ typedef struct TransactionStateData
 	int			prevSecContext; /* previous SecurityRestrictionContext */
 	bool		prevXactReadOnly;		/* entry-time xact r/o state */
 	bool		startedInRecovery;		/* did we start in recovery? */
+	bool		guaranteedlyLogged;		/* has xid been logged? */
 	struct TransactionStateData *parent;		/* back link to parent */
 } TransactionStateData;
 
@@ -175,6 +176,7 @@ static TransactionStateData TopTransactionStateData = {
 	0,							/* previous SecurityRestrictionContext */
 	false,						/* entry-time xact r/o state */
 	false,						/* startedInRecovery */
+	false,						/* guaranteedlyLogged */
 	NULL						/* link to parent state block */
 };
 
@@ -391,6 +393,21 @@ GetCurrentTransactionIdIfAny(void)
 }
 
 /*
+ *	MarkCurrentTransactionIdLoggedIfAny
+ *
+ * Remember that the current xid - if it is assigned - now has been wal logged.
+ */
+void
+MarkCurrentTransactionIdLoggedIfAny(void)
+{
+	if (TransactionIdIsValid(CurrentTransactionState->transactionId))
+	{
+		CurrentTransactionState->guaranteedlyLogged = true;
+	}
+}
+
+
+/*
  *	GetStableLatestTransactionId
  *
  * Get the transaction's XID if it has one, else read the next-to-be-assigned
@@ -431,6 +448,7 @@ AssignTransactionId(TransactionState s)
 {
 	bool		isSubXact = (s->parent != NULL);
 	ResourceOwner currentOwner;
+	bool log_unknown_top = false;
 
 	/* Assert that caller didn't screw up */
 	Assert(!TransactionIdIsValid(s->transactionId));
@@ -438,7 +456,7 @@ AssignTransactionId(TransactionState s)
 
 	/*
 	 * Ensure parent(s) have XIDs, so that a child always has an XID later
-	 * than its parent.  Musn't recurse here, or we might get a stack overflow
+	 * than its parent.  Mustn't recurse here, or we might get a stack overflow
 	 * if we're at the bottom of a huge stack of subtransactions none of which
 	 * have XIDs yet.
 	 */
@@ -455,6 +473,8 @@ AssignTransactionId(TransactionState s)
 			p = p->parent;
 		}
 
+		Assert(parentOffset);
+
 		/*
 		 * This is technically a recursive call, but the recursion will never
 		 * be more than one layer deep.
@@ -466,6 +486,21 @@ AssignTransactionId(TransactionState s)
 	}
 
 	/*
+	 * When wal_level=logical, guarantee that a subtransaction's xid can only
+	 * be seen in the WAL stream if its toplevel xid has been logged before. If
+	 * necessary we log a xact_assignment record with fewer than
+	 * necessary we log an xact_assignment record with fewer than
+	 * PGPROC_MAX_CACHED_SUBXIDS subxids. Note that it is fine if guaranteedlyLogged
+	 * we'll just superfluously log something.
+	 */
+	if (isSubXact && XLogLogicalInfoActive() &&
+		!TopTransactionStateData.guaranteedlyLogged)
+	{
+		log_unknown_top = true;
+	}
+
+
+	/*
 	 * Generate a new Xid and record it in PG_PROC and pg_subtrans.
 	 *
 	 * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in
@@ -519,6 +554,9 @@ AssignTransactionId(TransactionState s)
 	 * top-level transaction that each subxact belongs to. This is correct in
 	 * recovery only because aborted subtransactions are separately WAL
 	 * logged.
+	 *
+	 * This is correct even for the case where several levels above us didn't
+	 * have an xid assigned as we recursed up to them beforehand.
 	 */
 	if (isSubXact && XLogStandbyInfoActive())
 	{
@@ -529,7 +567,8 @@ AssignTransactionId(TransactionState s)
 		 * ensure this test matches similar one in
 		 * RecoverPreparedTransactions()
 		 */
-		if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS)
+		if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS ||
+		    log_unknown_top)
 		{
 			XLogRecData rdata[2];
 			xl_xact_assignment xlrec;
@@ -548,13 +587,15 @@ AssignTransactionId(TransactionState s)
 			rdata[0].next = &rdata[1];
 
 			rdata[1].data = (char *) unreportedXids;
-			rdata[1].len = PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId);
+			rdata[1].len = nUnreportedXids * sizeof(TransactionId);
 			rdata[1].buffer = InvalidBuffer;
 			rdata[1].next = NULL;
 
 			(void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, rdata);
 
 			nUnreportedXids = 0;
+			/* mark top, not current xact as having been logged */
+			TopTransactionStateData.guaranteedlyLogged = true;
 		}
 	}
 }
@@ -1733,6 +1774,7 @@ StartTransaction(void)
 	 * initialize reported xid accounting
 	 */
 	nUnreportedXids = 0;
+	s->guaranteedlyLogged = false;
 
 	/*
 	 * must initialize resource-management stuff first
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index fc495d6..fbb505d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -41,6 +41,7 @@
 #include "postmaster/startup.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
+#include "replication/logical.h"
 #include "storage/barrier.h"
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
@@ -1191,6 +1192,8 @@ begin:;
 	 */
 	WALInsertSlotRelease();
 
+	MarkCurrentTransactionIdLoggedIfAny();
+
 	END_CRIT_SECTION();
 
 	/*
@@ -6332,6 +6335,13 @@ StartupXLOG(void)
 	XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
 	XLogCtl->ckptXid = checkPoint.nextXid;
 
+
+	/*
+	 * Start up logical decoding state; it needs to be set up now so we have
+	 * proper data during restore. XXX
+	 */
+	StartupLogicalReplication(checkPoint.redo);
+
 	/*
 	 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
 	 * control file. On recovery, all unlogged relations are blown away, so
@@ -8312,7 +8322,7 @@ CreateCheckPoint(int flags)
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
 	if (!RecoveryInProgress())
-		TruncateSUBTRANS(GetOldestXmin(true, false));
+		TruncateSUBTRANS(GetOldestXmin(true, true, false, false));
 
 	/* Real work is done, but log and update stats before releasing lock. */
 	LogCheckpointEnd(false);
@@ -8672,7 +8682,7 @@ CreateRestartPoint(int flags)
 	 * this because StartupSUBTRANS hasn't been called yet.
 	 */
 	if (EnableHotStandby)
-		TruncateSUBTRANS(GetOldestXmin(true, false));
+		TruncateSUBTRANS(GetOldestXmin(true, true, false, false));
 
 	/* Real work is done, but log and update before releasing lock. */
 	LogCheckpointEnd(true);
diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c
index c1287a7..0d4cfcb 100644
--- a/src/backend/catalog/catalog.c
+++ b/src/backend/catalog/catalog.c
@@ -106,7 +106,6 @@ GetDatabasePath(Oid dbNode, Oid spcNode)
 	return path;
 }
 
-
 /*
  * IsSystemRelation
  *		True iff the relation is a system catalog relation.
@@ -123,8 +122,17 @@ GetDatabasePath(Oid dbNode, Oid spcNode)
 bool
 IsSystemRelation(Relation relation)
 {
-	return IsSystemNamespace(RelationGetNamespace(relation)) ||
-		IsToastNamespace(RelationGetNamespace(relation));
+	return IsSystemRelationId(RelationGetRelid(relation));
+}
+
+/*
+ * IsSystemRelationId
+ *		True iff the relation is a system catalog relation.
+ */
+bool
+IsSystemRelationId(Oid relid)
+{
+	return relid < FirstNormalObjectId;
 }
 
 /*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index b73ee4f..49ea38b 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2174,9 +2174,20 @@ IndexBuildHeapScan(Relation heapRelation,
 	}
 	else
 	{
+		/*
+		 * We can ignore a) pegged xmins b) shared relations if we don't scan
+		 * something acting as a catalog.
+		 */
+		bool include_systables =
+			IsSystemRelation(heapRelation) ||
+			RelationIsDoingTimetravel(heapRelation);
+
 		snapshot = SnapshotAny;
 		/* okay to ignore lazy VACUUMs here */
-		OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared, true);
+		OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared,
+								   include_systables,
+								   true,
+								   false);
 	}
 
 	scan = heap_beginscan_strat(heapRelation,	/* relation */
@@ -3340,7 +3351,7 @@ reindex_relation(Oid relid, int flags)
 
 	/* Ensure rd_indexattr is valid; see comments for RelationSetIndexList */
 	if (is_pg_class)
-		(void) RelationGetIndexAttrBitmap(rel, false);
+		(void) RelationGetIndexAttrBitmap(rel, INDEX_ATTR_BITMAP_ALL);
 
 	PG_TRY();
 	{
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 575a40f..2acaf54 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -613,6 +613,16 @@ CREATE VIEW pg_stat_replication AS
     WHERE S.usesysid = U.oid AND
             S.pid = W.pid;
 
+CREATE VIEW pg_stat_logical_decoding AS
+    SELECT
+            L.slot_name,
+            L.plugin,
+            L.database,
+            L.active,
+            L.xmin,
+            L.restart_decoding_lsn
+    FROM pg_stat_get_logical_decoding_slots() AS L;
+
 CREATE VIEW pg_stat_database AS
     SELECT
             D.oid AS datid,
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 9845b0b..7a05cea 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1081,7 +1081,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 	totalblocks = RelationGetNumberOfBlocks(onerel);
 
 	/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
-	OldestXmin = GetOldestXmin(onerel->rd_rel->relisshared, true);
+	OldestXmin = GetOldestXmin(onerel->rd_rel->relisshared, true, true, false);
 
 	/* Prepare for sampling block numbers */
 	BlockSampler_Init(&bs, totalblocks, targrows);
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index f6a5bfe..76b2904 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -859,6 +859,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 	 */
 	vacuum_set_xid_limits(freeze_min_age, freeze_table_age,
 						  OldHeap->rd_rel->relisshared,
+						  IsSystemRelation(OldHeap)
+						  || RelationIsDoingTimetravel(OldHeap),
 						  &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff);
 
 	/*
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index d86e9ad..912f7a8 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -2355,7 +2355,8 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
 	 * concurrency.
 	 */
 	modifiedCols = GetModifiedColumns(relinfo, estate);
-	keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc, true);
+	keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc,
+										 INDEX_ATTR_BITMAP_KEY);
 	if (bms_overlap(keyCols, modifiedCols))
 		lockmode = LockTupleExclusive;
 	else
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 27aea73..3528c27 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -381,6 +381,7 @@ void
 vacuum_set_xid_limits(int freeze_min_age,
 					  int freeze_table_age,
 					  bool sharedRel,
+					  bool catalogRel,
 					  TransactionId *oldestXmin,
 					  TransactionId *freezeLimit,
 					  TransactionId *freezeTableLimit,
@@ -399,7 +400,7 @@ vacuum_set_xid_limits(int freeze_min_age,
 	 * working on a particular table at any time, and that each vacuum is
 	 * always an independent transaction.
 	 */
-	*oldestXmin = GetOldestXmin(sharedRel, true);
+	*oldestXmin = GetOldestXmin(sharedRel, catalogRel, true, false);
 
 	Assert(TransactionIdIsNormal(*oldestXmin));
 
@@ -720,7 +721,7 @@ vac_update_datfrozenxid(void)
 	 * committed pg_class entries for new tables; see AddNewRelationTuple().
 	 * So we cannot produce a wrong minimum by starting with this.
 	 */
-	newFrozenXid = GetOldestXmin(true, true);
+	newFrozenXid = GetOldestXmin(true, true, true, false);
 
 	/*
 	 * Similarly, initialize the MultiXact "min" with the value that would be
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index bb4e03e..3e90a1a 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -44,6 +44,7 @@
 #include "access/multixact.h"
 #include "access/transam.h"
 #include "access/visibilitymap.h"
+#include "catalog/catalog.h"
 #include "catalog/storage.h"
 #include "commands/dbcommands.h"
 #include "commands/vacuum.h"
@@ -202,6 +203,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
 
 	vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
 						  onerel->rd_rel->relisshared,
+						  IsSystemRelation(onerel)
+						  || RelationIsDoingTimetravel(onerel),
 						  &OldestXmin, &FreezeLimit, &freezeTableLimit,
 						  &MultiXactCutoff);
 	scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index a31b01d..8a52cdc 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -818,7 +818,7 @@ PostmasterMain(int argc, char *argv[])
 				(errmsg("WAL archival (archive_mode=on) requires wal_level \"archive\" or \"hot_standby\"")));
 	if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL)
 		ereport(ERROR,
-				(errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"archive\" or \"hot_standby\"")));
+				(errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"archive\", \"logical\" or \"hot_standby\"")));
 
 	/*
 	 * Other one-time internal sanity checks can go here, if they are fast.
diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile
index 2dde011..2e13e27 100644
--- a/src/backend/replication/Makefile
+++ b/src/backend/replication/Makefile
@@ -17,6 +17,8 @@ override CPPFLAGS := -I$(srcdir) $(CPPFLAGS)
 OBJS = walsender.o walreceiverfuncs.o walreceiver.o basebackup.o \
 	repl_gram.o syncrep.o
 
+SUBDIRS = logical
+
 include $(top_srcdir)/src/backend/common.mk
 
 # repl_scanner is compiled as part of repl_gram
diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile
new file mode 100644
index 0000000..310a45c
--- /dev/null
+++ b/src/backend/replication/logical/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for src/backend/replication/logical
+#
+# IDENTIFICATION
+#    src/backend/replication/logical/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/replication/logical
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -I$(srcdir) $(CPPFLAGS)
+
+OBJS = decode.o logical.o logicalfuncs.o reorderbuffer.o snapbuild.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
new file mode 100644
index 0000000..53043b9
--- /dev/null
+++ b/src/backend/replication/logical/decode.c
@@ -0,0 +1,687 @@
+/*-------------------------------------------------------------------------
+ *
+ * decode.c
+ *		Decodes WAL records read via xlogreader.h and feeds them into a
+ *		reorderbuffer, while simultaneously letting snapbuild.c build the
+ *		snapshots needed to decode them.
+ *
+ * NOTE:
+ * 		This basically tries to handle all low level xlog stuff for
+ *      reorderbuffer.c and snapbuild.c. There's some minor leakage where a
+ *      specific record's struct is used to pass data along, but that's just
+ *      because those are convenient and uncomplicated to read.
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/decode.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+
+#include "access/heapam.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "access/xlogreader.h"
+
+#include "catalog/pg_control.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "storage/standby.h"
+
+/* RMGR Handlers */
+static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+
+/* handlers for individual record types (or groups of them) */
+static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
+						 TransactionId xid, int nsubxacts, TransactionId *sub_xids,
+						 int ninval_msgs, SharedInvalidationMessage *msg);
+static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn,
+			TransactionId xid, TransactionId *sub_xids, int nsubxacts,
+			bool was_commit);
+
+/* common function to decode tuples */
+static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup);
+
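+/*
+ * Entry point for this module: take a single WAL record that the caller has
+ * already read and dispatch it to the rmgr-specific handler below, which in
+ * turn feeds reorderbuffer.c and snapbuild.c.
+ */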
+void
+DecodeRecordIntoReorderBuffer(LogicalDecodingContext *ctx,
+							  XLogRecordBuffer *buf)
+{
+	/* cast so we get a warning when new rmgrs are added */
+	switch ((RmgrIds) buf->record.xl_rmid)
+	{
+		case RM_XLOG_ID:
+			DecodeXLogOp(ctx, buf);
+			break;
+
+		case RM_XACT_ID:
+			DecodeXactOp(ctx, buf);
+			break;
+
+		case RM_STANDBY_ID:
+			DecodeStandbyOp(ctx, buf);
+			break;
+
+		case RM_HEAP_ID:
+			DecodeHeapOp(ctx, buf);
+			break;
+
+		case RM_HEAP2_ID:
+			DecodeHeap2Op(ctx, buf);
+			break;
+
+		/* irrelevant for changeset extraction */
+		case RM_SMGR_ID:
+		case RM_CLOG_ID:
+		case RM_DBASE_ID:
+		case RM_TBLSPC_ID:
+		case RM_MULTIXACT_ID:
+		case RM_RELMAP_ID:
+		case RM_BTREE_ID:
+		case RM_HASH_ID:
+		case RM_GIN_ID:
+		case RM_GIST_ID:
+		case RM_SEQ_ID:
+		case RM_SPGIST_ID:
+			break;
+		case RM_NEXT_ID:
+			elog(ERROR, "unexpected NEXT_ID record");
+	}
+}
+
+static void
+DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	SnapBuild  	   *builder = ctx->snapshot_builder;
+	ReorderBuffer  *reorder = ctx->reorder;
+	XLogRecord	   *r = &buf->record;
+
+	/* no point in doing anything yet */
+	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+		return;
+
+	switch (r->xl_info & ~XLR_INFO_MASK)
+	{
+		case XLOG_XACT_COMMIT:
+			{
+				xl_xact_commit *xlrec;
+				TransactionId *subxacts = NULL;
+				SharedInvalidationMessage *invals = NULL;
+
+				xlrec = (xl_xact_commit *) buf->record_data;
+
+				subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+				invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]);
+
+				/* FIXME: skip if wrong db? */
+
+				DecodeCommit(ctx, buf, r->xl_xid, xlrec->nsubxacts, subxacts,
+							 xlrec->nmsgs, invals);
+
+				break;
+			}
+		case XLOG_XACT_COMMIT_PREPARED:
+			{
+				xl_xact_commit_prepared *prec;
+				xl_xact_commit *xlrec;
+				TransactionId *subxacts;
+				SharedInvalidationMessage *invals = NULL;
+
+
+				prec = (xl_xact_commit_prepared *) buf->record_data;
+				xlrec = &prec->crec;
+
+				subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+				invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]);
+
+				/* FIXME: skip if wrong db? */
+
+				DecodeCommit(ctx, buf, r->xl_xid, xlrec->nsubxacts, subxacts,
+							 xlrec->nmsgs, invals);
+
+				break;
+			}
+		case XLOG_XACT_COMMIT_COMPACT:
+			{
+				xl_xact_commit_compact *xlrec;
+
+#if 0
+				/* FIXME: should we error out? */
+				elog(WARNING, "unexpectedly got compact commit");
+#endif
+				xlrec = (xl_xact_commit_compact *) buf->record_data;
+
+				DecodeCommit(ctx, buf, r->xl_xid,
+							 xlrec->nsubxacts, xlrec->subxacts,
+							 0, NULL);
+				break;
+			}
+		case XLOG_XACT_ABORT:
+			{
+				xl_xact_abort *xlrec;
+				TransactionId *sub_xids;
+
+				xlrec = (xl_xact_abort *) buf->record_data;
+
+				sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+				DecodeAbort(ctx, buf->origptr, r->xl_xid,
+							sub_xids, xlrec->nsubxacts, false);
+				break;
+			}
+		case XLOG_XACT_ABORT_PREPARED:
+			{
+				xl_xact_abort_prepared *prec;
+				xl_xact_abort *xlrec;
+				TransactionId *sub_xids;
+
+				prec = (xl_xact_abort_prepared *) buf->record_data;
+				xlrec = &prec->arec;
+
+				sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+				/* r->xl_xid is committed in a separate record */
+				DecodeAbort(ctx, buf->origptr, prec->xid,
+							sub_xids, xlrec->nsubxacts, false);
+				break;
+			}
+
+		case XLOG_XACT_ASSIGNMENT:
+			{
+				xl_xact_assignment *xlrec;
+				int			i;
+				TransactionId *sub_xid;
+
+				xlrec =	(xl_xact_assignment *) buf->record_data;
+
+				sub_xid = &xlrec->xsub[0];
+
+				for (i = 0; i < xlrec->nsubxacts; i++)
+				{
+					ReorderBufferAssignChild(reorder, xlrec->xtop,
+											 *(sub_xid++), buf->origptr);
+				}
+				break;
+			}
+		case XLOG_XACT_PREPARE:
+
+			/*
+			 * XXX: we could replay the transaction and prepare it
+			 * as well.
+			 */
+			break;
+		default:
+			break;
+	}
+}
+
+static void
+DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	SnapBuild  *builder = ctx->snapshot_builder;
+	XLogRecord	   *r = &buf->record;
+
+	switch (r->xl_info & ~XLR_INFO_MASK)
+	{
+		case XLOG_RUNNING_XACTS:
+			SnapBuildProcessRunningXacts(builder, buf->origptr,
+										 (xl_running_xacts *) buf->record_data);
+			break;
+		case XLOG_STANDBY_LOCK:
+			break;
+		default:
+			elog(ERROR, "unexpected standby record type");
+	}
+}
+
+static void
+DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	SnapBuild  *builder = ctx->snapshot_builder;
+
+	switch (buf->record.xl_info & ~XLR_INFO_MASK)
+	{
+		/* this is also used in END_OF_RECOVERY checkpoints */
+		case XLOG_CHECKPOINT_SHUTDOWN:
+		case XLOG_END_OF_RECOVERY:
+			SnapBuildSerializationPoint(builder, buf->origptr);
+
+			/*
+			 * Abort all transactions that are still deemed to be in progress;
+			 * they aren't actually in progress anymore. Do not abort
+			 * transactions that have been prepared for two-phase commit,
+			 * though.
+			 *
+			 * FIXME: implement.
+			 */
+			break;
+		case XLOG_CHECKPOINT_ONLINE:
+			/*
+			 * a RUNNING_XACTS record will have been logged near to this, we
+			 * can restart from there.
+			 */
+			break;
+		case XLOG_NOOP:
+		case XLOG_NEXTOID:
+		case XLOG_SWITCH:
+		case XLOG_BACKUP_END:
+		case XLOG_PARAMETER_CHANGE:
+		case XLOG_RESTORE_POINT:
+		case XLOG_FPW_CHANGE:
+		case XLOG_FPI:
+			break;
+	}
+}
+
+static void
+DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK;
+	TransactionId xid = buf->record.xl_xid;
+	SnapBuild *builder = ctx->snapshot_builder;
+
+	/* no point in doing anything yet */
+	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+		return;
+
+	switch (info)
+	{
+		case XLOG_HEAP_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeInsert(ctx, buf);
+			break;
+
+			/*
+			 * Treat HOT updates as normal updates; there is no useful
+			 * information in the fact that we could make it a HOT update
+			 * locally, and the WAL layout is compatible.
+			 */
+		case XLOG_HEAP_HOT_UPDATE:
+		case XLOG_HEAP_UPDATE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeUpdate(ctx, buf);
+			break;
+
+		case XLOG_HEAP_DELETE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeDelete(ctx, buf);
+			break;
+
+		case XLOG_HEAP_NEWPAGE:
+			/*
+			 * XXX: There doesn't seem to be a use case for decoding
+			 * HEAP_NEWPAGE records. They are only used by various index AMs
+			 * and CLUSTER, neither of which should be relevant for the
+			 * logical changestream.
+			 */
+			break;
+		case XLOG_HEAP_INPLACE:
+			/* cannot be important for our purposes, not part of transaction */
+			if (!TransactionIdIsValid(xid))
+				break;
+
+			SnapBuildProcessChange(builder, xid, buf->origptr);
+			/* heap_inplace is only done in catalog modifying txns */
+			ReorderBufferXidSetTimetravel(ctx->reorder, xid, buf->origptr);
+			break;
+		case XLOG_HEAP_LOCK:
+			break;
+		default:
+			elog(ERROR, "unexpected info value %u", info);
+			break;
+	}
+}
+
+static void
+DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK;
+	TransactionId xid = buf->record.xl_xid;
+	SnapBuild *builder = ctx->snapshot_builder;
+
+	/* no point in doing anything yet */
+	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+		return;
+
+	switch (info)
+	{
+		case XLOG_HEAP2_MULTI_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeMultiInsert(ctx, buf);
+			break;
+		case XLOG_HEAP2_NEW_CID:
+			{
+				xl_heap_new_cid *xlrec;
+				xlrec = (xl_heap_new_cid *) buf->record_data;
+				SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec);
+
+				break;
+			}
+			/*
+			 * everything else here is just low level stuff we're not
+			 * interested in
+			 */
+		case XLOG_HEAP2_FREEZE:
+		case XLOG_HEAP2_CLEAN:
+		case XLOG_HEAP2_CLEANUP_INFO:
+		case XLOG_HEAP2_VISIBLE:
+		case XLOG_HEAP2_LOCK_UPDATED:
+			break;
+		default:
+			elog(ERROR, "unexpected info value %u", info);
+	}
+}
+
+static void
+DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, TransactionId xid,
+			 int nsubxacts, TransactionId *sub_xids,
+			 int ninval_msgs, SharedInvalidationMessage *msgs)
+{
+	int			i;
+
+	/* always need the invalidation messages */
+	if (ninval_msgs > 0)
+	{
+		ReorderBufferAddInvalidations(ctx->reorder, xid, buf->origptr,
+									  ninval_msgs, msgs);
+		ReorderBufferXidSetTimetravel(ctx->reorder, xid, buf->origptr);
+	}
+
+	SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid,
+					   nsubxacts, sub_xids);
+
+	/*
+	 * If we are not interested in anything up to this LSN convert the commit
+	 * into an ABORT to cleanup.
+	 *
+	 * FIXME: this needs to replay invalidations anyway!
+	 */
+	if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr))
+	{
+		DecodeAbort(ctx, buf->origptr, xid,	sub_xids, nsubxacts, true);
+		return;
+	}
+
+	for (i = 0; i < nsubxacts; i++)
+	{
+		ReorderBufferCommitChild(ctx->reorder, xid, *sub_xids,
+								 buf->origptr, buf->endptr);
+		sub_xids++;
+	}
+
+	/* replay actions of all transaction + subtransactions in order */
+	ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr);
+}
+
+static void
+DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+			TransactionId *sub_xids, int nsubxacts, bool was_commit)
+{
+	int			i;
+
+	/*
+	 * This is a bit grotty, but if we're "faking" an abort of a committed
+	 * transaction, DecodeCommit() has already called SnapBuildCommitTxn(),
+	 * so don't run SnapBuildAbortTxn() on top of that.
+	 */
+	if (!was_commit)
+		SnapBuildAbortTxn(ctx->snapshot_builder, xid,
+						  nsubxacts, sub_xids);
+
+	/* FIXME: process invalidations anyway if was_commit */
+
+	for (i = 0; i < nsubxacts; i++)
+	{
+		ReorderBufferAbort(ctx->reorder, *sub_xids, lsn);
+		sub_xids++;
+	}
+
+	ReorderBufferAbort(ctx->reorder, xid, lsn);
+}
+
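+/*
+ * Parse an XLOG_HEAP_INSERT record (but not MULTI_INSERT) into a
+ * ReorderBufferChange and queue it for the record's transaction.
+ */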
+static void
+DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogRecord *r = &buf->record;
+	xl_heap_insert *xlrec;
+	ReorderBufferChange *change;
+
+	xlrec = (xl_heap_insert *) buf->record_data;
+
+	/* XXX: nicer */
+	if (xlrec->target.node.dbNode != ctx->slot->database)
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	change->action = REORDER_BUFFER_CHANGE_INSERT;
+	memcpy(&change->relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+	if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+	{
+		Assert(r->xl_len > (SizeOfHeapInsert + SizeOfHeapHeader));
+
+		change->newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+		DecodeXLogTuple((char *) xlrec + SizeOfHeapInsert,
+						r->xl_len - SizeOfHeapInsert,
+						change->newtuple);
+	}
+
+	ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
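+/*
+ * Parse an XLOG_HEAP_UPDATE or XLOG_HEAP_HOT_UPDATE record into a
+ * ReorderBufferChange, extracting the new tuple and, if it was logged, the
+ * old key tuple.
+ */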
+static void
+DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogRecord *r = &buf->record;
+	xl_heap_update *xlrec;
+	xl_heap_header_len *xlhdr;
+	ReorderBufferChange *change;
+	char	   *data;
+
+	xlrec = (xl_heap_update *) buf->record_data;
+	xlhdr = (xl_heap_header_len *) (buf->record_data + SizeOfHeapUpdate);
+
+	/* XXX: nicer */
+	if (xlrec->target.node.dbNode != ctx->slot->database)
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	change->action = REORDER_BUFFER_CHANGE_UPDATE;
+	memcpy(&change->relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+	data = (char *) &xlhdr->header;
+
+	if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+	{
+		Assert(r->xl_len > (SizeOfHeapUpdate + SizeOfHeapHeaderLen));
+
+		change->newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+		DecodeXLogTuple(data,
+						xlhdr->t_len + SizeOfHeapHeader,
+						change->newtuple);
+		/* skip over the rest of the tuple header */
+		data += SizeOfHeapHeader;
+		/* skip over the tuple data */
+		data += xlhdr->t_len;
+	}
+
+	if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD_KEY)
+	{
+		xlhdr = (xl_heap_header_len *) data;
+		change->oldtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+		DecodeXLogTuple((char *) &xlhdr->header,
+						xlhdr->t_len + SizeOfHeapHeader,
+						change->oldtuple);
+		data = (char *) &xlhdr->header;
+		data += SizeOfHeapHeader;
+		data += xlhdr->t_len;
+	}
+
+	ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
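+/*
+ * Parse an XLOG_HEAP_DELETE record into a ReorderBufferChange, extracting the
+ * old primary key tuple if it was logged.
+ */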
+static void
+DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogRecord *r = &buf->record;
+	xl_heap_delete *xlrec;
+	ReorderBufferChange *change;
+
+	xlrec = (xl_heap_delete *) buf->record_data;
+
+	/* XXX: nicer */
+	if (xlrec->target.node.dbNode != ctx->slot->database)
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	change->action = REORDER_BUFFER_CHANGE_DELETE;
+
+	memcpy(&change->relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+	/* old primary key stored */
+	if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD_KEY)
+	{
+		Assert(r->xl_len > (SizeOfHeapDelete + SizeOfHeapHeader));
+
+		change->oldtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+		DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete,
+						r->xl_len - SizeOfHeapDelete,
+						change->oldtuple);
+	}
+	ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Decode xl_heap_multi_insert record into multiple changes.
+ */
+static void
+DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogRecord *r = &buf->record;
+	xl_heap_multi_insert *xlrec;
+	int			i;
+	char	   *data;
+	bool		isinit = (r->xl_info & XLOG_HEAP_INIT_PAGE) != 0;
+
+	xlrec = (xl_heap_multi_insert *) buf->record_data;
+
+	/* XXX: nicer */
+	if (xlrec->node.dbNode != ctx->slot->database)
+		return;
+
+	data = buf->record_data + SizeOfHeapMultiInsert;
+
+	/*
+	 * OffsetNumbers (which are not of interest to us) are stored when
+	 * XLOG_HEAP_INIT_PAGE is not set -- skip over them.
+	 */
+	if (!isinit)
+		data += sizeof(OffsetNumber) * xlrec->ntuples;
+
+	for (i = 0; i < xlrec->ntuples; i++)
+	{
+		ReorderBufferChange *change;
+		xl_multi_insert_tuple *xlhdr;
+		int			datalen;
+		ReorderBufferTupleBuf *tuple;
+
+		change = ReorderBufferGetChange(ctx->reorder);
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+		memcpy(&change->relnode, &xlrec->node, sizeof(RelFileNode));
+
+		/*
+		 * CONTAINS_NEW_TUPLE will currently always be set, as multi_insert
+		 * isn't used for catalogs, but it's better to be future proof.
+		 *
+		 * We decode the tuple in pretty much the same way as DecodeXLogTuple,
+		 * but since the layout is slightly different, we can't use it here.
+		 */
+		if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+		{
+			change->newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+			tuple = change->newtuple;
+
+			/* not a disk based tuple */
+			ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+			xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data);
+			data = ((char *) xlhdr) + SizeOfMultiInsertTuple;
+			datalen = xlhdr->datalen;
+
+			/* we can only figure this out after reassembling the transactions */
+			tuple->tuple.t_tableOid = InvalidOid;
+			tuple->tuple.t_data = &tuple->header;
+			tuple->tuple.t_len = datalen + offsetof(HeapTupleHeaderData, t_bits);
+
+			memset(&tuple->header, 0, sizeof(HeapTupleHeaderData));
+
+			memcpy((char *) &tuple->header + offsetof(HeapTupleHeaderData, t_bits),
+				   (char *) data,
+				   datalen);
+			data += datalen;
+
+			tuple->header.t_infomask = xlhdr->t_infomask;
+			tuple->header.t_infomask2 = xlhdr->t_infomask2;
+			tuple->header.t_hoff = xlhdr->t_hoff;
+		}
+
+		ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+	}
+}
+
+/*
+ * Read a tuple of size 'len' from 'data' into 'tuple'.
+ */
+static void
+DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
+{
+	xl_heap_header xlhdr;
+	int			datalen = len - SizeOfHeapHeader;
+
+	Assert(datalen >= 0);
+	Assert(datalen <= MaxHeapTupleSize);
+
+	tuple->tuple.t_len = datalen + offsetof(HeapTupleHeaderData, t_bits);
+
+	/* not a disk based tuple */
+	ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+	/* we can only figure this out after reassembling the transactions */
+	tuple->tuple.t_tableOid = InvalidOid;
+	tuple->tuple.t_data = &tuple->header;
+
+	/* data is not stored aligned, copy to aligned storage */
+	memcpy((char *) &xlhdr,
+		   data,
+		   SizeOfHeapHeader);
+
+	memset(&tuple->header, 0, sizeof(HeapTupleHeaderData));
+
+	memcpy((char *) &tuple->header + offsetof(HeapTupleHeaderData, t_bits),
+		   data + SizeOfHeapHeader,
+		   datalen);
+
+	tuple->header.t_infomask = xlhdr.t_infomask;
+	tuple->header.t_infomask2 = xlhdr.t_infomask2;
+	tuple->header.t_hoff = xlhdr.t_hoff;
+}
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
new file mode 100644
index 0000000..656e995
--- /dev/null
+++ b/src/backend/replication/logical/logical.c
@@ -0,0 +1,1046 @@
+/*-------------------------------------------------------------------------
+ *
+ * logical.c
+ *
+ *	   Logical decoding slot and shared memory management
+ *
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/logical.c
+ *
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "access/transam.h"
+
+#include "fmgr.h"
+#include "miscadmin.h"
+
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+#include "storage/ipc.h"
+#include "storage/procarray.h"
+#include "storage/fd.h"
+
+#include "utils/memutils.h"
+#include "utils/syscache.h"
+
+/*
+ * logical replication on-disk data structures.
+ */
+typedef struct LogicalDecodingSlotOnDisk
+{
+	uint32		magic;
+	LogicalDecodingSlot slot;
+} LogicalDecodingSlotOnDisk;
+
+#define LOGICAL_MAGIC	0x1051CA1		/* format identifier */
+
+/* Control array for logical decoding */
+LogicalDecodingCtlData *LogicalDecodingCtl = NULL;
+
+/* My slot for logical rep in the shared memory array */
+LogicalDecodingSlot *MyLogicalDecodingSlot = NULL;
+
+/* user settable parameters */
+int			max_logical_slots = 0;		/* the maximum number of logical slots */
+
+static void LogicalSlotKill(int code, Datum arg);
+
+/* persistency functions */
+static void RestoreLogicalSlot(const char *name);
+static void CreateLogicalSlot(LogicalDecodingSlot *slot);
+static void SaveLogicalSlot(LogicalDecodingSlot *slot);
+static void SaveLogicalSlotInternal(LogicalDecodingSlot *slot, const char *path);
+static void DeleteLogicalSlot(LogicalDecodingSlot *slot);
+
+
+/* Report shared-memory space needed by LogicalDecodingShmemInit */
+Size
+LogicalDecodingShmemSize(void)
+{
+	Size		size = 0;
+
+	if (max_logical_slots == 0)
+		return size;
+
+	size = offsetof(LogicalDecodingCtlData, logical_slots);
+	size = add_size(size,
+					mul_size(max_logical_slots, sizeof(LogicalDecodingSlot)));
+
+	return size;
+}
+
+/* Allocate and initialize shared memory for logical decoding */
+void
+LogicalDecodingShmemInit(void)
+{
+	bool		found;
+
+	if (max_logical_slots == 0)
+		return;
+
+	LogicalDecodingCtl = (LogicalDecodingCtlData *)
+		ShmemInitStruct("Logical Decoding Ctl", LogicalDecodingShmemSize(),
+						&found);
+
+	if (!found)
+	{
+		int			i;
+
+		/* First time through, so initialize */
+		MemSet(LogicalDecodingCtl, 0, LogicalDecodingShmemSize());
+
+		LogicalDecodingCtl->xmin = InvalidTransactionId;
+
+		for (i = 0; i < max_logical_slots; i++)
+		{
+			LogicalDecodingSlot *slot =
+			&LogicalDecodingCtl->logical_slots[i];
+
+			slot->xmin = InvalidTransactionId;
+			slot->effective_xmin = InvalidTransactionId;
+			SpinLockInit(&slot->mutex);
+		}
+	}
+}
+
+static void
+LogicalSlotKill(int code, Datum arg)
+{
+	/* LOCK? */
+	if (MyLogicalDecodingSlot && MyLogicalDecodingSlot->active)
+	{
+		MyLogicalDecodingSlot->active = false;
+	}
+	MyLogicalDecodingSlot = NULL;
+}
+
+/*
+ * Set the xmin required for catalog timetravel for the specific decoding slot.
+ */
+void
+IncreaseLogicalXminForSlot(XLogRecPtr lsn, TransactionId xmin)
+{
+	Assert(MyLogicalDecodingSlot != NULL);
+
+	SpinLockAcquire(&MyLogicalDecodingSlot->mutex);
+
+	/*
+	 * Only increase if the previous values have been applied, otherwise we
+	 * might never end up updating if the receiver acks too slowly.
+	 */
+	if (MyLogicalDecodingSlot->candidate_lsn == InvalidXLogRecPtr ||
+		(lsn == MyLogicalDecodingSlot->candidate_lsn &&
+		 !TransactionIdIsValid(MyLogicalDecodingSlot->candidate_xmin)))
+	{
+		MyLogicalDecodingSlot->candidate_lsn = lsn;
+		MyLogicalDecodingSlot->candidate_xmin = xmin;
+		elog(DEBUG1, "got new xmin %u at %X/%X", xmin,
+			 (uint32) (lsn >> 32), (uint32) lsn);
+	}
+	SpinLockRelease(&MyLogicalDecodingSlot->mutex);
+}
+
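+/*
+ * Set the candidate LSN that decoding may restart from for this slot; it is
+ * applied once the receiver has confirmed flushing up to current_lsn.
+ */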
+void
+IncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn)
+{
+	Assert(MyLogicalDecodingSlot != NULL);
+	Assert(restart_lsn != InvalidXLogRecPtr);
+	Assert(current_lsn != InvalidXLogRecPtr);
+
+	SpinLockAcquire(&MyLogicalDecodingSlot->mutex);
+
+	/*
+	 * Only increase if the previous values have been applied, otherwise we
+	 * might never end up updating if the receiver acks too slowly. A missed
+	 * value here will just cause some extra effort after reconnecting.
+	 */
+	if (MyLogicalDecodingSlot->candidate_lsn == InvalidXLogRecPtr ||
+		(current_lsn == MyLogicalDecodingSlot->candidate_lsn &&
+	 MyLogicalDecodingSlot->candidate_restart_decoding == InvalidXLogRecPtr))
+	{
+		MyLogicalDecodingSlot->candidate_lsn = current_lsn;
+		MyLogicalDecodingSlot->candidate_restart_decoding = restart_lsn;
+
+		elog(DEBUG1, "got new restart lsn %X/%X at %X/%X",
+			 (uint32) (restart_lsn >> 32), (uint32) restart_lsn,
+			 (uint32) (current_lsn >> 32), (uint32) current_lsn);
+
+	}
+	SpinLockRelease(&MyLogicalDecodingSlot->mutex);
+}
+
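+/*
+ * The client has confirmed receiving and flushing data up to 'lsn'. Remember
+ * that, and apply any candidate xmin / restart_decoding values that have
+ * become safe, persisting them to disk before making them effective.
+ */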
+void
+LogicalConfirmReceivedLocation(XLogRecPtr lsn)
+{
+	Assert(lsn != InvalidXLogRecPtr);
+
+	/* Do an unlocked check for candidate_lsn first. */
+	if (MyLogicalDecodingSlot->candidate_lsn != InvalidXLogRecPtr)
+	{
+		bool		updated_xmin = false;
+		bool		updated_restart = false;
+
+		/* use volatile pointer to prevent code rearrangement */
+		volatile LogicalDecodingSlot *slot = MyLogicalDecodingSlot;
+
+		SpinLockAcquire(&slot->mutex);
+
+		slot->confirmed_flush = lsn;
+
+		/* if we're past the location required for bumping xmin, do so */
+		if (slot->candidate_lsn != InvalidXLogRecPtr &&
+			slot->candidate_lsn < lsn)
+		{
+			/*
+			 * We have to write the changed xmin to disk *before* we change
+			 * the in-memory value, otherwise after a crash we wouldn't know
+			 * that some catalog tuples might have been removed already.
+			 *
+			 * Ensure that by first writing to ->xmin and only updating
+			 * ->effective_xmin once the new state is fsynced to disk. After a
+			 * crash, ->effective_xmin is set to ->xmin.
+			 */
+			if (TransactionIdIsValid(slot->candidate_xmin) &&
+				slot->xmin != slot->candidate_xmin)
+			{
+				slot->xmin = slot->candidate_xmin;
+				updated_xmin = true;
+			}
+
+			if (slot->candidate_restart_decoding != InvalidXLogRecPtr &&
+				slot->restart_decoding != slot->candidate_restart_decoding)
+			{
+				slot->restart_decoding = slot->candidate_restart_decoding;
+				updated_restart = true;
+			}
+
+			slot->candidate_lsn = InvalidXLogRecPtr;
+			slot->candidate_xmin = InvalidTransactionId;
+			slot->candidate_restart_decoding = InvalidXLogRecPtr;
+		}
+
+		SpinLockRelease(&slot->mutex);
+
+		/* first write the new xmin to disk, so we know what's up after a crash */
+		if (updated_xmin || updated_restart)
+			/* cast away volatile, that's ok. */
+			SaveLogicalSlot((LogicalDecodingSlot *) slot);
+
+		/*
+		 * now the new xmin is safely on disk, we can let the global value
+		 * advance
+		 */
+		if (updated_xmin)
+		{
+			SpinLockAcquire(&slot->mutex);
+			slot->effective_xmin = slot->xmin;
+			SpinLockRelease(&slot->mutex);
+
+			ComputeLogicalXmin();
+		}
+	}
+	else
+	{
+		volatile LogicalDecodingSlot *slot = MyLogicalDecodingSlot;
+
+		SpinLockAcquire(&slot->mutex);
+		slot->confirmed_flush = lsn;
+		SpinLockRelease(&slot->mutex);
+	}
+}
+
+/*
+ * Compute the oldest xmin across all decoding slots and store it in
+ * LogicalDecodingCtl.
+ */
+void
+ComputeLogicalXmin(void)
+{
+	int			i;
+	TransactionId xmin = InvalidTransactionId;
+	LogicalDecodingSlot *slot;
+
+	Assert(LogicalDecodingCtl);
+
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+	for (i = 0; i < max_logical_slots; i++)
+	{
+		slot = &LogicalDecodingCtl->logical_slots[i];
+
+		SpinLockAcquire(&slot->mutex);
+		if (slot->in_use &&
+			TransactionIdIsValid(slot->effective_xmin) && (
+											   !TransactionIdIsValid(xmin) ||
+						   TransactionIdPrecedes(slot->effective_xmin, xmin))
+			)
+		{
+			xmin = slot->effective_xmin;
+		}
+		SpinLockRelease(&slot->mutex);
+	}
+	LogicalDecodingCtl->xmin = xmin;
+	LWLockRelease(ProcArrayLock);
+
+	elog(DEBUG1, "computed new global xmin for decoding: %u", xmin);
+}
+
+/*
+ * Make sure the current settings & environment are capable of doing logical
+ * replication.
+ */
+void
+CheckLogicalReplicationRequirements(void)
+{
+	if (wal_level < WAL_LEVEL_LOGICAL)
+		ereport(ERROR,
+		/* XXX invent class 51 for code 51028? */
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical replication requires wal_level=logical")));
+
+	if (MyDatabaseId == InvalidOid)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical replication requires being connected to a database")));
+
+	if (max_logical_slots == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 (errmsg("logical replication requires max_logical_slots > 0"))));
+}
+
+/*
+ * Search for a free slot, mark it as used and acquire a valid xmin horizon
+ * value.
+ */
+void
+LogicalDecodingAcquireFreeSlot(const char *name, const char *plugin)
+{
+	LogicalDecodingSlot *slot;
+	bool		name_in_use;
+	int			i;
+
+	Assert(!MyLogicalDecodingSlot);
+
+	CheckLogicalReplicationRequirements();
+
+	LWLockAcquire(LogicalReplicationCtlLock, LW_EXCLUSIVE);
+
+	/* First, make sure the requested name is not in use. */
+
+	name_in_use = false;
+	for (i = 0; i < max_logical_slots && !name_in_use; i++)
+	{
+		LogicalDecodingSlot *s = &LogicalDecodingCtl->logical_slots[i];
+
+		SpinLockAcquire(&s->mutex);
+		if (s->in_use && strcmp(name, NameStr(s->name)) == 0)
+			name_in_use = true;
+		SpinLockRelease(&s->mutex);
+	}
+
+	if (name_in_use)
+		ereport(ERROR,
+				(errcode(ERRCODE_DUPLICATE_OBJECT),
+			  errmsg("there already is a logical slot named \"%s\"", name)));
+
+	/* Find the first available (not in_use (=> not active)) slot. */
+
+	slot = NULL;
+	for (i = 0; i < max_logical_slots; i++)
+	{
+		LogicalDecodingSlot *s = &LogicalDecodingCtl->logical_slots[i];
+
+		SpinLockAcquire(&s->mutex);
+		if (!s->in_use)
+		{
+			Assert(!s->active);
+			/* NOT releasing the lock yet */
+			slot = s;
+			break;
+		}
+		SpinLockRelease(&s->mutex);
+	}
+
+	LWLockRelease(LogicalReplicationCtlLock);
+
+	if (!slot)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+				 errmsg("could not find a free logical slot, free one or increase max_logical_slots")));
+
+	MyLogicalDecodingSlot = slot;
+
+	/* Let's start with enough information if we can */
+	if (!RecoveryInProgress())
+		slot->restart_decoding = LogStandbySnapshot();
+	else
+		slot->restart_decoding = GetRedoRecPtr();
+
+	slot->in_use = true;
+	slot->active = true;
+	slot->database = MyDatabaseId;
+	/* XXX: do we want to truncate the identifier instead? */
+	strncpy(NameStr(slot->plugin), plugin, NAMEDATALEN);
+	NameStr(slot->plugin)[NAMEDATALEN - 1] = '\0';
+	strncpy(NameStr(slot->name), name, NAMEDATALEN);
+	NameStr(slot->name)[NAMEDATALEN - 1] = '\0';
+
+	/* Arrange to clean up at exit/error */
+	on_shmem_exit(LogicalSlotKill, 0);
+
+	/* release slot so it can be examined by others */
+	SpinLockRelease(&slot->mutex);
+
+	/* XXX: verify that the specified plugin is valid */
+
+	/*
+	 * Acquire the current global xmin value and directly set the logical xmin
+	 * before releasing the lock if necessary. We do this so wal decoding is
+	 * guaranteed to have all catalog rows produced by xacts with an xid >
+	 * walsnd->xmin available.
+	 *
+	 * We can't use ComputeLogicalXmin here as that acquires ProcArrayLock
+	 * separately which would open a short window for the global xmin to
+	 * advance above walsnd->xmin.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	slot->effective_xmin = GetOldestXmin(true, true, true, true);
+	slot->xmin = slot->effective_xmin;
+
+	if (!TransactionIdIsValid(LogicalDecodingCtl->xmin) ||
+		NormalTransactionIdPrecedes(slot->effective_xmin, LogicalDecodingCtl->xmin))
+		LogicalDecodingCtl->xmin = slot->effective_xmin;
+	LWLockRelease(ProcArrayLock);
+
+	Assert(slot->effective_xmin <= GetOldestXmin(true, true, true, false));
+
+	LWLockAcquire(LogicalReplicationCtlLock, LW_EXCLUSIVE);
+	CreateLogicalSlot(slot);
+	LWLockRelease(LogicalReplicationCtlLock);
+}
+
+/*
+ * Find a previously initiated slot and mark it as active again.
+ */
+void
+LogicalDecodingReAcquireSlot(const char *name)
+{
+	LogicalDecodingSlot *slot;
+	int			i;
+
+	CheckLogicalReplicationRequirements();
+
+	Assert(!MyLogicalDecodingSlot);
+
+	for (i = 0; i < max_logical_slots; i++)
+	{
+		slot = &LogicalDecodingCtl->logical_slots[i];
+
+		SpinLockAcquire(&slot->mutex);
+		if (slot->in_use && strcmp(name, NameStr(slot->name)) == 0)
+		{
+			MyLogicalDecodingSlot = slot;
+			/* NOT releasing the lock yet */
+			break;
+		}
+		SpinLockRelease(&slot->mutex);
+	}
+
+	if (!MyLogicalDecodingSlot)
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("could not find logical slot \"%s\"", name)));
+
+	slot = MyLogicalDecodingSlot;
+
+	if (slot->active)
+	{
+		SpinLockRelease(&slot->mutex);
+		MyLogicalDecodingSlot = NULL;
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_IN_USE),
+				 errmsg("slot already active")));
+	}
+
+	slot->active = true;
+	/* now that we've marked it as active, we release our lock */
+	SpinLockRelease(&slot->mutex);
+
+	/* Don't let the user switch the database... */
+	if (slot->database != MyDatabaseId)
+	{
+		SpinLockAcquire(&slot->mutex);
+		slot->active = false;
+		SpinLockRelease(&slot->mutex);
+
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 (errmsg("START_LOGICAL_REPLICATION needs to be run in the same database as INIT_LOGICAL_REPLICATION"))));
+	}
+
+	/* Arrange to clean up at exit */
+	on_shmem_exit(LogicalSlotKill, 0);
+
+	SaveLogicalSlot(slot);
+}
+
+/*
+ * Temporarily release a logical decoding slot so that this or another backend
+ * can reacquire it later.
+ */
+void
+LogicalDecodingReleaseSlot(void)
+{
+	LogicalDecodingSlot *slot;
+
+	CheckLogicalReplicationRequirements();
+
+	slot = MyLogicalDecodingSlot;
+
+	Assert(slot != NULL && slot->active);
+
+	SpinLockAcquire(&slot->mutex);
+	slot->active = false;
+	SpinLockRelease(&slot->mutex);
+
+	MyLogicalDecodingSlot = NULL;
+
+	SaveLogicalSlot(slot);
+
+	cancel_shmem_exit(LogicalSlotKill, 0);
+}
+
+/*
+ * Permanently remove a logical decoding slot.
+ */
+void
+LogicalDecodingFreeSlot(const char *name)
+{
+	LogicalDecodingSlot *slot = NULL;
+	int			i;
+
+	CheckLogicalReplicationRequirements();
+
+	for (i = 0; i < max_logical_slots; i++)
+	{
+		slot = &LogicalDecodingCtl->logical_slots[i];
+
+		SpinLockAcquire(&slot->mutex);
+		if (slot->in_use && strcmp(name, NameStr(slot->name)) == 0)
+		{
+			/* NOT releasing the lock yet */
+			break;
+		}
+		SpinLockRelease(&slot->mutex);
+		slot = NULL;
+	}
+
+	if (!slot)
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("could not find logical slot \"%s\"", name)));
+
+	if (slot->active)
+	{
+		SpinLockRelease(&slot->mutex);
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_IN_USE),
+				 errmsg("cannot free active logical slot \"%s\"", name)));
+	}
+
+	/*
+	 * Mark it as active, so nobody can claim this slot while we are
+	 * working on it. We don't want to hold the spinlock while doing stuff
+	 * like fsyncing the state file to disk.
+	 */
+	slot->active = true;
+
+	SpinLockRelease(&slot->mutex);
+
+	/*
+	 * Start a critical section; we must not be interrupted while the on-disk
+	 * and in-memory state are not coherent.
+	 */
+	START_CRIT_SECTION();
+
+	DeleteLogicalSlot(slot);
+
+	/* ok, everything is gone; after a crash we would not restore this slot */
+	SpinLockAcquire(&slot->mutex);
+	slot->active = false;
+	slot->in_use = false;
+	SpinLockRelease(&slot->mutex);
+
+	END_CRIT_SECTION();
+
+	/* slot is dead and doesn't nail the xmin anymore */
+	ComputeLogicalXmin();
+}
+
+/*
+ * Load replication state from disk into memory at server startup.
+ */
+void
+StartupLogicalReplication(XLogRecPtr checkPointRedo)
+{
+	DIR		   *logical_dir;
+	struct dirent *logical_de;
+
+	ereport(DEBUG1,
+			(errmsg("starting up logical decoding from %X/%X",
+					(uint32) (checkPointRedo >> 32), (uint32) checkPointRedo)));
+
+	/* restore all slots */
+	logical_dir = AllocateDir("pg_llog");
+	while ((logical_de = ReadDir(logical_dir, "pg_llog")) != NULL)
+	{
+		if (strcmp(logical_de->d_name, ".") == 0 ||
+			strcmp(logical_de->d_name, "..") == 0)
+			continue;
+
+		/* one of our own directories */
+		if (strcmp(logical_de->d_name, "snapshots") == 0)
+			continue;
+
+		/* we crashed while a slot was being set up or deleted, clean up */
+		if (strcmp(logical_de->d_name, "new") == 0 ||
+			strcmp(logical_de->d_name, "old") == 0)
+		{
+			char		path[MAXPGPATH];
+
+			sprintf(path, "pg_llog/%s", logical_de->d_name);
+
+			if (!rmtree(path, true))
+			{
+				FreeDir(logical_dir);
+				ereport(PANIC,
+						(errcode_for_file_access(),
+						 errmsg("could not remove directory \"%s\": %m",
+								path)));
+			}
+			continue;
+		}
+
+		RestoreLogicalSlot(logical_de->d_name);
+	}
+	FreeDir(logical_dir);
+
+	if (max_logical_slots <= 0)
+		return;
+
+	/* Now that we have recovered all the data, compute logical xmin */
+	ComputeLogicalXmin();
+
+	ReorderBufferStartup();
+}
+
+/* ----
+ * Manipulation of the on-disk state of logical slots
+ * ----
+ */
+static void
+CreateLogicalSlot(LogicalDecodingSlot *slot)
+{
+	char		tmppath[MAXPGPATH];
+	char		path[MAXPGPATH];
+
+	START_CRIT_SECTION();
+
+	sprintf(tmppath, "pg_llog/new");
+	sprintf(path, "pg_llog/%s", NameStr(slot->name));
+
+	if (mkdir(tmppath, S_IRWXU) < 0)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not create directory \"%s\": %m",
+						tmppath)));
+
+	fsync_fname(tmppath, true);
+
+	SaveLogicalSlotInternal(slot, tmppath);
+
+	if (rename(tmppath, path) != 0)
+	{
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not rename logical checkpoint from \"%s\" to \"%s\": %m",
+						tmppath, path)));
+	}
+
+	fsync_fname(path, true);
+
+	END_CRIT_SECTION();
+}
+
+static void
+SaveLogicalSlot(LogicalDecodingSlot *slot)
+{
+	char		path[MAXPGPATH];
+
+	sprintf(path, "pg_llog/%s", NameStr(slot->name));
+	SaveLogicalSlotInternal(slot, path);
+}
+
+/*
+ * Shared functionality between saving and creating a logical slot.
+ */
+static void
+SaveLogicalSlotInternal(LogicalDecodingSlot *slot, const char *dir)
+{
+	char		tmppath[MAXPGPATH];
+	char		path[MAXPGPATH];
+	int			fd;
+	LogicalDecodingSlotOnDisk cp;
+
+	/* silence valgrind :( */
+	memset(&cp, 0, sizeof(LogicalDecodingSlotOnDisk));
+
+	sprintf(tmppath, "%s/state.tmp", dir);
+	sprintf(path, "%s/state", dir);
+
+	START_CRIT_SECTION();
+
+	fd = OpenTransientFile(tmppath,
+						   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+						   S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not create logical checkpoint file \"%s\": %m",
+						tmppath)));
+
+	cp.magic = LOGICAL_MAGIC;
+
+	SpinLockAcquire(&slot->mutex);
+
+	cp.slot.xmin = slot->xmin;
+	cp.slot.effective_xmin = slot->effective_xmin;
+
+	strcpy(NameStr(cp.slot.name), NameStr(slot->name));
+	strcpy(NameStr(cp.slot.plugin), NameStr(slot->plugin));
+
+	cp.slot.database = slot->database;
+	cp.slot.confirmed_flush = slot->confirmed_flush;
+	cp.slot.restart_decoding = slot->restart_decoding;
+	cp.slot.candidate_lsn = InvalidXLogRecPtr;
+	cp.slot.candidate_xmin = InvalidTransactionId;
+	cp.slot.candidate_restart_decoding = InvalidXLogRecPtr;
+	cp.slot.in_use = slot->in_use;
+	cp.slot.active = false;
+
+	SpinLockRelease(&slot->mutex);
+
+	if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
+	{
+		CloseTransientFile(fd);
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not write logical checkpoint file \"%s\": %m",
+						tmppath)));
+	}
+
+	/* fsync the file */
+	if (pg_fsync(fd) != 0)
+	{
+		CloseTransientFile(fd);
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync logical checkpoint \"%s\": %m",
+						tmppath)));
+	}
+
+	CloseTransientFile(fd);
+
+	/* rename to permanent file, fsync file and directory */
+	if (rename(tmppath, path) != 0)
+	{
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not rename logical checkpoint from \"%s\" to \"%s\": %m",
+						tmppath, path)));
+	}
+
+	fsync_fname((char *) dir, true);
+	fsync_fname(path, false);
+
+	END_CRIT_SECTION();
+}
+
+
+static void
+DeleteLogicalSlot(LogicalDecodingSlot *slot)
+{
+	char		path[MAXPGPATH];
+	char		tmppath[] = "pg_llog/old";
+
+	START_CRIT_SECTION();
+
+	sprintf(path, "pg_llog/%s", NameStr(slot->name));
+
+	if (rename(path, tmppath) != 0)
+	{
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not rename logical checkpoint from \"%s\" to \"%s\": %m",
+						path, tmppath)));
+	}
+
+	/* make sure no partial state is visible after a crash */
+	fsync_fname(tmppath, true);
+	fsync_fname("pg_llog", true);
+
+	if (!rmtree(tmppath, true))
+	{
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not remove directory \"%s\": %m",
+						tmppath)));
+	}
+
+	END_CRIT_SECTION();
+}
+
+/*
+ * Load a single on-disk slot into memory.
+ */
+static void
+RestoreLogicalSlot(const char *name)
+{
+	LogicalDecodingSlotOnDisk cp;
+	int			i;
+	char		path[MAXPGPATH];
+	int			fd;
+	bool		restored = false;
+	int			readBytes;
+
+	START_CRIT_SECTION();
+
+	/* delete temp file if it exists */
+	sprintf(path, "pg_llog/%s/state.tmp", name);
+	if (unlink(path) < 0 && errno != ENOENT)
+		ereport(PANIC, (errmsg("failed while unlinking %s", path)));
+
+	sprintf(path, "pg_llog/%s/state", name);
+
+	elog(DEBUG1, "restoring logical slot from %s", path);
+
+	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+
+	/*
+	 * We do not need to handle a missing state file gracefully here, because
+	 * the slot's directory is rename()d into place only after the state file
+	 * has been fsync()ed.
+	 */
+	if (fd < 0)
+		ereport(PANIC, (errmsg("could not open state file %s", path)));
+
+	readBytes = read(fd, &cp, sizeof(cp));
+	if (readBytes != sizeof(cp))
+	{
+		int			saved_errno = errno;
+
+		CloseTransientFile(fd);
+		errno = saved_errno;
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not read logical checkpoint file \"%s\": %m, read %d of %zu",
+						path, readBytes, sizeof(cp))));
+	}
+
+	CloseTransientFile(fd);
+
+	if (cp.magic != LOGICAL_MAGIC)
+		ereport(PANIC, (errmsg("Logical checkpoint has wrong magic %u instead of %u",
+							   cp.magic, LOGICAL_MAGIC)));
+
+	/* nothing can be active yet, don't lock anything */
+	for (i = 0; i < max_logical_slots; i++)
+	{
+		LogicalDecodingSlot *slot;
+
+		slot = &LogicalDecodingCtl->logical_slots[i];
+
+		if (slot->in_use)
+			continue;
+
+		slot->xmin = cp.slot.xmin;
+		/* XXX: after a crash, always use xmin, not effective_xmin */
+		slot->effective_xmin = cp.slot.xmin;
+		strcpy(NameStr(slot->name), NameStr(cp.slot.name));
+		strcpy(NameStr(slot->plugin), NameStr(cp.slot.plugin));
+		slot->database = cp.slot.database;
+		slot->restart_decoding = cp.slot.restart_decoding;
+		slot->confirmed_flush = cp.slot.confirmed_flush;
+		slot->candidate_lsn = InvalidXLogRecPtr;
+		slot->candidate_xmin = InvalidTransactionId;
+		slot->candidate_restart_decoding = InvalidXLogRecPtr;
+		slot->in_use = true;
+		slot->active = false;
+		restored = true;
+
+		/*
+		 * FIXME: Do some validation here.
+		 */
+		break;
+	}
+
+	if (!restored)
+		ereport(PANIC,
+				(errmsg("too many logical slots active before shutdown, increase max_logical_slots and try again")));
+
+	END_CRIT_SECTION();
+}
+
+
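+/*
+ * Load the output plugin shared library 'plugin' and look up its callbacks.
+ *
+ * An output plugin is simply a shared library exporting the pg_decode_*
+ * symbols looked up below; pg_decode_begin_txn, pg_decode_change and
+ * pg_decode_commit_txn are required, pg_decode_init and pg_decode_clean are
+ * optional.
+ */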
+static void
+LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin)
+{
+	/* look up the symbols in the shared library */
+
+	/* optional */
+	callbacks->init_cb = (LogicalDecodeInitCB)
+		load_external_function(plugin, "pg_decode_init", false, NULL);
+
+	/* required */
+	callbacks->begin_cb = (LogicalDecodeBeginCB)
+		load_external_function(plugin, "pg_decode_begin_txn", true, NULL);
+
+	/* required */
+	callbacks->change_cb = (LogicalDecodeChangeCB)
+		load_external_function(plugin, "pg_decode_change", true, NULL);
+
+	/* required */
+	callbacks->commit_cb = (LogicalDecodeCommitCB)
+		load_external_function(plugin, "pg_decode_commit_txn", true, NULL);
+
+	/* optional */
+	callbacks->cleanup_cb = (LogicalDecodeCleanupCB)
+		load_external_function(plugin, "pg_decode_clean", false, NULL);
+}
+
+/*
+ * Context management functions that coordinate the different logical
+ * decoding pieces.
+ */
+
+/*
+ * Callbacks for the ReorderBuffer that add some more information and then
+ * call the corresponding output_plugin.h callbacks.
+ */
+static void
+begin_txn_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+
+	ctx->callbacks.begin_cb(ctx, txn);
+}
+
+static void
+commit_txn_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, XLogRecPtr commit_lsn)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+
+	ctx->callbacks.commit_cb(ctx, txn, commit_lsn);
+}
+
+static void
+change_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+			   Relation relation, ReorderBufferChange *change)
+{
+	LogicalDecodingContext *ctx = cache->private_data;
+
+	ctx->callbacks.change_cb(ctx, txn, relation, change);
+}
+
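+/*
+ * Create a new decoding context, tying together the xlog reader, the
+ * reorderbuffer, the snapshot builder and the output plugin callbacks.
+ *
+ * 'is_init' denotes a freshly created slot (as used by
+ * init_logical_replication / INIT_LOGICAL_REPLICATION); in that case decoding
+ * must not start at a caller-specified LSN and the context is marked to stop
+ * once a consistent snapshot has been reached.
+ */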
+LogicalDecodingContext *
+CreateLogicalDecodingContext(LogicalDecodingSlot *slot,
+							 bool is_init,
+							 XLogRecPtr	start_lsn,
+							 List *output_plugin_options,
+							 XLogPageReadCB read_page,
+						 LogicalOutputPluginWriterPrepareWrite prepare_write,
+							 LogicalOutputPluginWriterWrite do_write)
+{
+	MemoryContext context;
+	MemoryContext old_context;
+	TransactionId xmin_horizon;
+	LogicalDecodingContext *ctx;
+
+	context = AllocSetContextCreate(TopMemoryContext,
+									"ReorderBuffer",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+	old_context = MemoryContextSwitchTo(context);
+	ctx = palloc0(sizeof(LogicalDecodingContext));
+
+
+	/* load output plugins first, so we detect a wrong output plugin early */
+	LoadOutputPlugin(&ctx->callbacks, NameStr(slot->plugin));
+
+	if (is_init && start_lsn != InvalidXLogRecPtr)
+		elog(ERROR, "cannot initially start at a specified lsn");
+
+	if (is_init)
+		xmin_horizon = slot->xmin;
+	else
+		xmin_horizon = InvalidTransactionId;
+
+	ctx->slot = slot;
+
+	ctx->reader = XLogReaderAllocate(read_page, ctx);
+	ctx->reader->private_data = ctx;
+
+	ctx->reorder = ReorderBufferAllocate();
+	ctx->snapshot_builder =
+		AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn);
+
+	ctx->reorder->private_data = ctx;
+
+	ctx->reorder->begin = begin_txn_wrapper;
+	ctx->reorder->apply_change = change_wrapper;
+	ctx->reorder->commit = commit_txn_wrapper;
+
+	ctx->out = makeStringInfo();
+	ctx->prepare_write = prepare_write;
+	ctx->write = do_write;
+
+	ctx->output_plugin_options = output_plugin_options;
+
+	if (is_init)
+		ctx->stop_after_consistent = true;
+	else
+		ctx->stop_after_consistent = false;
+
+	/* call output plugin initialization callback */
+	if (ctx->callbacks.init_cb != NULL)
+		ctx->callbacks.init_cb(ctx, is_init);
+
+	MemoryContextSwitchTo(old_context);
+
+	return ctx;
+}
+
+void
+FreeLogicalDecodingContext(LogicalDecodingContext *ctx)
+{
+	if (ctx->callbacks.cleanup_cb != NULL)
+		ctx->callbacks.cleanup_cb(ctx);
+}
+
+
+/* Has the snapshot builder found a consistent initial state yet? */
+bool
+LogicalDecodingContextReady(LogicalDecodingContext *ctx)
+{
+	return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT;
+}
diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c
new file mode 100644
index 0000000..9837a95
--- /dev/null
+++ b/src/backend/replication/logical/logicalfuncs.c
@@ -0,0 +1,361 @@
+/*-------------------------------------------------------------------------
+ *
+ * logicalfuncs.c
+ *
+ *	   Support functions for using xlog decoding
+ *
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/logicalfuncs.c
+ *
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/builtins.h"
+#include "storage/fd.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
+
+Datum		init_logical_replication(PG_FUNCTION_ARGS);
+Datum		stop_logical_replication(PG_FUNCTION_ARGS);
+Datum		pg_stat_get_logical_decoding_slots(PG_FUNCTION_ARGS);
+
+/* FIXME: duplicate code with pg_xlogdump, similar to walsender.c */
+static void
+XLogRead(char *buf, XLogRecPtr startptr, Size count)
+{
+	char	   *p;
+	XLogRecPtr	recptr;
+	Size		nbytes;
+
+	static int	sendFile = -1;
+	static XLogSegNo sendSegNo = 0;
+	static uint32 sendOff = 0;
+
+	p = buf;
+	recptr = startptr;
+	nbytes = count;
+
+	while (nbytes > 0)
+	{
+		uint32		startoff;
+		int			segbytes;
+		int			readbytes;
+
+		startoff = recptr % XLogSegSize;
+
+		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
+		{
+			char		path[MAXPGPATH];
+
+			/* Switch to another logfile segment */
+			if (sendFile >= 0)
+				close(sendFile);
+
+			XLByteToSeg(recptr, sendSegNo);
+
+			XLogFilePath(path, ThisTimeLineID, sendSegNo);
+
+			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
+
+			if (sendFile < 0)
+			{
+				if (errno == ENOENT)
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("requested WAL segment %s has already been removed",
+									path)));
+				else
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not open file \"%s\": %m",
+									path)));
+			}
+			sendOff = 0;
+		}
+
+		/* Need to seek in the file? */
+		if (sendOff != startoff)
+		{
+			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
+			{
+				char		path[MAXPGPATH];
+
+				XLogFilePath(path, ThisTimeLineID, sendSegNo);
+
+				ereport(ERROR,
+						(errcode_for_file_access(),
+				  errmsg("could not seek in log segment %s to offset %u: %m",
+						 path, startoff)));
+			}
+			sendOff = startoff;
+		}
+
+		/* How many bytes are within this segment? */
+		if (nbytes > (XLogSegSize - startoff))
+			segbytes = XLogSegSize - startoff;
+		else
+			segbytes = nbytes;
+
+		readbytes = read(sendFile, p, segbytes);
+		if (readbytes <= 0)
+		{
+			char		path[MAXPGPATH];
+
+			XLogFilePath(path, ThisTimeLineID, sendSegNo);
+
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read from log segment %s, offset %u, length %lu: %m",
+							path, sendOff, (unsigned long) segbytes)));
+		}
+
+		/* Update state for read */
+		recptr += readbytes;
+
+		sendOff += readbytes;
+		nbytes -= readbytes;
+		p += readbytes;
+	}
+}
+
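+/*
+ * read_page callback for decoding via SQL functions: wait until the requested
+ * WAL has been flushed and then read it directly from the local segment
+ * files, without involving a walsender.
+ */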
+int
+logical_read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
+	int reqLen, XLogRecPtr targetRecPtr, char *cur_page, TimeLineID *pageTLI)
+{
+	XLogRecPtr	flushptr,
+				loc;
+	int			count;
+
+	loc = targetPagePtr + reqLen;
+	while (1)
+	{
+		flushptr = GetFlushRecPtr();
+		if (loc <= flushptr)
+			break;
+		pg_usleep(1000L);
+	}
+
+	/* more than one block available */
+	if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
+		count = XLOG_BLCKSZ;
+	/* not enough data there */
+	else if (targetPagePtr + reqLen > flushptr)
+		return -1;
+	/* part of the page available */
+	else
+		count = flushptr - targetPagePtr;
+
+	/* FIXME: more sensible/efficient implementation */
+	XLogRead(cur_page, targetPagePtr, XLOG_BLCKSZ);
+
+	return count;
+}
+
+static void
+DummyWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid)
+{
+	elog(ERROR, "init_logical_replication shouldn't be writing anything");
+}
+
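+/*
+ * SQL-callable function that acquires a new logical slot, decodes WAL until a
+ * consistent snapshot has been reached and returns the slot name plus the LSN
+ * to start streaming from.  Assuming a matching pg_proc entry, usage would
+ * look roughly like:
+ *
+ *    SELECT * FROM init_logical_replication('myslot', 'myplugin');
+ */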
+Datum
+init_logical_replication(PG_FUNCTION_ARGS)
+{
+	Name		name = PG_GETARG_NAME(0);
+	Name		plugin = PG_GETARG_NAME(1);
+
+	char		xpos[MAXFNAMELEN];
+
+	TupleDesc	tupdesc;
+	HeapTuple	tuple;
+	Datum		result;
+	Datum		values[2];
+	bool		nulls[2];
+	LogicalDecodingContext *ctx = NULL;
+
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	/* Acquire a logical replication slot */
+	CheckLogicalReplicationRequirements();
+	LogicalDecodingAcquireFreeSlot(NameStr(*name), NameStr(*plugin));
+
+	/* make sure we don't end up with an unreleased slot */
+	PG_TRY();
+	{
+		XLogRecPtr	startptr;
+
+		/*
+		 * Use the same initial_snapshot_reader, but with our own read_page
+		 * callback that does not depend on walsender.
+		 */
+		ctx = CreateLogicalDecodingContext(MyLogicalDecodingSlot, true,
+										   InvalidXLogRecPtr, NIL,
+										   logical_read_local_xlog_page,
+										   DummyWrite, DummyWrite);
+
+		/* setup from where to read xlog */
+		startptr = ctx->slot->restart_decoding;
+
+		/* Wait for a consistent starting point */
+		for (;;)
+		{
+			XLogRecord *record;
+			XLogRecordBuffer buf;
+			char	   *err = NULL;
+
+			/* the read_page callback waits for new WAL */
+			record = XLogReadRecord(ctx->reader, startptr, &err);
+			if (err)
+				elog(ERROR, "%s", err);
+
+			Assert(record);
+
+			startptr = InvalidXLogRecPtr;
+
+			buf.origptr = ctx->reader->ReadRecPtr;
+			buf.record = *record;
+			buf.record_data = XLogRecGetData(record);
+			DecodeRecordIntoReorderBuffer(ctx, &buf);
+
+			/* only continue till we found a consistent spot */
+			if (LogicalDecodingContextReady(ctx))
+				break;
+		}
+
+		/* Extract the values we want */
+		MyLogicalDecodingSlot->confirmed_flush = ctx->reader->EndRecPtr;
+		snprintf(xpos, sizeof(xpos), "%X/%X",
+				 (uint32) (MyLogicalDecodingSlot->confirmed_flush >> 32),
+				 (uint32) MyLogicalDecodingSlot->confirmed_flush);
+	}
+	PG_CATCH();
+	{
+		LogicalDecodingReleaseSlot();
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	values[0] = CStringGetTextDatum(NameStr(MyLogicalDecodingSlot->name));
+	values[1] = CStringGetTextDatum(xpos);
+
+	memset(nulls, 0, sizeof(nulls));
+
+	tuple = heap_form_tuple(tupdesc, values, nulls);
+	result = HeapTupleGetDatum(tuple);
+
+	LogicalDecodingReleaseSlot();
+
+	PG_RETURN_DATUM(result);
+}
+
+Datum
+stop_logical_replication(PG_FUNCTION_ARGS)
+{
+	Name		name = PG_GETARG_NAME(0);
+
+	CheckLogicalReplicationRequirements();
+	LogicalDecodingFreeSlot(NameStr(*name));
+
+	PG_RETURN_INT32(0);
+}
+
+/*
+ * Return one row for each logical replication slot currently in use.
+ */
+
+Datum
+pg_stat_get_logical_decoding_slots(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_LOGICAL_DECODING_SLOTS_COLS 6
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int			i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < max_logical_slots; i++)
+	{
+		LogicalDecodingSlot *slot = &LogicalDecodingCtl->logical_slots[i];
+		Datum		values[PG_STAT_GET_LOGICAL_DECODING_SLOTS_COLS];
+		bool		nulls[PG_STAT_GET_LOGICAL_DECODING_SLOTS_COLS];
+		char		location[MAXFNAMELEN];
+		const char *slot_name;
+		const char *plugin;
+		TransactionId xmin;
+		XLogRecPtr	last_req;
+		bool		active;
+		Oid			database;
+
+		SpinLockAcquire(&slot->mutex);
+		if (!slot->in_use)
+		{
+			SpinLockRelease(&slot->mutex);
+			continue;
+		}
+		else
+		{
+			xmin = slot->xmin;
+			active = slot->active;
+			database = slot->database;
+			last_req = slot->restart_decoding;
+			slot_name = pstrdup(NameStr(slot->name));
+			plugin = pstrdup(NameStr(slot->plugin));
+		}
+		SpinLockRelease(&slot->mutex);
+
+		memset(nulls, 0, sizeof(nulls));
+
+		snprintf(location, sizeof(location), "%X/%X",
+				 (uint32) (last_req >> 32), (uint32) last_req);
+
+		values[0] = CStringGetTextDatum(slot_name);
+		values[1] = CStringGetTextDatum(plugin);
+		values[2] = database;
+		values[3] = BoolGetDatum(active);
+		values[4] = TransactionIdGetDatum(xmin);
+		values[5] = CStringGetTextDatum(location);
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
new file mode 100644
index 0000000..b6df411
--- /dev/null
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -0,0 +1,2548 @@
+/*-------------------------------------------------------------------------
+ *
+ * reorderbuffer.c
+ *
+ * PostgreSQL logical replay buffer management
+ *
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/reorderbuffer.c
+ *
+ * NOTES
+ *	  This module gets handed individual pieces of transactions in the order
+ *	  they are written to the WAL and is responsible for reassembling them
+ *	  into toplevel-transaction-sized pieces. When a transaction is completely
+ *	  reassembled - signalled by reading the transaction commit record - it
+ *	  will then call the output plugin (c.f. ReorderBufferCommit()) with the
+ *	  individual changes. The output plugins rely on snapshots built by
+ *	  snapbuild.c, which hands them to us.
+ *
+ *	  Transactions and subtransactions/savepoints in postgres are not
+ *	  immediately linked to each other from outside the performing
+ *	  backend. Only at commit/abort (or via special xact_assignment records)
+ *	  are they linked together, which means that we have to splice together a
+ *	  toplevel transaction from its subtransactions. To do that efficiently we
+ *	  build a binary heap indexed by the smallest current lsn of the individual
+ *	  subtransactions' changestreams. As the individual streams are inherently
+ *	  ordered by LSN - since that is how we build them - the transaction can
+ *	  easily be reassembled by always taking the subtransaction with the
+ *	  smallest current LSN from the heap.
+ *
+ *	  In order to cope with large transactions - which can be several times as
+ *	  big as the available memory - this module supports spooling the contents
+ *	  of large transactions to disk. When the transaction is replayed, the
+ *	  contents of individual (sub-)transactions will be read from disk in
+ *	  chunks.
+ *
+ *	  This module also has to deal with reassembling toast records from the
+ *	  individual chunks stored in WAL. When a new (or initial) version of a
+ *	  tuple is stored in WAL it will always be preceded by the toast chunks
+ *	  emitted for the columns stored out of line. Within a single toplevel
+ *	  transaction there will be no other data carrying records between a row's
+ *	  toast chunks and the row data itself. See ReorderBufferToast* for
+ *	  details.
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "access/xact.h"
+
+#include "catalog/catalog.h"
+
+#include "common/relpath.h"
+
+#include "lib/binaryheap.h"
+
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
+#include "replication/logical.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/sinval.h"
+
+#include "utils/builtins.h"
+#include "utils/combocid.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/relcache.h"
+#include "utils/relfilenodemap.h"
+#include "utils/resowner.h"
+#include "utils/tqual.h"
+
+/*
+ * For efficiency and simplicity reasons we want to keep Snapshots, CommandIds
+ * and ComboCids in the same list with the user visible INSERT/UPDATE/DELETE
+ * changes. We don't want to leak those internal values to external users
+ * though (they would just use switch()...default:) because that would make it
+ * harder to add new user visible values.
+ *
+ * This needs to be synchronized with ReorderBufferChangeType! Adjust the
+ * StaticAssertExpr's in ReorderBufferAllocate if you add anything!
+ */
+typedef enum
+{
+	REORDER_BUFFER_CHANGE_INTERNAL_INSERT,
+	REORDER_BUFFER_CHANGE_INTERNAL_UPDATE,
+	REORDER_BUFFER_CHANGE_INTERNAL_DELETE,
+	REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT,
+	REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID,
+	REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID
+} ReorderBufferChangeTypeInternal;
+
+/* entry for a hash table we use to map from xid to our transaction state */
+typedef struct ReorderBufferTXNByIdEnt
+{
+	TransactionId xid;
+	ReorderBufferTXN *txn;
+} ReorderBufferTXNByIdEnt;
+
+/* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
+typedef struct ReorderBufferTupleCidKey
+{
+	RelFileNode relnode;
+	ItemPointerData tid;
+} ReorderBufferTupleCidKey;
+
+typedef struct ReorderBufferTupleCidEnt
+{
+	ReorderBufferTupleCidKey key;
+	CommandId	cmin;
+	CommandId	cmax;
+	CommandId	combocid;		/* just for debugging */
+} ReorderBufferTupleCidEnt;
+
+/* k-way in-order change iteration support structures */
+typedef struct ReorderBufferIterTXNEntry
+{
+	XLogRecPtr	lsn;
+	ReorderBufferChange *change;
+	ReorderBufferTXN *txn;
+	int			fd;
+	XLogSegNo	segno;
+} ReorderBufferIterTXNEntry;
+
+typedef struct ReorderBufferIterTXNState
+{
+	binaryheap *heap;
+	Size		nr_txns;
+	dlist_head	old_change;
+	ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
+} ReorderBufferIterTXNState;
+
+/* toast datastructures */
+typedef struct ReorderBufferToastEnt
+{
+	Oid			chunk_id;		/* toast_table.chunk_id */
+	int32		last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
+								 * have seen */
+	Size		num_chunks;		/* number of chunks we've already seen */
+	Size		size;			/* combined size of chunks seen */
+	dlist_head	chunks;			/* linked list of chunks */
+	struct varlena *reconstructed;		/* reconstructed varlena now pointed
+										 * to in main tup */
+} ReorderBufferToastEnt;
+
+
+/* maximum number of changes kept in memory per transaction before spilling to disk */
+const Size	max_memtries = 4096;
+
+/* Size of the slab caches used for frequently allocated objects */
+const Size	max_cached_changes = 4096 * 2;
+const Size	max_cached_tuplebufs = 1024;		/* ~8MB */
+const Size	max_cached_transactions = 512;
+
+
+/* ---------------------------------------
+ * primary reorderbuffer support routines
+ * ---------------------------------------
+ */
+static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
+static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
+					  TransactionId xid, bool create, bool *is_new,
+					  XLogRecPtr lsn, bool create_as_top);
+
+static void AssertTXNLsnOrder(ReorderBuffer *rb);
+
+/* ---------------------------------------
+ * support functions for lsn-order iterating over the ->changes of a
+ * transaction and its subtransactions
+ *
+ * used for iteration over the k-way heap merge of a transaction and its
+ * subtransactions
+ * ---------------------------------------
+ */
+static ReorderBufferIterTXNState *ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static ReorderBufferChange *
+			ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
+static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
+						   ReorderBufferIterTXNState *state);
+static void ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn);
+
+/*
+ * ---------------------------------------
+ * Disk serialization support functions
+ * ---------------------------------------
+ */
+static void ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+							 int fd, ReorderBufferChange *change);
+static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
+							int *fd, XLogSegNo *segno);
+static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+						   char *change);
+static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
+
+static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
+static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
+					  ReorderBufferTXN *txn, CommandId cid);
+
+/* ---------------------------------------
+ * toast reassembly support
+ * ---------------------------------------
+ */
+/* Size of an EXTERNAL datum that contains a standard TOAST pointer */
+#define TOAST_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_external))
+
+/* Size of an indirect datum that contains a standard TOAST pointer */
+#define INDIRECT_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_indirect))
+
+static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
+						  Relation relation, ReorderBufferChange *change);
+static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
+							  Relation relation, ReorderBufferChange *change);
+
+
+/*
+ * Allocate a new ReorderBuffer
+ */
+ReorderBuffer *
+ReorderBufferAllocate(void)
+{
+	ReorderBuffer *buffer;
+	HASHCTL		hash_ctl;
+	MemoryContext new_ctx;
+
+	StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_INSERT == (int) REORDER_BUFFER_CHANGE_INSERT, "out of sync enums");
+	StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_UPDATE == (int) REORDER_BUFFER_CHANGE_UPDATE, "out of sync enums");
+	StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_DELETE == (int) REORDER_BUFFER_CHANGE_DELETE, "out of sync enums");
+
+	new_ctx = AllocSetContextCreate(TopMemoryContext,
+									"ReorderBuffer",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+
+	buffer = (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
+
+	memset(&hash_ctl, 0, sizeof(hash_ctl));
+
+	buffer->context = new_ctx;
+
+	hash_ctl.keysize = sizeof(TransactionId);
+	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
+	hash_ctl.hash = tag_hash;
+	hash_ctl.hcxt = buffer->context;
+
+	buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
+								 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+	buffer->by_txn_last_xid = InvalidTransactionId;
+	buffer->by_txn_last_txn = NULL;
+
+	buffer->nr_cached_transactions = 0;
+	buffer->nr_cached_changes = 0;
+	buffer->nr_cached_tuplebufs = 0;
+
+	buffer->outbuf = NULL;
+	buffer->outbufsize = 0;
+
+	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
+
+	dlist_init(&buffer->toplevel_by_lsn);
+	dlist_init(&buffer->cached_transactions);
+	dlist_init(&buffer->cached_changes);
+	slist_init(&buffer->cached_tuplebufs);
+
+	return buffer;
+}
+
+/*
+ * Free a ReorderBuffer
+ */
+void
+ReorderBufferFree(ReorderBuffer *rb)
+{
+	MemoryContext context = rb->context;
+
+	/*
+	 * We free separately allocated data by entirely scrapping our personal
+	 * memory context.
+	 */
+	MemoryContextDelete(context);
+}
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferTXN.
+ */
+static ReorderBufferTXN *
+ReorderBufferGetTXN(ReorderBuffer *rb)
+{
+	ReorderBufferTXN *txn;
+
+	if (rb->nr_cached_transactions > 0)
+	{
+		rb->nr_cached_transactions--;
+		txn = (ReorderBufferTXN *)
+			dlist_container(ReorderBufferTXN, node,
+							dlist_pop_head_node(&rb->cached_transactions));
+	}
+	else
+	{
+		txn = (ReorderBufferTXN *)
+			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
+	}
+
+	memset(txn, 0, sizeof(ReorderBufferTXN));
+
+	dlist_init(&txn->changes);
+	dlist_init(&txn->tuplecids);
+	dlist_init(&txn->subtxns);
+
+	return txn;
+}
+
+/*
+ * Free a ReorderBufferTXN. Deallocation might be delayed for efficiency
+ * purposes.
+ */
+void
+ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	/* clean the lookup cache if we were cached (quite likely) */
+	if (rb->by_txn_last_xid == txn->xid)
+	{
+		rb->by_txn_last_xid = InvalidTransactionId;
+		rb->by_txn_last_txn = NULL;
+	}
+
+	if (txn->tuplecid_hash != NULL)
+	{
+		hash_destroy(txn->tuplecid_hash);
+		txn->tuplecid_hash = NULL;
+	}
+
+	if (txn->invalidations)
+	{
+		pfree(txn->invalidations);
+		txn->invalidations = NULL;
+	}
+
+	if (rb->nr_cached_transactions < max_cached_transactions)
+	{
+		rb->nr_cached_transactions++;
+		dlist_push_head(&rb->cached_transactions, &txn->node);
+		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
+		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
+	}
+	else
+	{
+		pfree(txn);
+	}
+}
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferChange.
+ */
+ReorderBufferChange *
+ReorderBufferGetChange(ReorderBuffer *rb)
+{
+	ReorderBufferChange *change;
+
+	if (rb->nr_cached_changes)
+	{
+		rb->nr_cached_changes--;
+		change = (ReorderBufferChange *)
+			dlist_container(ReorderBufferChange, node,
+							dlist_pop_head_node(&rb->cached_changes));
+	}
+	else
+	{
+		change = (ReorderBufferChange *)
+			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
+	}
+
+	memset(change, 0, sizeof(ReorderBufferChange));
+	return change;
+}
+
+/*
+ * Free a ReorderBufferChange. Deallocation might be delayed for efficiency
+ * purposes.
+ */
+void
+ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
+{
+	switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+	{
+		case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+		case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+		case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+			if (change->newtuple)
+			{
+				ReorderBufferReturnTupleBuf(rb, change->newtuple);
+				change->newtuple = NULL;
+			}
+
+			if (change->oldtuple)
+			{
+				ReorderBufferReturnTupleBuf(rb, change->oldtuple);
+				change->oldtuple = NULL;
+			}
+			break;
+		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+			if (change->snapshot)
+			{
+				ReorderBufferFreeSnap(rb, change->snapshot);
+				change->snapshot = NULL;
+			}
+			break;
+		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+			break;
+		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+			break;
+	}
+
+	if (rb->nr_cached_changes < max_cached_changes)
+	{
+		rb->nr_cached_changes++;
+		dlist_push_head(&rb->cached_changes, &change->node);
+		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
+		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
+	}
+	else
+	{
+		pfree(change);
+	}
+}
+
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferTupleBuf.
+ */
+ReorderBufferTupleBuf *
+ReorderBufferGetTupleBuf(ReorderBuffer *rb)
+{
+	ReorderBufferTupleBuf *tuple;
+
+	if (rb->nr_cached_tuplebufs)
+	{
+		rb->nr_cached_tuplebufs--;
+		tuple = slist_container(ReorderBufferTupleBuf, node,
+								slist_pop_head_node(&rb->cached_tuplebufs));
+#ifdef USE_ASSERT_CHECKING
+		memset(tuple, 0xdeadbeef, sizeof(ReorderBufferTupleBuf));
+#endif
+	}
+	else
+	{
+		tuple = (ReorderBufferTupleBuf *)
+			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTupleBuf));
+	}
+
+	return tuple;
+}
+
+/*
+ * Free a ReorderBufferTupleBuf. Deallocation might be delayed for efficiency
+ * purposes.
+ */
+void
+ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
+{
+	if (rb->nr_cached_tuplebufs < max_cached_tuplebufs)
+	{
+		rb->nr_cached_tuplebufs++;
+		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
+		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
+		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
+	}
+	else
+	{
+		pfree(tuple);
+	}
+}
+
+/*
+ * Return the ReorderBufferTXN from the given buffer, specified by Xid.
+ * If create is true, and a transaction doesn't already exist, create it
+ * (with the given LSN, and as top transaction if that's specified);
+ * when this happens, is_new is set to true.
+ */
+static ReorderBufferTXN *
+ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
+					  bool *is_new, XLogRecPtr lsn, bool create_as_top)
+{
+	ReorderBufferTXN *txn;
+	ReorderBufferTXNByIdEnt *ent;
+	bool		found;
+
+	Assert(TransactionIdIsValid(xid));
+	Assert(!create || lsn != InvalidXLogRecPtr);
+
+	/*
+	 * Check the one-entry lookup cache first
+	 */
+	if (TransactionIdIsValid(rb->by_txn_last_xid) &&
+		rb->by_txn_last_xid == xid)
+	{
+		txn = rb->by_txn_last_txn;
+
+		if (txn != NULL)
+		{
+			/* found it, and it's valid */
+			if (is_new)
+				*is_new = false;
+			return txn;
+		}
+
+		/*
+		 * cached as non-existent, and asked not to create? Then there's
+		 * nothing else to do.
+		 */
+		if (!create)
+			return NULL;
+		/* otherwise fall through to create it */
+	}
+
+	/*
+	 * The cache either wasn't hit, or it told us the transaction doesn't
+	 * exist while we've been asked to create it; fall through to the full
+	 * lookup.
+	 */
+
+	/* search the lookup table */
+	ent = (ReorderBufferTXNByIdEnt *)
+		hash_search(rb->by_txn,
+					(void *) &xid,
+					create ? HASH_ENTER : HASH_FIND,
+					&found);
+	if (found)
+		txn = ent->txn;
+	else if (create)
+	{
+		/* initialize the new entry, if creation was requested */
+		Assert(ent != NULL);
+
+		ent->txn = ReorderBufferGetTXN(rb);
+		ent->txn->xid = xid;
+		txn = ent->txn;
+		txn->first_lsn = lsn;
+		txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
+
+		if (create_as_top)
+		{
+			dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
+			AssertTXNLsnOrder(rb);
+		}
+	}
+	else
+		txn = NULL;				/* not found and not asked to create */
+
+	/* update cache */
+	rb->by_txn_last_xid = xid;
+	rb->by_txn_last_txn = txn;
+
+	if (is_new)
+		*is_new = !found;
+
+	Assert(!create || !!txn);
+	return txn;
+}
+
+/*
+ * Queue a change into a transaction so it can be replayed upon commit.
+ */
+void
+ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
+					   ReorderBufferChange *change)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+	change->lsn = lsn;
+	Assert(InvalidXLogRecPtr != lsn);
+	dlist_push_tail(&txn->changes, &change->node);
+	txn->nentries++;
+	txn->nentries_mem++;
+
+	ReorderBufferCheckSerializeTXN(rb, txn);
+}
+
+static void
+AssertTXNLsnOrder(ReorderBuffer *rb)
+{
+#ifdef USE_ASSERT_CHECKING
+	dlist_iter	iter;
+	XLogRecPtr	prev_first_lsn = InvalidXLogRecPtr;
+
+	dlist_foreach(iter, &rb->toplevel_by_lsn)
+	{
+		ReorderBufferTXN *cur_txn;
+
+		cur_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
+		Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
+
+		if (cur_txn->end_lsn != InvalidXLogRecPtr)
+			Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
+
+		if (prev_first_lsn != InvalidXLogRecPtr)
+			Assert(prev_first_lsn < cur_txn->first_lsn);
+
+		Assert(!cur_txn->is_known_as_subxact);
+		prev_first_lsn = cur_txn->first_lsn;
+	}
+#endif
+}
+
+ReorderBufferTXN *
+ReorderBufferGetOldestTXN(ReorderBuffer *rb)
+{
+	ReorderBufferTXN *txn;
+
+	if (dlist_is_empty(&rb->toplevel_by_lsn))
+		return NULL;
+
+	AssertTXNLsnOrder(rb);
+
+	txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
+
+	Assert(!txn->is_known_as_subxact);
+	Assert(txn->first_lsn != InvalidXLogRecPtr);
+	return txn;
+}
+
+void
+ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
+{
+	rb->current_restart_decoding_lsn = ptr;
+}
+
+void
+ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
+						 TransactionId subxid, XLogRecPtr lsn)
+{
+	ReorderBufferTXN *txn;
+	ReorderBufferTXN *subtxn;
+	bool		new_top;
+	bool		new_sub;
+
+	txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
+	subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
+
+	if (new_sub)
+	{
+		/*
+		 * We assign subtransactions to the top level transaction even if we
+		 * don't have data for them yet; assignment records frequently reference xids
+		 * that have not yet produced any records. Knowing those aren't top
+		 * level xids allows us to make processing cheaper in some places.
+		 */
+		dlist_push_tail(&txn->subtxns, &subtxn->node);
+		txn->nsubtxns++;
+	}
+	else if (!subtxn->is_known_as_subxact)
+	{
+		subtxn->is_known_as_subxact = true;
+		Assert(subtxn->nsubtxns == 0);
+
+		/* remove from lsn order list of top-level transactions */
+		dlist_delete(&subtxn->node);
+
+		/* add to toplevel transaction */
+		dlist_push_tail(&txn->subtxns, &subtxn->node);
+		txn->nsubtxns++;
+	}
+	else if (new_top)
+	{
+		elog(ERROR, "existing subxact assigned to unknown toplevel xact");
+	}
+}
+
+/*
+ * Associate a subtransaction with its toplevel transaction at commit
+ * time. No further changes may be added to the transaction after this.
+ */
+void
+ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
+						 TransactionId subxid, XLogRecPtr commit_lsn,
+						 XLogRecPtr end_lsn)
+{
+	ReorderBufferTXN *txn;
+	ReorderBufferTXN *subtxn;
+
+	subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
+								   InvalidXLogRecPtr, false);
+
+	/*
+	 * No need to do anything if that subtxn didn't contain any changes
+	 */
+	if (!subtxn)
+		return;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, true);
+
+	if (txn == NULL)
+		elog(ERROR, "subxact logged without previous toplevel record");
+
+	subtxn->final_lsn = commit_lsn;
+	subtxn->end_lsn = end_lsn;
+
+	if (!subtxn->is_known_as_subxact)
+	{
+		subtxn->is_known_as_subxact = true;
+		Assert(subtxn->nsubtxns == 0);
+
+		/* remove from lsn order list of top-level transactions */
+		dlist_delete(&subtxn->node);
+
+		/* add to subtransaction list */
+		dlist_push_tail(&txn->subtxns, &subtxn->node);
+		txn->nsubtxns++;
+	}
+}
+
+
+/*
+ * Support for efficiently iterating over a transaction's and its
+ * subtransactions' changes.
+ *
+ * We do this by a k-way merge between transactions/subtransactions. For that
+ * we model the current heads of the different transactions as a binary heap so
+ * we easily know which (sub-)transaction has the change with the smallest lsn
+ * next.
+ *
+ * We assume the changes in individual transactions are already sorted by LSN.
+ */
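+
+/*
+ * Worked example (illustrative): if the toplevel transaction T has changes at
+ * LSNs {10, 40} and a subtransaction S has changes at LSNs {20, 30}, the heap
+ * is seeded with the current heads 10 (T) and 20 (S).  Each iteration pops
+ * the smallest head and replaces it with the next change of the same
+ * (sub-)transaction, yielding the changes at LSNs 10, 20, 30, 40 in order.
+ */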
+
+/*
+ * Binary heap comparison function.  The result is deliberately inverted so
+ * that the entry with the smallest LSN ends up at the top of the heap.
+ */
+static int
+ReorderBufferIterCompare(Datum a, Datum b, void *arg)
+{
+	ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
+	XLogRecPtr	pos_a = state->entries[DatumGetInt32(a)].lsn;
+	XLogRecPtr	pos_b = state->entries[DatumGetInt32(b)].lsn;
+
+	if (pos_a < pos_b)
+		return 1;
+	else if (pos_a == pos_b)
+		return 0;
+	return -1;
+}
+
+/*
+ * Allocate & initialize an iterator which iterates in lsn order over a
+ * transaction and all its subtransactions.
+ */
+static ReorderBufferIterTXNState *
+ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	Size		nr_txns = 0;
+	ReorderBufferIterTXNState *state;
+	dlist_iter	cur_txn_i;
+	int32		off;
+
+	/*
+	 * Calculate the size of our heap: one element for every transaction that
+	 * contains changes.  (Besides the transactions already in the reorder
+	 * buffer, we count the one we were directly passed.)
+	 */
+	if (txn->nentries > 0)
+		nr_txns++;
+
+	dlist_foreach(cur_txn_i, &txn->subtxns)
+	{
+		ReorderBufferTXN *cur_txn;
+
+		cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
+
+		if (cur_txn->nentries > 0)
+			nr_txns++;
+	}
+
+	/*
+	 * XXX: Add fastpath for the rather common nr_txns=1 case, no need to
+	 * allocate/build a heap in that case.
+	 */
+
+	/* allocate iteration state */
+	state = (ReorderBufferIterTXNState *)
+		MemoryContextAllocZero(rb->context,
+							   sizeof(ReorderBufferIterTXNState) +
+							   sizeof(ReorderBufferIterTXNEntry) * nr_txns);
+
+	state->nr_txns = nr_txns;
+	dlist_init(&state->old_change);
+
+	for (off = 0; off < state->nr_txns; off++)
+	{
+		state->entries[off].fd = -1;
+		state->entries[off].segno = 0;
+	}
+
+	/* allocate heap */
+	state->heap = binaryheap_allocate(state->nr_txns, ReorderBufferIterCompare,
+									  state);
+
+	/*
+	 * Now insert items into the binary heap, unordered.  (We will run a heap
+	 * assembly step at the end; this is more efficient.)
+	 */
+
+	off = 0;
+
+	/* add toplevel transaction if it contains changes */
+	if (txn->nentries > 0)
+	{
+		ReorderBufferChange *cur_change;
+
+		if (txn->nentries != txn->nentries_mem)
+			ReorderBufferRestoreChanges(rb, txn, &state->entries[off].fd,
+										&state->entries[off].segno);
+
+		cur_change = dlist_head_element(ReorderBufferChange, node,
+										&txn->changes);
+
+		state->entries[off].lsn = cur_change->lsn;
+		state->entries[off].change = cur_change;
+		state->entries[off].txn = txn;
+
+		binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
+	}
+
+	/* add subtransactions if they contain changes */
+	dlist_foreach(cur_txn_i, &txn->subtxns)
+	{
+		ReorderBufferTXN *cur_txn;
+
+		cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
+
+		if (cur_txn->nentries > 0)
+		{
+			ReorderBufferChange *cur_change;
+
+			if (cur_txn->nentries != cur_txn->nentries_mem)
+				ReorderBufferRestoreChanges(rb, cur_txn,
+											&state->entries[off].fd,
+											&state->entries[off].segno);
+
+			cur_change = dlist_head_element(ReorderBufferChange, node,
+											&cur_txn->changes);
+
+			state->entries[off].lsn = cur_change->lsn;
+			state->entries[off].change = cur_change;
+			state->entries[off].txn = cur_txn;
+
+			binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
+		}
+	}
+
+	/* assemble a valid binary heap */
+	binaryheap_build(state->heap);
+
+	return state;
+}
+
+/*
+ * Remove the on-disk files into which a transaction's changes were spilled,
+ * covering every WAL segment between its first_lsn and final_lsn.
+ */
+static void
+ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	XLogSegNo	first;
+	XLogSegNo	cur;
+	XLogSegNo	last;
+
+	Assert(txn->first_lsn != InvalidXLogRecPtr);
+	Assert(txn->final_lsn != InvalidXLogRecPtr);
+
+	XLByteToSeg(txn->first_lsn, first);
+	XLByteToSeg(txn->final_lsn, last);
+
+	for (cur = first; cur <= last; cur++)
+	{
+		char		path[MAXPGPATH];
+		XLogRecPtr	recptr;
+
+		XLogSegNoOffsetToRecPtr(cur, 0, recptr);
+
+		sprintf(path, "pg_llog/%s/xid-%u-lsn-%X-%X.snap",
+				NameStr(MyLogicalDecodingSlot->name), txn->xid,
+				(uint32) (recptr >> 32), (uint32) recptr);
+		if (unlink(path) != 0 && errno != ENOENT)
+			elog(FATAL, "could not unlink file \"%s\": %m", path);
+	}
+}
+
+/*
+ * Return the next change when iterating over a transaction and its
+ * subtransactions.
+ *
+ * Returns NULL when no further changes exist.
+ */
+static ReorderBufferChange *
+ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
+{
+	ReorderBufferChange *change;
+	ReorderBufferIterTXNEntry *entry;
+	int32		off;
+
+	/* nothing there anymore */
+	if (state->heap->bh_size == 0)
+		return NULL;
+
+	off = DatumGetInt32(binaryheap_first(state->heap));
+	entry = &state->entries[off];
+
+	if (!dlist_is_empty(&entry->txn->subtxns))
+		elog(LOG, "tx with subtxn %u", entry->txn->xid);
+
+	/* free memory we might have "leaked" in the previous *Next call */
+	if (!dlist_is_empty(&state->old_change))
+	{
+		change = dlist_container(ReorderBufferChange, node,
+								 dlist_pop_head_node(&state->old_change));
+		ReorderBufferReturnChange(rb, change);
+		Assert(dlist_is_empty(&state->old_change));
+	}
+
+	change = entry->change;
+
+	/*
+	 * update heap with information about which transaction has the next
+	 * relevant change in LSN order
+	 */
+
+	/* there are in-memory changes */
+	if (dlist_has_next(&entry->txn->changes, &entry->change->node))
+	{
+		dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
+		ReorderBufferChange *next_change =
+		dlist_container(ReorderBufferChange, node, next);
+
+		/* txn stays the same */
+		state->entries[off].lsn = next_change->lsn;
+		state->entries[off].change = next_change;
+
+		binaryheap_replace_first(state->heap, Int32GetDatum(off));
+		return change;
+	}
+
+	/* try to load changes from disk */
+	if (entry->txn->nentries != entry->txn->nentries_mem)
+	{
+		/*
+		 * Ugly: restoring changes will reuse *Change records, thus delete the
+		 * current one from the per-tx list and only free in the next call.
+		 */
+		dlist_delete(&change->node);
+		dlist_push_tail(&state->old_change, &change->node);
+
+		if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->fd,
+										&state->entries[off].segno))
+		{
+			/* successfully restored changes from disk */
+			ReorderBufferChange *next_change =
+			dlist_head_element(ReorderBufferChange, node,
+							   &entry->txn->changes);
+
+			elog(DEBUG2, "restored %zu/%zu changes from disk",
+				 entry->txn->nentries_mem, entry->txn->nentries);
+			Assert(entry->txn->nentries_mem);
+			/* txn stays the same */
+			state->entries[off].lsn = next_change->lsn;
+			state->entries[off].change = next_change;
+			binaryheap_replace_first(state->heap, Int32GetDatum(off));
+
+			return change;
+		}
+	}
+
+	/* ok, no changes there anymore, remove */
+	binaryheap_remove_first(state->heap);
+
+	return change;
+}
+
+/*
+ * Deallocate the iterator
+ */
+static void
+ReorderBufferIterTXNFinish(ReorderBuffer *rb,
+						   ReorderBufferIterTXNState *state)
+{
+	int32		off;
+
+	for (off = 0; off < state->nr_txns; off++)
+	{
+		if (state->entries[off].fd != -1)
+			CloseTransientFile(state->entries[off].fd);
+	}
+
+	/* free memory we might have "leaked" in the last *Next call */
+	if (!dlist_is_empty(&state->old_change))
+	{
+		ReorderBufferChange *change;
+
+		change = dlist_container(ReorderBufferChange, node,
+								 dlist_pop_head_node(&state->old_change));
+		ReorderBufferReturnChange(rb, change);
+		Assert(dlist_is_empty(&state->old_change));
+	}
+
+	binaryheap_free(state->heap);
+	pfree(state);
+}
+
+/*
+ * Cleanup the contents of a transaction, usually after the transaction
+ * committed or aborted.
+ */
+static void
+ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	bool		found;
+	dlist_mutable_iter iter;
+
+	/* cleanup subtransactions & their changes */
+	dlist_foreach_modify(iter, &txn->subtxns)
+	{
+		ReorderBufferTXN *subtxn;
+
+		subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
+		Assert(subtxn->is_known_as_subxact);
+		Assert(subtxn->nsubtxns == 0);
+
+		/*
+		 * Subtransactions are always associated with the toplevel TXN, even if
+		 * they originally were happening inside another subtxn, so we won't
+		 * ever recurse more than one level here.
+		 */
+		ReorderBufferCleanupTXN(rb, subtxn);
+	}
+
+	/* cleanup changes in the toplevel txn */
+	dlist_foreach_modify(iter, &txn->changes)
+	{
+		ReorderBufferChange *change;
+
+		change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+		ReorderBufferReturnChange(rb, change);
+	}
+
+	/*
+	 * Clean up the tuplecids we stored for timetravel access. They are always
+	 * stored in the toplevel transaction.
+	 */
+	dlist_foreach_modify(iter, &txn->tuplecids)
+	{
+		ReorderBufferChange *change;
+
+		change = dlist_container(ReorderBufferChange, node, iter.cur);
+		Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
+		ReorderBufferReturnChange(rb, change);
+	}
+
+	if (txn->base_snapshot != NULL)
+	{
+		SnapBuildSnapDecRefcount(txn->base_snapshot);
+		txn->base_snapshot = NULL;
+	}
+
+	/* delete from list of known subxacts */
+	if (txn->is_known_as_subxact)
+	{
+		dlist_delete(&txn->node);
+	}
+	/* delete from LSN ordered list of toplevel TXNs */
+	else
+	{
+		/* FIXME: adjust nsubxacts count of parent */
+		dlist_delete(&txn->node);
+	}
+
+	/* now remove reference from buffer */
+	hash_search(rb->by_txn,
+				(void *) &txn->xid,
+				HASH_REMOVE,
+				&found);
+	Assert(found);
+
+	/* remove entries spilled to disk */
+	if (txn->nentries != txn->nentries_mem)
+		ReorderBufferRestoreCleanup(rb, txn);
+
+	/* deallocate */
+	ReorderBufferReturnTXN(rb, txn);
+}
+
+/*
+ * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
+ * tqual.c's HeapTupleSatisfiesMVCCDuringDecoding.
+ */
+static void
+ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	dlist_iter	iter;
+	HASHCTL		hash_ctl;
+
+	if (!txn->does_timetravel || dlist_is_empty(&txn->tuplecids))
+		return;
+
+	memset(&hash_ctl, 0, sizeof(hash_ctl));
+
+	hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
+	hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
+	hash_ctl.hash = tag_hash;
+	hash_ctl.hcxt = rb->context;
+
+	/*
+	 * create the hash with the exact number of to-be-stored tuplecids from
+	 * the start
+	 */
+	txn->tuplecid_hash =
+		hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
+					HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+	dlist_foreach(iter, &txn->tuplecids)
+	{
+		ReorderBufferTupleCidKey key;
+		ReorderBufferTupleCidEnt *ent;
+		bool		found;
+		ReorderBufferChange *change;
+
+		change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+		Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
+
+		/* be careful about padding */
+		memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
+
+		key.relnode = change->tuplecid.node;
+
+		ItemPointerCopy(&change->tuplecid.tid,
+						&key.tid);
+
+		ent = (ReorderBufferTupleCidEnt *)
+			hash_search(txn->tuplecid_hash,
+						(void *) &key,
+						HASH_ENTER | HASH_FIND,
+						&found);
+		if (!found)
+		{
+			ent->cmin = change->tuplecid.cmin;
+			ent->cmax = change->tuplecid.cmax;
+			ent->combocid = change->tuplecid.combocid;
+		}
+		else
+		{
+			Assert(ent->cmin == change->tuplecid.cmin);
+			Assert(ent->cmax == InvalidCommandId ||
+				   ent->cmax == change->tuplecid.cmax);
+
+			/*
+			 * If the tuple was created in this transaction and is now being
+			 * deleted, we already have a valid cmin stored; the previously
+			 * stored cmax is InvalidCommandId, so overwrite it here.
+			 */
+			ent->cmax = change->tuplecid.cmax;
+		}
+	}
+}
+
+/*
+ * Copy a provided snapshot so we can modify it privately. This is needed so
+ * that catalog modifying transactions can look into intermediate catalog
+ * states.
+ */
+static Snapshot
+ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
+					  ReorderBufferTXN *txn, CommandId cid)
+{
+	Snapshot	snap;
+	dlist_iter	iter;
+	int			i = 0;
+	Size		size;
+
+	size = sizeof(SnapshotData) +
+		sizeof(TransactionId) * orig_snap->xcnt +
+		sizeof(TransactionId) * (txn->nsubtxns + 1);
+
+	elog(DEBUG1, "copying a non-transaction-specific snapshot into timetravel tx %u", txn->xid);
+
+	snap = MemoryContextAllocZero(rb->context, size);
+	memcpy(snap, orig_snap, sizeof(SnapshotData));
+
+	snap->copied = true;
+	snap->active_count = 0;
+	snap->regd_count = 0;
+	snap->xip = (TransactionId *) (snap + 1);
+
+	memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
+
+	/*
+	 * ->subxip contains all txids that belong to our transaction which we
+	 * need to check via cmin/cmax. That's why we store the toplevel
+	 * transaction in there as well.
+	 */
+	snap->subxip = snap->xip + snap->xcnt;
+	snap->subxip[i++] = txn->xid;
+	snap->subxcnt = txn->nsubtxns + 1;
+
+	dlist_foreach(iter, &txn->subtxns)
+	{
+		ReorderBufferTXN *sub_txn;
+
+		sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
+		snap->subxip[i++] = sub_txn->xid;
+	}
+
+	/* sort so we can bsearch() later */
+	qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
+
+	/* store the specified current CommandId */
+	snap->curcid = cid;
+
+	return snap;
+}
+
+/*
+ * Free a previously ReorderBufferCopySnap'ed snapshot
+ */
+static void
+ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
+{
+	if (snap->copied)
+		pfree(snap);
+	else
+		SnapBuildSnapDecRefcount(snap);
+}
+
+/*
+ * Commit a transaction and replay all actions that previously have been
+ * ReorderBufferQueueChange'd in the toplevel TX or any of the subtransactions
+ * assigned via ReorderBufferCommitChild.
+ */
+void
+ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, XLogRecPtr commit_lsn,
+					XLogRecPtr end_lsn)
+{
+	ReorderBufferTXN *txn;
+	ReorderBufferIterTXNState *iterstate = NULL;
+	ReorderBufferChange *change;
+	CommandId	command_id = FirstCommandId;
+	volatile Snapshot snapshot_now;
+	Relation	relation = NULL;
+	Oid reloid;
+	bool is_transaction_state = IsTransactionOrTransactionBlock();
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+
+	/* empty transaction */
+	if (txn == NULL)
+		return;
+
+	txn->final_lsn = commit_lsn;
+	txn->end_lsn = end_lsn;
+
+	/* serialize the remaining in-memory changes if we have to read from disk anyway */
+	if (txn->nentries_mem != txn->nentries)
+		ReorderBufferSerializeTXN(rb, txn);
+
+	/*
+	 * If this transaction didn't have any real changes in our database, it's
+	 * OK not to have a snapshot.
+	 */
+	if (txn->base_snapshot == NULL)
+		return;
+
+	snapshot_now = txn->base_snapshot;
+
+	ReorderBufferBuildTupleCidHash(rb, txn);
+
+	/* setup initial snapshot */
+	SetupDecodingSnapshots(snapshot_now, txn->tuplecid_hash);
+
+	PG_TRY();
+	{
+		/*
+		 * Decoding needs access to syscaches et al., which in turn use
+		 * heavyweight locks and such. Thus we need to have enough state around
+		 * to keep track of those. The easiest way is to simply use a
+		 * transaction internally. That also allows us to easily enforce that
+		 * nothing writes to the database by checking for xid assignments.
+		 *
+		 * When we're called via the SQL SRF there's already a transaction
+		 * started, so start an explicit subtransaction there.
+		 */
+		if (is_transaction_state)
+			BeginInternalSubTransaction("replay");
+		else
+			StartTransactionCommand();
+
+		rb->begin(rb, txn);
+
+		iterstate = ReorderBufferIterTXNInit(rb, txn);
+		while ((change = ReorderBufferIterTXNNext(rb, iterstate)))
+		{
+			switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+			{
+				case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+				case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+				case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+					Assert(snapshot_now);
+
+					reloid = RelidByRelfilenode(change->relnode.spcNode,
+												change->relnode.relNode);
+
+					/*
+					 * catalog tuple without data, while catalog has been
+					 * rewritten
+					 */
+					if (reloid == InvalidOid &&
+						change->newtuple == NULL && change->oldtuple == NULL)
+						continue;
+					else if (reloid == InvalidOid)
+						elog(ERROR, "could not look up relation %s",
+							 relpathperm(change->relnode, MAIN_FORKNUM));
+
+					relation = RelationIdGetRelation(reloid);
+
+					if (relation == NULL)
+						elog(ERROR, "could not open relation descriptor %s",
+							 relpathperm(change->relnode, MAIN_FORKNUM));
+
+					if (RelationIsLogicallyLogged(relation))
+					{
+						/* user-triggered change */
+						if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
+						{
+						}
+						else if (!IsToastRelation(relation))
+						{
+							ReorderBufferToastReplace(rb, txn, relation, change);
+							rb->apply_change(rb, txn, relation, change);
+							ReorderBufferToastReset(rb, txn);
+						}
+						/* we're not interested in toast deletions */
+						else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
+						{
+							/*
+							 * need to reassemble change in memory, ensure it
+							 * doesn't get reused till we're done.
+							 */
+							dlist_delete(&change->node);
+							ReorderBufferToastAppendChunk(rb, txn, relation,
+														  change);
+						}
+
+					}
+					RelationClose(relation);
+					break;
+				case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+					/* XXX: we could skip snapshots in non toplevel txns */
+
+					/* get rid of the old */
+					RevertFromDecodingSnapshots();
+
+					if (snapshot_now->copied)
+					{
+						ReorderBufferFreeSnap(rb, snapshot_now);
+						snapshot_now =
+							ReorderBufferCopySnap(rb, change->snapshot,
+												  txn, command_id);
+					}
+
+					/*
+					 * restored from disk, we need to be careful not to double
+					 * free. We could introduce refcounting for that, but for
+					 * now this seems infrequent enough not to care.
+					 */
+					else if (change->snapshot->copied)
+					{
+						snapshot_now =
+							ReorderBufferCopySnap(rb, change->snapshot,
+												  txn, command_id);
+					}
+					else
+					{
+						snapshot_now = change->snapshot;
+					}
+
+
+					/* and start with the new one */
+					SetupDecodingSnapshots(snapshot_now, txn->tuplecid_hash);
+					break;
+
+				case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+					if (!snapshot_now->copied)
+					{
+						/* we don't use the global one anymore */
+						snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
+															 txn, command_id);
+					}
+
+					command_id = Max(command_id, change->command_id);
+
+					if (command_id != InvalidCommandId)
+					{
+						snapshot_now->curcid = command_id;
+
+						RevertFromDecodingSnapshots();
+						SetupDecodingSnapshots(snapshot_now, txn->tuplecid_hash);
+					}
+
+					/*
+					 * every time the CommandId is incremented, we could see
+					 * new catalog contents
+					 */
+					ReorderBufferExecuteInvalidations(rb, txn);
+
+					break;
+
+				case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+					elog(ERROR, "tuplecid value in normal queue");
+					break;
+			}
+		}
+
+		ReorderBufferIterTXNFinish(rb, iterstate);
+
+		/* call commit callback */
+		rb->commit(rb, txn, commit_lsn);
+
+		/* make sure nothing has written anything */
+		if (GetTopTransactionIdIfAny() != InvalidTransactionId)
+			elog(ERROR, "cannot write during replay");
+
+		/*
+		 * Aborting the subtransaction, or the transaction as a whole, has the
+		 * right semantics: we want all locks acquired in here to be released,
+		 * not reassigned to the parent, and we do not want any database access
+		 * to have persistent effects.
+		 */
+		if (is_transaction_state)
+			RollbackAndReleaseCurrentSubTransaction();
+		else
+			AbortCurrentTransaction();
+
+		/* make sure there's no cache pollution */
+		ReorderBufferExecuteInvalidations(rb, txn);
+
+		/* cleanup */
+		RevertFromDecodingSnapshots();
+
+		if (snapshot_now->copied)
+			ReorderBufferFreeSnap(rb, snapshot_now);
+
+		ReorderBufferCleanupTXN(rb, txn);
+	}
+	PG_CATCH();
+	{
+		/* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
+		if (iterstate)
+			ReorderBufferIterTXNFinish(rb, iterstate);
+
+		if (is_transaction_state)
+			RollbackAndReleaseCurrentSubTransaction();
+		else
+			AbortCurrentTransaction();
+
+		ReorderBufferExecuteInvalidations(rb, txn);
+
+		RevertFromDecodingSnapshots();
+
+		if (snapshot_now->copied)
+			ReorderBufferFreeSnap(rb, snapshot_now);
+
+		/*
+		 * We don't do a ReorderBufferCleanupTXN here, with the vague idea of
+		 * allowing decoding to be retried.
+		 */
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Abort a transaction that possibly has previous changes. Needs to be done
+ * independently for toplevel and subtransactions.
+ */
+void
+ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+
+	/* unknown transaction, nothing to remove */
+	if (txn == NULL)
+		return;
+
+	txn->final_lsn = lsn;
+
+	ReorderBufferCleanupTXN(rb, txn);
+}
+
+/*
+ * Check whether a transaction is already known in this module
+ */
+bool
+ReorderBufferIsXidKnown(ReorderBuffer *rb, TransactionId xid)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+	return txn != NULL;
+}
+
+/*
+ * Add a new snapshot to this transaction that is only used after lsn 'lsn'.
+ */
+void
+ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
+						 XLogRecPtr lsn, Snapshot snap)
+{
+	ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+	change->snapshot = snap;
+	change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
+
+	ReorderBufferQueueChange(rb, xid, lsn, change);
+}
+
+/*
+ * Setup the base snapshot of a transaction. That is the snapshot that is used
+ * to decode all changes until either this transaction modifies the catalog or
+ * another catalog modifying transaction commits.
+ */
+void
+ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
+							 XLogRecPtr lsn, Snapshot snap)
+{
+	ReorderBufferTXN *txn;
+	bool		is_new;
+
+	txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
+	Assert(txn->base_snapshot == NULL);
+
+	txn->base_snapshot = snap;
+}
+
+/*
+ * Access the catalog with this CommandId at this point in the changestream.
+ *
+ * May only be called for command ids > 1
+ */
+void
+ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
+							 XLogRecPtr lsn, CommandId cid)
+{
+	ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+	change->command_id = cid;
+	change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
+
+	ReorderBufferQueueChange(rb, xid, lsn, change);
+}
+
+
+/*
+ * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
+ */
+void
+ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
+							 XLogRecPtr lsn, RelFileNode node,
+							 ItemPointerData tid, CommandId cmin,
+							 CommandId cmax, CommandId combocid)
+{
+	ReorderBufferChange *change = ReorderBufferGetChange(rb);
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+	change->tuplecid.node = node;
+	change->tuplecid.tid = tid;
+	change->tuplecid.cmin = cmin;
+	change->tuplecid.cmax = cmax;
+	change->tuplecid.combocid = combocid;
+	change->lsn = lsn;
+	change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
+
+	dlist_push_tail(&txn->tuplecids, &change->node);
+	txn->ntuplecids++;
+}
+
+/*
+ * Setup the invalidation of the toplevel transaction.
+ *
+ * This needs to be done before ReorderBufferCommit is called!
+ */
+void
+ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
+							  XLogRecPtr lsn, Size nmsgs,
+							  SharedInvalidationMessage *msgs)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+	if (txn->ninvalidations != 0)
+		elog(ERROR, "only ever add one set of invalidations");
+
+	Assert(nmsgs > 0);
+
+	txn->ninvalidations = nmsgs;
+	txn->invalidations = (SharedInvalidationMessage *)
+		MemoryContextAlloc(rb->context,
+						   sizeof(SharedInvalidationMessage) * nmsgs);
+	memcpy(txn->invalidations, msgs, sizeof(SharedInvalidationMessage) * nmsgs);
+}
+
+/*
+ * Apply all invalidations we know. Possibly we only need parts at this point
+ * in the changestream but we don't know which those are.
+ */
+static void
+ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	int			i;
+
+	for (i = 0; i < txn->ninvalidations; i++)
+		LocalExecuteInvalidationMessage(&txn->invalidations[i]);
+}
+
+/*
+ * Mark a transaction as doing timetravel.
+ */
+void
+ReorderBufferXidSetTimetravel(ReorderBuffer *rb, TransactionId xid,
+							  XLogRecPtr lsn)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+	txn->does_timetravel = true;
+}
+
+/*
+ * Query whether a transaction is already *known* to be doing timetravel. This
+ * can be wrong until directly before the commit!
+ */
+bool
+ReorderBufferXidDoesTimetravel(ReorderBuffer *rb, TransactionId xid)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+	if (txn == NULL)
+		return false;
+
+	return txn->does_timetravel;
+}
+
+/*
+ * Have we already added the first snapshot?
+ */
+bool
+ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+
+	/* transaction isn't known yet, ergo no snapshot */
+	if (txn == NULL)
+		return false;
+
+	return txn->base_snapshot != NULL;
+}
+
+static void
+ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
+{
+	if (!rb->outbufsize)
+	{
+		rb->outbuf = MemoryContextAlloc(rb->context, sz);
+		rb->outbufsize = sz;
+	}
+	else if (rb->outbufsize < sz)
+	{
+		rb->outbuf = repalloc(rb->outbuf, sz);
+		rb->outbufsize = sz;
+	}
+}
+
+typedef struct ReorderBufferDiskChange
+{
+	Size		size;
+	ReorderBufferChange change;
+	/* data follows */
+} ReorderBufferDiskChange;
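+
+/*
+ * Sketch of the on-disk layout of one spilled change (illustrative; see
+ * ReorderBufferSerializeChange() for the authoritative logic):
+ *
+ *   | size (total length) | ReorderBufferChange | variable-length payload |
+ *
+ * For INSERT/UPDATE/DELETE changes the payload consists of the old and/or new
+ * tuple, for snapshot changes it is the SnapshotData followed by the xip and
+ * subxip arrays, and COMMAND_ID/TUPLECID changes carry no extra payload.
+ */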
+
+/*
+ * Persistence support
+ */
+static void
+ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+							 int fd, ReorderBufferChange *change)
+{
+	ReorderBufferDiskChange *ondisk;
+	Size		sz = sizeof(ReorderBufferDiskChange);
+
+	ReorderBufferSerializeReserve(rb, sz);
+
+	ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+	memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
+
+	switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+	{
+		case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+			/* fall through */
+		case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+			/* fall through */
+		case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+			{
+				char	   *data;
+				Size		oldlen = 0;
+				Size		newlen = 0;
+
+				if (change->oldtuple)
+					oldlen = offsetof(ReorderBufferTupleBuf, data)
+						+change->oldtuple->tuple.t_len
+						- offsetof(HeapTupleHeaderData, t_bits);
+
+				if (change->newtuple)
+					newlen = offsetof(ReorderBufferTupleBuf, data)
+						+change->newtuple->tuple.t_len
+						- offsetof(HeapTupleHeaderData, t_bits);
+
+				sz += oldlen;
+				sz += newlen;
+
+				/* make sure we have enough space */
+				ReorderBufferSerializeReserve(rb, sz);
+
+				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+				/* might have been reallocated above */
+				ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+				if (oldlen)
+				{
+					memcpy(data, change->oldtuple, oldlen);
+					data += oldlen;
+					Assert(&change->oldtuple->header == change->oldtuple->tuple.t_data);
+				}
+
+				if (newlen)
+				{
+					memcpy(data, change->newtuple, newlen);
+					data += newlen;
+					Assert(&change->newtuple->header == change->newtuple->tuple.t_data);
+				}
+				break;
+			}
+		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+			{
+				char	   *data;
+
+				sz += sizeof(SnapshotData) +
+					sizeof(TransactionId) * change->snapshot->xcnt +
+					sizeof(TransactionId) * change->snapshot->subxcnt
+					;
+
+				/* make sure we have enough space */
+				ReorderBufferSerializeReserve(rb, sz);
+				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+				/* might have been reallocated above */
+				ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+				memcpy(data, change->snapshot, sizeof(SnapshotData));
+				data += sizeof(SnapshotData);
+
+				if (change->snapshot->xcnt)
+				{
+					memcpy(data, change->snapshot->xip,
+						   sizeof(TransactionId) * change->snapshot->xcnt);
+					data += sizeof(TransactionId) * change->snapshot->xcnt;
+				}
+
+				if (change->snapshot->subxcnt)
+				{
+					memcpy(data, change->snapshot->subxip,
+						   sizeof(TransactionId) * change->snapshot->subxcnt);
+					data += sizeof(TransactionId) * change->snapshot->subxcnt;
+				}
+				break;
+			}
+		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+			/* ReorderBufferChange contains everything important */
+			break;
+		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+			/* ReorderBufferChange contains everything important */
+			break;
+	}
+
+	ondisk->size = sz;
+
+	if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
+	{
+		CloseTransientFile(fd);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to data file for xid %u: %m",
+						txn->xid)));
+	}
+
+	Assert(ondisk->change.action_internal == change->action_internal);
+}
+
+static void
+ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	/* FIXME subtxn handling? */
+	if (txn->nentries_mem >= max_memtries)
+	{
+		ReorderBufferSerializeTXN(rb, txn);
+		Assert(txn->nentries_mem == 0);
+	}
+}
+
+static void
+ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	dlist_iter	subtxn_i;
+	dlist_mutable_iter change_i;
+	int			fd = -1;
+	XLogSegNo	curOpenSegNo = 0;
+	Size		spilled = 0;
+	char		path[MAXPGPATH];
+
+	elog(DEBUG2, "spill %zu changes in tx %u to disk",
+		 txn->nentries_mem, txn->xid);
+
+	/* do the same to all child TXs */
+	dlist_foreach(subtxn_i, &txn->subtxns)
+	{
+		ReorderBufferTXN *subtxn;
+
+		subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
+		ReorderBufferSerializeTXN(rb, subtxn);
+	}
+
+	/* serialize changestream */
+	dlist_foreach_modify(change_i, &txn->changes)
+	{
+		ReorderBufferChange *change;
+
+		change = dlist_container(ReorderBufferChange, node, change_i.cur);
+
+		/*
+		 * store the change in the segment its start lsn falls into; don't
+		 * split a change over multiple segments
+		 */
+		if (fd == -1 || !XLByteInSeg(change->lsn, curOpenSegNo))
+		{
+			XLogRecPtr	recptr;
+
+			if (fd != -1)
+				CloseTransientFile(fd);
+
+			XLByteToSeg(change->lsn, curOpenSegNo);
+			XLogSegNoOffsetToRecPtr(curOpenSegNo, 0, recptr);
+
+			sprintf(path, "pg_llog/%s/xid-%u-lsn-%X-%X.snap",
+					NameStr(MyLogicalDecodingSlot->name), txn->xid,
+					(uint32) (recptr >> 32), (uint32) recptr);
+
+			/* open segment, create it if necessary */
+			fd = OpenTransientFile(path,
+								   O_CREAT | O_WRONLY | O_APPEND | PG_BINARY,
+								   S_IRUSR | S_IWUSR);
+
+			if (fd < 0)
+				ereport(ERROR, (errmsg("could not open reorderbuffer file %s for writing: %m", path)));
+		}
+
+		ReorderBufferSerializeChange(rb, txn, fd, change);
+		dlist_delete(&change->node);
+		ReorderBufferReturnChange(rb, change);
+
+		spilled++;
+	}
+
+	Assert(spilled == txn->nentries_mem);
+	Assert(dlist_is_empty(&txn->changes));
+	txn->nentries_mem = 0;
+
+	if (fd != -1)
+		CloseTransientFile(fd);
+
+	/* issue write barrier */
+	/* serialize main transaction state */
+}
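+
+/*
+ * Example with made-up values: while decoding on a slot named "s1", a
+ * transaction with xid 1234 whose changes start in the WAL segment beginning
+ * at 0/1000000 is spilled to "pg_llog/s1/xid-1234-lsn-0-1000000.snap".
+ * Changes falling into later segments go into correspondingly named files,
+ * which ReorderBufferRestoreChanges() later reads back and
+ * ReorderBufferRestoreCleanup() removes.
+ */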
+
+static Size
+ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
+							int *fd, XLogSegNo *segno)
+{
+	Size		restored = 0;
+	XLogSegNo	last_segno;
+	dlist_mutable_iter cleanup_iter;
+
+	Assert(txn->first_lsn != InvalidXLogRecPtr);
+	Assert(txn->final_lsn != InvalidXLogRecPtr);
+
+	/* free current entries, so we have memory for more */
+	dlist_foreach_modify(cleanup_iter, &txn->changes)
+	{
+		ReorderBufferChange *cleanup =
+		dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
+
+		dlist_delete(&cleanup->node);
+		ReorderBufferReturnChange(rb, cleanup);
+	}
+	txn->nentries_mem = 0;
+	Assert(dlist_is_empty(&txn->changes));
+
+	XLByteToSeg(txn->final_lsn, last_segno);
+
+	while (restored < max_memtries && *segno <= last_segno)
+	{
+		int			readBytes;
+		ReorderBufferDiskChange *ondisk;
+
+		if (*fd == -1)
+		{
+			XLogRecPtr	recptr;
+			char		path[MAXPGPATH];
+
+			/* first time in */
+			if (*segno == 0)
+			{
+				XLByteToSeg(txn->first_lsn, *segno);
+				elog(LOG, "initial restoring from %zu to %zu",
+					 *segno, last_segno);
+			}
+
+			Assert(*segno != 0 || dlist_is_empty(&txn->changes));
+			XLogSegNoOffsetToRecPtr(*segno, 0, recptr);
+
+			sprintf(path, "pg_llog/%s/xid-%u-lsn-%X-%X.snap",
+					NameStr(MyLogicalDecodingSlot->name), txn->xid,
+					(uint32) (recptr >> 32), (uint32) recptr);
+
+			elog(LOG, "opening file %s", path);
+
+			*fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+			if (*fd < 0 && errno == ENOENT)
+			{
+				*fd = -1;
+				(*segno)++;
+				continue;
+			}
+			else if (*fd < 0)
+				ereport(ERROR, (errmsg("could not open reorderbuffer file %s for reading: %m", path)));
+
+		}
+
+		ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
+
+
+		/*
+		 * read the statically sized part of a change which has information
+		 * about the total size. If we couldn't read a record, we're at the
+		 * end of this file.
+		 */
+
+		readBytes = read(*fd, rb->outbuf, sizeof(ReorderBufferDiskChange));
+
+		/* eof */
+		if (readBytes == 0)
+		{
+			CloseTransientFile(*fd);
+			*fd = -1;
+			(*segno)++;
+			continue;
+		}
+		else if (readBytes < 0)
+			elog(ERROR, "read failed: %m");
+		else if (readBytes != sizeof(ReorderBufferDiskChange))
+			elog(ERROR, "incomplete read, read %d instead of %zu",
+				 readBytes, sizeof(ReorderBufferDiskChange));
+
+		ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+		ReorderBufferSerializeReserve(rb,
+									  sizeof(ReorderBufferDiskChange) + ondisk->size);
+		ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+		readBytes = read(*fd, rb->outbuf + sizeof(ReorderBufferDiskChange),
+						 ondisk->size - sizeof(ReorderBufferDiskChange));
+
+		if (readBytes < 0)
+			elog(ERROR, "read2 failed: %m");
+		else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
+			elog(ERROR, "incomplete read2, read %d instead of %zu",
+				 readBytes, ondisk->size - sizeof(ReorderBufferDiskChange));
+
+		/*
+		 * ok, read a full change from disk, now restore it into proper
+		 * in-memory format
+		 */
+		ReorderBufferRestoreChange(rb, txn, rb->outbuf);
+		restored++;
+	}
+
+	return restored;
+}
+
+/*
+ * Convert change from its on-disk format to in-memory format and queue it onto
+ * the TXN's ->changes list.
+ */
+static void
+ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+						   char *data)
+{
+	ReorderBufferDiskChange *ondisk;
+	ReorderBufferChange *change;
+
+	ondisk = (ReorderBufferDiskChange *) data;
+
+	change = ReorderBufferGetChange(rb);
+
+	/* copy static part */
+	memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
+
+	data += sizeof(ReorderBufferDiskChange);
+
+	/* restore individual stuff */
+	switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+	{
+		case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+			/* fall through */
+		case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+			/* fall through */
+		case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+			if (change->newtuple)
+			{
+				Size		len = offsetof(ReorderBufferTupleBuf, data)
+				+((ReorderBufferTupleBuf *) data)->tuple.t_len
+				- offsetof(HeapTupleHeaderData, t_bits);
+
+				change->newtuple = ReorderBufferGetTupleBuf(rb);
+				memcpy(change->newtuple, data, len);
+				change->newtuple->tuple.t_data = &change->newtuple->header;
+
+				data += len;
+			}
+
+			if (change->oldtuple)
+			{
+				Size		len = offsetof(ReorderBufferTupleBuf, data)
+				+((ReorderBufferTupleBuf *) data)->tuple.t_len
+				- offsetof(HeapTupleHeaderData, t_bits);
+
+				change->oldtuple = ReorderBufferGetTupleBuf(rb);
+				memcpy(change->oldtuple, data, len);
+				change->oldtuple->tuple.t_data = &change->oldtuple->header;
+				data += len;
+			}
+			break;
+		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+			{
+				Snapshot	oldsnap = (Snapshot) data;
+				Size		size = sizeof(SnapshotData) +
+				sizeof(TransactionId) * oldsnap->xcnt +
+				sizeof(TransactionId) * (oldsnap->subxcnt + 0)
+						   ;
+
+				Assert(change->snapshot != NULL);
+
+				change->snapshot = MemoryContextAllocZero(rb->context, size);
+
+				memcpy(change->snapshot, data, size);
+				change->snapshot->xip = (TransactionId *)
+					(((char *) change->snapshot) + sizeof(SnapshotData));
+				change->snapshot->subxip =
+					change->snapshot->xip + change->snapshot->xcnt + 0;
+				change->snapshot->copied = true;
+				break;
+			}
+			/* nothing needs to be done */
+		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+			break;
+	}
+
+	dlist_push_tail(&txn->changes, &change->node);
+	txn->nentries_mem++;
+}
+
+/*
+ * Delete all data spilled to disk after we've restarted/crashed. It will be
+ * recreated when the respective slots are reused.
+ */
+void
+ReorderBufferStartup(void)
+{
+	DIR		   *logical_dir;
+	struct dirent *logical_de;
+
+	DIR		   *spill_dir;
+	struct dirent *spill_de;
+
+	logical_dir = AllocateDir("pg_llog");
+	while ((logical_de = ReadDir(logical_dir, "pg_llog")) != NULL)
+	{
+		char		path[MAXPGPATH];
+
+		if (strcmp(logical_de->d_name, ".") == 0 ||
+			strcmp(logical_de->d_name, "..") == 0)
+			continue;
+
+		/* one of our own directories */
+		if (strcmp(logical_de->d_name, "snapshots") == 0)
+			continue;
+
+		/*
+		 * ok, has to be a surviving logical slot, iterate and delete
+		 * everything starting with xid-*
+		 */
+		sprintf(path, "pg_llog/%s", logical_de->d_name);
+
+		spill_dir = AllocateDir(path);
+		while ((spill_de = ReadDir(spill_dir, "pg_llog")) != NULL)
+		{
+			if (strcmp(spill_de->d_name, ".") == 0 ||
+				strcmp(spill_de->d_name, "..") == 0)
+				continue;
+
+			if (strncmp(spill_de->d_name, "xid", 3) == 0)
+			{
+				sprintf(path, "pg_llog/%s/%s", logical_de->d_name,
+						spill_de->d_name);
+
+				if (unlink(path) != 0)
+					ereport(PANIC,
+							(errcode_for_file_access(),
+						  errmsg("could not remove xid data file \"%s\": %m",
+								 path)));
+			}
+			/* XXX: WARN? */
+		}
+		FreeDir(spill_dir);
+	}
+	FreeDir(logical_dir);
+}
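+
+/*
+ * For illustration (derived from the code above, not an additional
+ * guarantee): after a restart this removes spilled files such as
+ * pg_llog/<slot>/xid-* while leaving pg_llog/snapshots/ untouched; the spill
+ * files get recreated once the respective slot starts decoding again.
+ */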
+
+/*
+ * toast support
+ */
+
+/*
+ * Macros copied from tuptoaster.c. Perhaps these should live in a toast_internal.h?
+ */
+#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr)	\
+do { \
+	varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \
+	Assert(VARATT_IS_EXTERNAL(attre)); \
+	Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \
+	memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
+} while (0)
+
+#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \
+	((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ)
+
+/*
+ * Initialize per tuple toast reconstruction support.
+ */
+static void
+ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	HASHCTL		hash_ctl;
+
+	Assert(txn->toast_hash == NULL);
+
+	memset(&hash_ctl, 0, sizeof(hash_ctl));
+	hash_ctl.keysize = sizeof(Oid);
+	hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
+	hash_ctl.hash = tag_hash;
+	hash_ctl.hcxt = rb->context;
+	txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
+								  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+}
+
+/*
+ * Per toast-chunk handling for toast reconstruction
+ *
+ * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
+ * toasted Datum comes along.
+ */
+static void
+ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
+							  Relation relation, ReorderBufferChange *change)
+{
+	ReorderBufferToastEnt *ent;
+	bool		found;
+	int32		chunksize;
+	bool		isnull;
+	Pointer		chunk;
+	TupleDesc	desc = RelationGetDescr(relation);
+	Oid			chunk_id;
+	Oid			chunk_seq;
+
+	if (txn->toast_hash == NULL)
+		ReorderBufferToastInitHash(rb, txn);
+
+	Assert(IsToastRelation(relation));
+
+	chunk_id = DatumGetObjectId(fastgetattr(&change->newtuple->tuple, 1, desc, &isnull));
+	Assert(!isnull);
+	chunk_seq = DatumGetInt32(fastgetattr(&change->newtuple->tuple, 2, desc, &isnull));
+	Assert(!isnull);
+
+	ent = (ReorderBufferToastEnt *)
+		hash_search(txn->toast_hash,
+					(void *) &chunk_id,
+					HASH_ENTER,
+					&found);
+
+	if (!found)
+	{
+		Assert(ent->chunk_id == chunk_id);
+		ent->num_chunks = 0;
+		ent->last_chunk_seq = 0;
+		ent->size = 0;
+		ent->reconstructed = NULL;
+		dlist_init(&ent->chunks);
+
+		if (chunk_seq != 0)
+			elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
+				 chunk_seq, chunk_id);
+	}
+	else if (found && chunk_seq != ent->last_chunk_seq + 1)
+		elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
+			 chunk_seq, chunk_id, ent->last_chunk_seq + 1);
+
+	chunk = DatumGetPointer(fastgetattr(&change->newtuple->tuple, 3, desc, &isnull));
+	Assert(!isnull);
+
+	/* calculate size so we can allocate the right size at once later */
+	if (!VARATT_IS_EXTENDED(chunk))
+		chunksize = VARSIZE(chunk) - VARHDRSZ;
+	else if (VARATT_IS_SHORT(chunk))
+		/* could happen due to heap_form_tuple doing its thing */
+		chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
+	else
+		elog(ERROR, "unexpected type of toast chunk");
+
+	ent->size += chunksize;
+	ent->last_chunk_seq = chunk_seq;
+	ent->num_chunks++;
+	dlist_push_tail(&ent->chunks, &change->node);
+}
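+
+/*
+ * For reference (an assumption based on the standard layout of toast
+ * relations, not something this patch defines): each toast table row stores
+ * (chunk_id oid, chunk_seq int4, chunk_data bytea) as attributes 1..3, which
+ * is what the fastgetattr() calls above rely on.
+ */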
+
+/*
+ * Rejigger change->newtuple to point to in-memory toast tuples instead of
+ * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
+ *
+ * We cannot replace unchanged toast tuples though, so those will still point
+ * to on-disk toast data.
+ */
+static void
+ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
+						  Relation relation, ReorderBufferChange *change)
+{
+	TupleDesc	desc;
+	int			natt;
+	Datum	   *attrs;
+	bool	   *isnull;
+	bool	   *free;
+	HeapTuple	newtup;
+	Relation	toast_rel;
+	TupleDesc	toast_desc;
+	MemoryContext oldcontext;
+
+	/* no toast tuples changed */
+	if (txn->toast_hash == NULL)
+		return;
+
+	oldcontext = MemoryContextSwitchTo(rb->context);
+
+	/* we should only have toast tuples in an INSERT or UPDATE */
+	Assert(change->newtuple);
+
+	desc = RelationGetDescr(relation);
+
+	toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
+	toast_desc = RelationGetDescr(toast_rel);
+
+	/* should we allocate from stack instead? */
+	attrs = palloc0(sizeof(Datum) * desc->natts);
+	isnull = palloc0(sizeof(bool) * desc->natts);
+	free = palloc0(sizeof(bool) * desc->natts);
+
+	heap_deform_tuple(&change->newtuple->tuple, desc,
+					  attrs, isnull);
+
+	for (natt = 0; natt < desc->natts; natt++)
+	{
+		Form_pg_attribute attr = desc->attrs[natt];
+		ReorderBufferToastEnt *ent;
+		struct varlena *varlena;
+
+		/* va_rawsize is the size of the original datum -- including header */
+		struct varatt_external toast_pointer;
+		struct varatt_indirect redirect_pointer;
+		struct varlena *new_datum = NULL;
+		struct varlena *reconstructed;
+		dlist_iter	it;
+		Size		data_done = 0;
+
+		/* system columns aren't toasted */
+		if (attr->attnum < 0)
+			continue;
+
+		if (attr->attisdropped)
+			continue;
+
+		/* not a varlena datatype */
+		if (attr->attlen != -1)
+			continue;
+
+		/* no data */
+		if (isnull[natt])
+			continue;
+
+		/* ok, we know we have a toast datum */
+		varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
+
+		/* no need to do anything if the tuple isn't external */
+		if (!VARATT_IS_EXTERNAL(varlena))
+			continue;
+
+		VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
+
+		/*
+		 * check whether the toast tuple changed, replace if so.
+		 */
+		ent = (ReorderBufferToastEnt *)
+			hash_search(txn->toast_hash,
+						(void *) &toast_pointer.va_valueid,
+						HASH_FIND,
+						NULL);
+		if (ent == NULL)
+			continue;
+
+		new_datum =
+			(struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
+
+		free[natt] = true;
+
+		reconstructed = palloc0(toast_pointer.va_rawsize);
+
+		ent->reconstructed = reconstructed;
+
+		/* stitch toast tuple back together from its parts */
+		dlist_foreach(it, &ent->chunks)
+		{
+			bool		isnull;
+			ReorderBufferTupleBuf *tup =
+			dlist_container(ReorderBufferChange, node, it.cur)->newtuple;
+			Pointer		chunk =
+			DatumGetPointer(fastgetattr(&tup->tuple, 3, toast_desc, &isnull));
+
+			Assert(!isnull);
+			Assert(!VARATT_IS_EXTERNAL(chunk));
+			Assert(!VARATT_IS_SHORT(chunk));
+
+			memcpy(VARDATA(reconstructed) + data_done,
+				   VARDATA(chunk),
+				   VARSIZE(chunk) - VARHDRSZ);
+			data_done += VARSIZE(chunk) - VARHDRSZ;
+		}
+		Assert(data_done == toast_pointer.va_extsize);
+
+		/* make sure it's marked as compressed or not */
+		if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
+			SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
+		else
+			SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
+
+		memset(&redirect_pointer, 0, sizeof(redirect_pointer));
+		redirect_pointer.pointer = reconstructed;
+
+		SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
+		memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
+			   sizeof(redirect_pointer));
+
+		attrs[natt] = PointerGetDatum(new_datum);
+	}
+
+	/*
+	 * Build tuple in separate memory & copy tuple back into the tuplebuf
+	 * passed to the output plugin. We can't directly heap_fill_tuple() into
+	 * the tuplebuf because attrs[] will point back into the current content.
+	 */
+	newtup = heap_form_tuple(desc, attrs, isnull);
+	Assert(change->newtuple->tuple.t_len <= MaxHeapTupleSize);
+	Assert(&change->newtuple->header == change->newtuple->tuple.t_data);
+
+	memcpy(change->newtuple->tuple.t_data,
+		   newtup->t_data,
+		   newtup->t_len);
+	change->newtuple->tuple.t_len = newtup->t_len;
+
+	/*
+	 * free resources we no longer need; longer-lived data will be freed in
+	 * ReorderBufferToastReset().
+	 */
+	RelationClose(toast_rel);
+	pfree(newtup);
+	for (natt = 0; natt < desc->natts; natt++)
+	{
+		if (free[natt])
+			pfree(DatumGetPointer(attrs[natt]));
+	}
+	pfree(attrs);
+	pfree(free);
+	pfree(isnull);
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Free all resources allocated for toast reconstruction.
+ */
+static void
+ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	HASH_SEQ_STATUS hstat;
+	ReorderBufferToastEnt *ent;
+
+	if (txn->toast_hash == NULL)
+		return;
+
+	/* sequentially walk over the hash and free everything */
+	hash_seq_init(&hstat, txn->toast_hash);
+	while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		dlist_mutable_iter it;
+
+		if (ent->reconstructed != NULL)
+			pfree(ent->reconstructed);
+
+		dlist_foreach_modify(it, &ent->chunks)
+		{
+			ReorderBufferChange *change =
+			dlist_container(ReorderBufferChange, node, it.cur);
+
+			dlist_delete(&change->node);
+			ReorderBufferReturnChange(rb, change);
+		}
+	}
+
+	hash_destroy(txn->toast_hash);
+	txn->toast_hash = NULL;
+}
+
+
+/*
+ * Visibility support routines
+ */
+
+/*-------------------------------------------------------------------------
+ * Lookup actual cmin/cmax values during timetravel access. We can't always
+ * rely on stored cmin/cmax values because of two scenarios:
+ *
+ * * A tuple got changed multiple times during a single transaction and thus
+ *	 has been assigned a combocid. Combocids are only valid for the duration
+ *	 of a single transaction.
+ * * A tuple with a cmin but no cmax (and thus no combocid) got deleted/updated
+ *	 in a transaction other than the one that created it, which is the one we
+ *	 are currently decoding. As only one of cmin, cmax or combocid is actually
+ *	 stored in the heap we no longer have access to the value we need.
+ *
+ * To resolve those problems we have a per-transaction hash of (cmin, cmax)
+ * tuples keyed by (relfilenode, ctid) which contains the actual (cmin, cmax)
+ * values. That also takes care of combocids by simply not caring about them at
+ * all. As we have the real cmin/cmax values, that's enough.
+ *
+ * As we only care about catalog tuples here the overhead of this hashtable
+ * should be acceptable.
+ * -------------------------------------------------------------------------
+ */
+bool
+ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
+							  HeapTuple htup, Buffer buffer,
+							  CommandId *cmin, CommandId *cmax)
+{
+	ReorderBufferTupleCidKey key;
+	ReorderBufferTupleCidEnt *ent;
+	ForkNumber	forkno;
+	BlockNumber blockno;
+
+	/* be careful about padding */
+	memset(&key, 0, sizeof(key));
+
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * get relfilenode from the buffer, no convenient way to access it other
+	 * than that.
+	 */
+	BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
+
+	/* tuples can only be in the main fork */
+	Assert(forkno == MAIN_FORKNUM);
+	Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
+
+	ItemPointerCopy(&htup->t_self,
+					&key.tid);
+
+	ent = (ReorderBufferTupleCidEnt *)
+		hash_search(tuplecid_data,
+					(void *) &key,
+					HASH_FIND,
+					NULL);
+
+	if (ent == NULL)
+		return false;
+
+	if (cmin)
+		*cmin = ent->cmin;
+	if (cmax)
+		*cmax = ent->cmax;
+	return true;
+}
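+
+/*
+ * Illustrative sketch (not part of this patch): a timetravelling visibility
+ * routine is expected to consume the resolved values roughly like this,
+ * where "snapshot" is one of the decoding snapshots built by snapbuild.c:
+ *
+ *	CommandId	cmin,
+ *				cmax;
+ *
+ *	if (!ResolveCminCmaxDuringDecoding(tuplecid_data, htup, buffer,
+ *									   &cmin, &cmax))
+ *		return false;		// no entry logged for this catalog tuple
+ *	if (cmin >= snapshot->curcid)
+ *		return false;		// inserted after the "scan" started
+ *	// ... cmax is checked analogously for deleted/updated tuples ...
+ */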
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
new file mode 100644
index 0000000..6547e3f
--- /dev/null
+++ b/src/backend/replication/logical/snapbuild.c
@@ -0,0 +1,1581 @@
+/*-------------------------------------------------------------------------
+ *
+ * snapbuild.c
+ *
+ *	  Support for building timetravel snapshots based on the contents of the
+ *	  WAL which then can be used to decode the contents of the WAL.
+ *
+ * NOTES:
+ *
+ * We build snapshots which can *only* be used to read catalog contents by
+ * reading and interpreting the WAL stream. The aim is to build a snapshot that
+ * behaves the same as a freshly taken MVCC snapshot would have at the time the
+ * XLogRecord was generated.
+ *
+ * To build the snapshots we reuse the infrastructure built for hot
+ * standby. The snapshots we build look different than HS' because we have
+ * different needs. To successfully decode data from the WAL we only need to
+ * access catalogs/(sys|rel|cat)cache, not the actual user tables since the
+ * data we decode is contained in the WAL records. Also, our snapshots need to
+ * be different in comparison to normal MVCC ones because in contrast to those
+ * we cannot fully rely on the clog and pg_subtrans for information about
+ * committed transactions because they might commit in the future from the POV
+ * of the wal entry we're currently decoding.
+ *
+ * As the percentage of transactions modifying the catalog normally is fairly
+ * small in comparison to ones only manipulating user data, we keep track of
+ * the committed catalog modifying ones inside (xmin, xmax) instead of keeping
+ * track of all running transactions like it's done in a normal snapshot. Note
+ * that we're generally only looking at transactions that have acquired an
+ * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
+ * that we consider committed, everything else is considered aborted/in
+ * progress. That also allows us not to care about subtransactions before they
+ * have committed, which means this module, in contrast to HS, doesn't have to
+ * care about suboverflowed subtransactions and similar.
+ *
+ * One complexity of doing this is that to e.g. handle mixed DDL/DML
+ * transactions we need Snapshots that see intermediate versions of the catalog
+ * in a transaction. During normal operation this is achieved by using
+ * CommandIds/cmin/cmax. The problem with that however is that for space
+ * efficiency reasons only one value of that is stored (c.f. combocid.c). Since
+ * Combocids are only available in memory we log additional information which
+ * allows us to get the original (cmin, cmax) pair during visibility
+ * checks. Check the reorderbuffer.c's comment above
+ * ResolveCminCmaxDuringDecoding() for details.
+ *
+ * To facilitate all this we need our own visibility routine, as the normal
+ * ones are optimized for different use cases. To make sure no unexpected
+ * database access bypassing our special snapshot is possible - which would
+ * possibly load invalid data into caches - we temporarily overload the
+ * .satisfies methods of the usual snapshots while doing timetravel.
+ *
+ * To replace the normal catalog snapshots with timetravel ones use the
+ * SetupDecodingSnapshots and RevertFromDecodingSnapshots functions.
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/snapbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xact.h"
+
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "utils/builtins.h"
+#include "utils/catcache.h" /* FIXME: Use */
+#include "utils/memutils.h"
+#include "utils/snapshot.h"
+#include "utils/snapmgr.h"
+#include "utils/tqual.h"
+
+#include "storage/block.h"		/* debugging output */
+#include "storage/fd.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/standby.h"
+
+typedef struct SnapBuild
+{
+	/* how far are we along building our first full snapshot */
+	SnapBuildState state;
+
+	/* private memory context used to allocate memory for this module. */
+	MemoryContext context;
+
+	/* all transactions < this have committed/aborted */
+	TransactionId xmin;
+
+	/* all transactions >= this are uncommitted */
+	TransactionId xmax;
+
+	/*
+	 * Don't replay commits from an LSN <= this LSN. This can be set
+	 * externally but it will also be advanced (never retreat) from within
+	 * snapbuild.c.
+	 */
+	XLogRecPtr	transactions_after;
+
+	/*
+	 * Don't start decoding WAL until the "xl_running_xacts" information
+	 * indicates there are no running xids with a xid smaller than this.
+	 */
+	TransactionId initial_xmin_horizon;
+
+	/*
+	 * Snapshot that's valid to see all currently committed transactions that
+	 * made catalog modifications.
+	 */
+	Snapshot	snapshot;
+
+	/*
+	 * LSN of the last location we are sure a snapshot has been serialized to.
+	 */
+	XLogRecPtr	last_serialized_snapshot;
+
+	ReorderBuffer *reorder;
+
+	/*
+	 * Information about initially running transactions
+	 *
+	 * When we start building a snapshot there already may be transactions in
+	 * progress.  Those are stored in running.xip.	We don't have enough
+	 * information about those to decode their contents, so until they are
+	 * finished (xcnt=0) we cannot switch to a CONSISTENT state.
+	 */
+	struct
+	{
+		/*
+		 * As long as running.xcnt > 0, all XIDs > running.xmin and
+		 * < running.xmax have to be checked whether they are still running.
+		 */
+		TransactionId xmin;
+		TransactionId xmax;
+
+		size_t		xcnt;		/* number of used xip entries */
+		size_t		xcnt_space; /* allocated size of xip */
+		TransactionId *xip;		/* running xacts array, xidComparator-sorted */
+	}			running;
+
+	/*
+	 * Array of transactions which could have catalog changes that committed
+	 * between xmin and xmax
+	 */
+	struct
+	{
+		/* number of committed transactions */
+		size_t		xcnt;
+
+		/* available space for committed transactions */
+		size_t		xcnt_space;
+
+		/*
+		 * Until we reach a CONSISTENT state, we record commits of all
+		 * transactions, not just the catalog changing ones. Record when that
+		 * changes so we know we cannot export a snapshot safely anymore.
+		 */
+		bool		includes_all_transactions;
+
+		/*
+		 * Array of committed transactions that have modified the catalog.
+		 *
+		 * As this array is frequently modified we do *not* keep it in
+		 * xidComparator order. Instead we sort the array when building &
+		 * distributing a snapshot.
+		 *
+		 * XXX: That doesn't seem to be good reasoning anymore. Every time we
+		 * add something here after becoming consistent we also have to
+		 * distribute a snapshot. Storing them sorted would potentially make
+		 * it easier to purge as well (but more complicated wrt wraparound?).
+		 */
+		TransactionId *xip;
+	}			committed;
+} SnapBuild;
+
+/*
+ * Starting a transaction -- which we need to do while exporting a snapshot --
+ * removes knowledge about the previously used resowner, so we save it here.
+ */
+ResourceOwner SavedResourceOwnerDuringExport = NULL;
+
+/* transaction state manipulation functions */
+static void SnapBuildEndTxn(SnapBuild *builder, TransactionId xid);
+
+/* ->running manipulation */
+static bool SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid);
+
+/* ->committed manipulation */
+static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);
+
+/* snapshot building/manipulation/distribution functions */
+static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid);
+
+static void SnapBuildFreeSnapshot(Snapshot snap);
+
+static void SnapBuildSnapIncRefcount(Snapshot snap);
+
+static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
+
+/* xlog reading helper functions for SnapBuildProcessRecord */
+static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
+
+/* serialization functions */
+static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
+static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
+
+
+/*
+ * Allocate a new snapshot builder.
+ */
+SnapBuild *
+AllocateSnapshotBuilder(ReorderBuffer *reorder,
+						TransactionId xmin_horizon,
+						XLogRecPtr start_lsn)
+{
+	MemoryContext context;
+	MemoryContext oldcontext;
+	SnapBuild  *builder;
+
+	context = AllocSetContextCreate(TopMemoryContext,
+									"snapshot builder context",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+	oldcontext = MemoryContextSwitchTo(context);
+
+	builder = palloc0(sizeof(SnapBuild));
+
+	builder->state = SNAPBUILD_START;
+	builder->context = context;
+	builder->reorder = reorder;
+	/* Other struct members initialized by zeroing, above */
+
+	/* builder->running is initialized by zeroing, above */
+
+	builder->committed.xcnt = 0;
+	builder->committed.xcnt_space = 128;		/* arbitrary number */
+	builder->committed.xip =
+		palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
+	builder->committed.includes_all_transactions = true;
+	builder->initial_xmin_horizon = xmin_horizon;
+	builder->transactions_after = start_lsn;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return builder;
+}
+
+/*
+ * Free a snapshot builder.
+ */
+void
+FreeSnapshotBuilder(SnapBuild *builder)
+{
+	MemoryContext context = builder->context;
+
+	if (builder->snapshot)
+		SnapBuildFreeSnapshot(builder->snapshot);
+
+	if (builder->running.xip)
+		pfree(builder->running.xip);
+
+	if (builder->committed.xip)
+		pfree(builder->committed.xip);
+
+	pfree(builder);
+
+	MemoryContextDelete(context);
+}
+
+/*
+ * Free an unreferenced snapshot that has previously been built by us.
+ */
+static void
+SnapBuildFreeSnapshot(Snapshot snap)
+{
+	/* make sure we don't get passed an external snapshot */
+	Assert(snap->satisfies == HeapTupleSatisfiesMVCCDuringDecoding);
+
+	/* make sure nobody modified our snapshot */
+	Assert(snap->curcid == FirstCommandId);
+	Assert(!snap->suboverflowed);
+	Assert(!snap->takenDuringRecovery);
+	Assert(!snap->regd_count);
+
+	/* slightly more likely, so it's checked even without c-asserts */
+	if (snap->copied)
+		elog(ERROR, "can't free a copied snapshot");
+
+	if (snap->active_count)
+		elog(ERROR, "can't free an active snapshot");
+
+	pfree(snap);
+}
+
+/*
+ * In which state of snapshot building are we?
+ */
+SnapBuildState
+SnapBuildCurrentState(SnapBuild *builder)
+{
+	return builder->state;
+}
+
+/*
+ * Should the contents of a transaction ending at 'ptr' be skipped, i.e. not decoded?
+ */
+bool
+SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
+{
+	return ptr <= builder->transactions_after;
+}
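+
+/*
+ * Illustrative caller sketch (an assumption about how the decoding side is
+ * expected to use this, not code from this file): when a commit record is
+ * about to be replayed, the decoder can bail out early, e.g.
+ *
+ *	if (SnapBuildXactNeedsSkip(builder, commit_lsn))
+ *		return;		// commit is at or before transactions_after, skip it
+ *
+ * where "commit_lsn" is a hypothetical name for the commit record's LSN.
+ */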
+
+/*
+ * Increase refcount of a snapshot.
+ *
+ * This is used when handing out a snapshot to some external resource or when
+ * adding a Snapshot as builder->snapshot.
+ */
+static void
+SnapBuildSnapIncRefcount(Snapshot snap)
+{
+	snap->active_count++;
+}
+
+/*
+ * Decrease refcount of a snapshot and free if the refcount reaches zero.
+ *
+ * Externally visible so external resources that have been handed an IncRef'ed
+ * Snapshot can free it easily.
+ */
+void
+SnapBuildSnapDecRefcount(Snapshot snap)
+{
+	/* make sure we don't get passed an external snapshot */
+	Assert(snap->satisfies == HeapTupleSatisfiesMVCCDuringDecoding);
+
+	/* make sure nobody modified our snapshot */
+	Assert(snap->curcid == FirstCommandId);
+	Assert(!snap->suboverflowed);
+	Assert(!snap->takenDuringRecovery);
+	Assert(!snap->regd_count);
+
+	Assert(snap->active_count);
+
+	/* slightly more likely, so it's checked even without c-asserts */
+	if (snap->copied)
+		elog(ERROR, "can't free a copied snapshot");
+
+	snap->active_count--;
+	if (!snap->active_count)
+		SnapBuildFreeSnapshot(snap);
+}
+
+/*
+ * Build a new snapshot, based on currently committed catalog-modifying
+ * transactions.
+ *
+ * In-progress transactions with catalog access are *not* allowed to modify
+ * these snapshots; they have to copy them and fill in appropriate ->curcid and
+ * ->subxip/subxcnt values.
+ */
+static Snapshot
+SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid)
+{
+	Snapshot	snapshot;
+	Size		ssize;
+
+	Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
+
+	ssize = sizeof(SnapshotData)
+		+ sizeof(TransactionId) * builder->committed.xcnt
+		+ sizeof(TransactionId) * 1 /* toplevel xid */ ;
+
+	snapshot = MemoryContextAllocZero(builder->context, ssize);
+
+	snapshot->satisfies = HeapTupleSatisfiesMVCCDuringDecoding;
+
+	/*
+	 * We misuse the original meaning of SnapshotData's xip and subxip fields
+	 * to make them more fitting for our needs.
+	 *
+	 * In the 'xip' array we store transactions that have to be treated as
+	 * committed. Since we will only ever look at tuples from transactions
+	 * that have modified the catalog, it's more efficient to store those few
+	 * that exist between xmin and xmax (frequently there are none).
+	 *
+	 * Snapshots that are used in transactions that have modified the catalog
+	 * also use the 'subxip' array to store their toplevel xid and all the
+	 * subtransaction xids so we can recognize when we need to treat rows as
+	 * visible that are not in xip but still need to be visible. Subxip only
+	 * gets filled when the transaction is copied into the context of a
+	 * catalog modifying transaction since we otherwise share a snapshot
+	 * between transactions. As long as a txn hasn't modified the catalog it
+	 * doesn't need to treat any uncommitted rows as visible, so there is no
+	 * need for those xids.
+	 *
+	 * Both arrays are qsort'ed so that we can use bsearch() on them.
+	 *
+	 * XXX: Do we want extra fields instead of misusing existing ones?
+	 */
+	Assert(TransactionIdIsNormal(builder->xmin));
+	Assert(TransactionIdIsNormal(builder->xmax));
+
+	snapshot->xmin = builder->xmin;
+	snapshot->xmax = builder->xmax;
+
+	/* store all transactions to be treated as committed by this snapshot */
+	snapshot->xip =
+		(TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
+	snapshot->xcnt = builder->committed.xcnt;
+	memcpy(snapshot->xip,
+		   builder->committed.xip,
+		   builder->committed.xcnt * sizeof(TransactionId));
+
+	/* sort so we can bsearch() */
+	qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
+
+	/*
+	 * Initially, subxip is empty, i.e. it's a snapshot to be used by
+	 * transactions that don't modify the catalog. Will be filled by
+	 * ReorderBufferCopySnap() if necessary.
+	 */
+	snapshot->subxcnt = 0;
+	snapshot->subxip = NULL;
+
+	snapshot->suboverflowed = false;
+	snapshot->takenDuringRecovery = false;
+	snapshot->copied = false;
+	snapshot->curcid = FirstCommandId;
+	snapshot->active_count = 0;
+	snapshot->regd_count = 0;
+
+	return snapshot;
+}
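+
+/*
+ * Illustrative sketch (an assumption, not code from this patch) of how the
+ * "inverted" ->xip array above is meant to be consulted when checking whether
+ * a catalog tuple's xmin committed from the snapshot's point of view:
+ *
+ *	if (TransactionIdPrecedes(xmin, snapshot->xmin))
+ *		committed = true;	// old enough, the clog answer can be trusted
+ *	else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax))
+ *		committed = false;	// commits "in the future" are not visible yet
+ *	else
+ *		committed = bsearch(&xmin, snapshot->xip, snapshot->xcnt,
+ *							sizeof(TransactionId), xidComparator) != NULL;
+ */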
+
+/*
+ * Export a snapshot so it can be set in another session with SET TRANSACTION
+ * SNAPSHOT.
+ *
+ * For that we need to start a transaction in the current backend as the
+ * importing side checks whether the source transaction is still open to make
+ * sure the xmin horizon hasn't advanced since then.
+ *
+ * After that we convert a locally built snapshot into the normal variant
+ * understood by HeapTupleSatisfiesMVCC et al.
+ */
+const char *
+SnapBuildExportSnapshot(SnapBuild *builder)
+{
+	Snapshot	snap;
+	char	   *snapname;
+	TransactionId xid;
+	TransactionId *newxip;
+	int			newxcnt = 0;
+
+	elog(LOG, "building snapshot");
+
+	if (builder->state != SNAPBUILD_CONSISTENT)
+		elog(ERROR, "cannot export a snapshot before reaching a consistent state");
+
+	if (!builder->committed.includes_all_transactions)
+		elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore");
+
+	/* so we don't overwrite the existing value */
+	if (TransactionIdIsValid(MyPgXact->xmin))
+		elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid");
+
+	if (IsTransactionOrTransactionBlock())
+		elog(ERROR, "cannot export a snapshot from within a transaction");
+
+	if (SavedResourceOwnerDuringExport)
+		elog(ERROR, "can only export one snapshot at a time");
+
+	SavedResourceOwnerDuringExport = CurrentResourceOwner;
+
+	StartTransactionCommand();
+
+	Assert(!FirstSnapshotSet);
+
+	/* There doesn't seem to be a nice API to set these */
+	XactIsoLevel = XACT_REPEATABLE_READ;
+	XactReadOnly = true;
+
+	snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId());
+
+	/*
+	 * We know that snap->xmin is alive, enforced by the logical xmin
+	 * mechanism. Due to that we can do this without locks, we're only
+	 * changing our own value.
+	 */
+	MyPgXact->xmin = snap->xmin;
+
+	/* allocate in transaction context */
+	newxip = (TransactionId *)
+		palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
+
+	/*
+	 * snapbuild.c builds snapshots in an "inverted" manner, which means it
+	 * stores committed transactions in ->xip, not ones in progress. Build a
+	 * classical snapshot by marking all non-committed transactions as
+	 * in-progress. This can be expensive.
+	 */
+	for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
+	{
+		void	   *test;
+
+		/*
+		 * check whether transaction committed using the timetravel meaning of
+		 * ->xip
+		 */
+		test = bsearch(&xid, snap->xip, snap->xcnt,
+					   sizeof(TransactionId), xidComparator);
+
+		elog(DEBUG2, "checking xid %u.. %d (xmin %u, xmax %u)",
+			 xid, test == NULL, snap->xmin, snap->xmax);
+
+		if (test == NULL)
+		{
+			if (newxcnt >= GetMaxSnapshotXidCount())
+				elog(ERROR, "snapshot too large");
+
+			newxip[newxcnt++] = xid;
+
+			elog(DEBUG2, "treat %u as in-progress", xid);
+		}
+
+		TransactionIdAdvance(xid);
+	}
+
+	snap->xcnt = newxcnt;
+	snap->xip = newxip;
+
+	/*
+	 * now that we've built a plain snapshot, use the normal mechanisms for
+	 * exporting it
+	 */
+	snapname = ExportSnapshot(snap);
+
+	elog(LOG, "exported snapbuild snapshot: %s xcnt %u", snapname, snap->xcnt);
+	return snapname;
+}
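+
+/*
+ * Illustrative use from the importing session (not part of this file): the
+ * returned snapshot name is meant to be consumed via
+ *
+ *	BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+ *	SET TRANSACTION SNAPSHOT 'name-returned-above';
+ *
+ * so that e.g. an initial data copy sees exactly the state from which the
+ * change stream will continue.
+ */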
+
+/*
+ * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
+ * any. Aborts the previously started transaction and resets the resource owner
+ * back to its original value.
+ */
+void
+SnapBuildClearExportedSnapshot(void)
+{
+	/* nothing exported, that's the usual case */
+	if (SavedResourceOwnerDuringExport == NULL)
+		return;
+
+	Assert(IsTransactionState());
+
+	/* make sure nothing could have ever happened */
+	AbortCurrentTransaction();
+
+	CurrentResourceOwner = SavedResourceOwnerDuringExport;
+	SavedResourceOwnerDuringExport = NULL;
+}
+
+/*
+ * Handle the effects of a single heap change, appropriate to the current state
+ * of the snapshot builder, and return whether changes made at (xid, lsn) may
+ * be decoded.
+ */
+bool
+SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
+{
+	bool is_old_tx;
+
+	/*
+	 * We can't handle data in transactions if we haven't built a snapshot
+	 * yet, so don't store them.
+	 */
+	if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
+		return false;
+
+	/*
+	 * No point in keeping track of changes in transactions that we don't have
+	 * enough information about to decode.
+	 */
+	if (builder->state < SNAPBUILD_CONSISTENT &&
+		SnapBuildTxnIsRunning(builder, xid))
+		return false;
+
+	is_old_tx = ReorderBufferIsXidKnown(builder->reorder, xid);
+
+	if (!is_old_tx || !ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
+	{
+		/* only build a new snapshot if we don't have a prebuilt one */
+		if (builder->snapshot == NULL)
+		{
+			builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
+			/* increase refcount for the snapshot builder */
+			SnapBuildSnapIncRefcount(builder->snapshot);
+		}
+
+		/* increase refcount for the transaction */
+		SnapBuildSnapIncRefcount(builder->snapshot);
+		ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
+									 builder->snapshot);
+	}
+
+	return true;
+}
+
+/*
+ * Do CommandId/ComboCid handling after reading an xl_heap_new_cid record. This
+ * implies that a transaction has done some form of write to system catalogs.
+ */
+void
+SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
+					   XLogRecPtr lsn, xl_heap_new_cid *xlrec)
+{
+	CommandId	cid;
+
+	/*
+	 * We only log new_cid records if a catalog tuple was modified, so mark
+	 * the transaction as timetravelling.
+	 */
+	ReorderBufferXidSetTimetravel(builder->reorder, xid, lsn);
+
+	ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
+								 xlrec->target.node, xlrec->target.tid,
+								 xlrec->cmin, xlrec->cmax,
+								 xlrec->combocid);
+
+	/* figure out new command id */
+	if (xlrec->cmin != InvalidCommandId &&
+		xlrec->cmax != InvalidCommandId)
+		cid = Max(xlrec->cmin, xlrec->cmax);
+	else if (xlrec->cmax != InvalidCommandId)
+		cid = xlrec->cmax;
+	else if (xlrec->cmin != InvalidCommandId)
+		cid = xlrec->cmin;
+	else
+	{
+		cid = InvalidCommandId;		/* silence compiler */
+		elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
+	}
+
+	/*
+	 * FIXME: potential race condition here: if multiple snapshots were running
+	 * & generating changes in the same transaction on the source side this
+	 * could be problematic. But this cannot happen for system catalogs, right?
+	 */
+	ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
+}
+
+/*
+ * Check whether `xid` is currently 'running'. Running transactions in our
+ * parlance are transactions which we didn't observe from the start so we can't
+ * properly decode them. They only exist after we freshly started from an
+ * < CONSISTENT snapshot.
+ */
+static bool
+SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid)
+{
+	Assert(builder->state < SNAPBUILD_CONSISTENT);
+	Assert(TransactionIdIsValid(builder->running.xmin));
+	Assert(TransactionIdIsValid(builder->running.xmax));
+
+	if (builder->running.xcnt &&
+		NormalTransactionIdFollows(xid, builder->running.xmin) &&
+		NormalTransactionIdPrecedes(xid, builder->running.xmax))
+	{
+		TransactionId *search =
+		bsearch(&xid, builder->running.xip, builder->running.xcnt_space,
+				sizeof(TransactionId), xidComparator);
+
+		if (search != NULL)
+		{
+			Assert(*search == xid);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Add a new Snapshot to all transactions we're decoding that currently are
+ * in-progress so they can see new catalog contents made by the transaction
+ * that just committed. This is necessary because those in-progress
+ * transactions will use the new catalog's contents from here on (at the very
+ * least everything they do needs to be compatible with newer catalog contents).
+ */
+static void
+SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
+{
+	dlist_iter	txn_i;
+	ReorderBufferTXN *txn;
+
+	/*
+	 * Iterate through all toplevel transactions. This can include
+	 * subtransactions which we just don't yet know to be that, but that's
+	 * fine, they will just get an unnecessary snapshot queued.
+	 */
+	dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
+	{
+		txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
+
+		Assert(TransactionIdIsValid(txn->xid));
+
+		/*
+		 * If we don't have a base snapshot yet, there are no changes in this
+		 * transaction which in turn implies we don't yet need a snapshot at
+		 * all. We'll add a snapshot when the first change gets queued.
+		 *
+		 * XXX: is that fine if only a subtransaction has a base snapshot so
+		 * far?
+		 */
+		if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
+			continue;
+
+		elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
+			 txn->xid, (uint32) (lsn >> 32), (uint32) lsn);
+
+		/* increase refcount for the transaction */
+		SnapBuildSnapIncRefcount(builder->snapshot);
+		ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
+								 builder->snapshot);
+	}
+}
+
+/*
+ * Keep track of a new catalog changing transaction that has committed.
+ */
+static void
+SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
+{
+	Assert(TransactionIdIsValid(xid));
+
+	if (builder->committed.xcnt == builder->committed.xcnt_space)
+	{
+		builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+
+		/* XXX: put in a limit here as a defense against bugs? */
+
+		elog(DEBUG1, "increasing space for committed transactions to %zu",
+			 builder->committed.xcnt_space);
+
+		builder->committed.xip = repalloc(builder->committed.xip,
+					builder->committed.xcnt_space * sizeof(TransactionId));
+	}
+
+	/*
+	 * XXX: It might make sense to keep the array sorted here instead of doing
+	 * it every time we build a new snapshot. On the other hand this gets called
+	 * repeatedly when a transaction with subtransactions commits.
+	 */
+	builder->committed.xip[builder->committed.xcnt++] = xid;
+}
+
+/*
+ * Remove knowledge about transactions we treat as committed that are smaller
+ * than ->xmin. Those won't ever get checked via the ->committed array but via
+ * the clog machinery, so we don't need to waste memory on them.
+ */
+static void
+SnapBuildPurgeCommittedTxn(SnapBuild *builder)
+{
+	int			off;
+	TransactionId *workspace;
+	int			surviving_xids = 0;
+
+	/* not ready yet */
+	if (!TransactionIdIsNormal(builder->xmin))
+		return;
+
+	/* XXX: Neater algorithm? */
+	workspace =
+		MemoryContextAlloc(builder->context,
+						   builder->committed.xcnt * sizeof(TransactionId));
+
+	/* copy xids that still are interesting to workspace */
+	for (off = 0; off < builder->committed.xcnt; off++)
+	{
+		if (NormalTransactionIdPrecedes(builder->committed.xip[off],
+										builder->xmin))
+			;					/* remove */
+		else
+			workspace[surviving_xids++] = builder->committed.xip[off];
+	}
+
+	/* copy workspace back to persistent state */
+	memcpy(builder->committed.xip, workspace,
+		   surviving_xids * sizeof(TransactionId));
+
+	elog(DEBUG1, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
+		 (uint32) builder->committed.xcnt, (uint32) surviving_xids,
+		 builder->xmin, builder->xmax);
+	builder->committed.xcnt = surviving_xids;
+
+	pfree(workspace);
+}
+
+/*
+ * Common logic for SnapBuildAbortTxn and SnapBuildCommitTxn dealing with
+ * keeping track of the number of running transactions.
+ */
+static void
+SnapBuildEndTxn(SnapBuild *builder, TransactionId xid)
+{
+	if (builder->state == SNAPBUILD_CONSISTENT)
+		return;
+
+	if (SnapBuildTxnIsRunning(builder, xid))
+	{
+		Assert(builder->running.xcnt > 0);
+
+		if (!--builder->running.xcnt)
+		{
+			/*
+			 * None of the originally running transactions is running anymore,
+			 * so our incrementally built snapshot is now complete.
+			 */
+			elog(LOG, "found consistent point due to SnapBuildEndTxn + running: %u", xid);
+			builder->state = SNAPBUILD_CONSISTENT;
+		}
+	}
+}
+
+/*
+ * Abort a transaction, throw away all state we kept
+ */
+void
+SnapBuildAbortTxn(SnapBuild *builder, TransactionId xid,
+				  int nsubxacts, TransactionId *subxacts)
+{
+	int			i;
+
+	for (i = 0; i < nsubxacts; i++)
+	{
+		TransactionId subxid = subxacts[i];
+
+		SnapBuildEndTxn(builder, subxid);
+	}
+
+	SnapBuildEndTxn(builder, xid);
+}
+
+/*
+ * Handle everything that needs to be done when a transaction commits
+ */
+void
+SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
+				   int nsubxacts, TransactionId *subxacts)
+{
+	int			nxact;
+
+	bool		forced_timetravel = false;
+	bool		sub_does_timetravel = false;
+	bool		top_does_timetravel = false;
+
+	TransactionId xmax = xid;
+
+	/*
+	 * If we couldn't observe every change of a transaction because it was
+	 * already running at the point we started to observe, we have to assume it
+	 * made catalog changes.
+	 *
+	 * This has the positive benefit that we afterwards have enough
+	 * information to build an exportable snapshot that's usable by pg_dump et
+	 * al.
+	 */
+	if (builder->state < SNAPBUILD_CONSISTENT)
+	{
+		/* ensure that only commits after this are getting replayed */
+		if (builder->transactions_after < lsn)
+			builder->transactions_after = lsn;
+
+		/*
+		 * we could avoid treating !SnapBuildTxnIsRunning transactions as
+		 * timetravel ones, but we want to be able to export a snapshot when
+		 * we have reached consistency.
+		 */
+		forced_timetravel = true;
+		elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running too early", xid);
+	}
+
+	for (nxact = 0; nxact < nsubxacts; nxact++)
+	{
+		TransactionId subxid = subxacts[nxact];
+
+		/*
+		 * make sure txn is not tracked in running txn's anymore, switch state
+		 */
+		SnapBuildEndTxn(builder, subxid);
+
+		/*
+		 * If we're forcing timetravel we also need accurate subtransaction
+		 * status.
+		 */
+		if (forced_timetravel)
+		{
+			SnapBuildAddCommittedTxn(builder, subxid);
+			if (NormalTransactionIdFollows(subxid, xmax))
+				xmax = subxid;
+		}
+
+		/*
+		 * Add the subtransaction to the snapshot's set of committed
+		 * transactions; we don't distinguish it from toplevel transactions
+		 * there.
+		 */
+		else if (ReorderBufferXidDoesTimetravel(builder->reorder, subxid))
+		{
+			sub_does_timetravel = true;
+
+			elog(DEBUG1, "found subtransaction %u:%u with catalog changes.",
+				 xid, subxid);
+
+			SnapBuildAddCommittedTxn(builder, subxid);
+
+			if (NormalTransactionIdFollows(subxid, xmax))
+				xmax = subxid;
+		}
+	}
+
+	/*
+	 * make sure txn is not tracked in running txn's anymore, switch state
+	 */
+	SnapBuildEndTxn(builder, xid);
+
+	if (forced_timetravel)
+	{
+		elog(DEBUG1, "forced transaction %u to do timetravel.", xid);
+
+		SnapBuildAddCommittedTxn(builder, xid);
+	}
+	/* add toplevel transaction to base snapshot */
+	else if (ReorderBufferXidDoesTimetravel(builder->reorder, xid))
+	{
+		elog(DEBUG1, "found top level transaction %u, with catalog changes!",
+			 xid);
+
+		top_does_timetravel = true;
+		SnapBuildAddCommittedTxn(builder, xid);
+	}
+	else if (sub_does_timetravel)
+	{
+		/* mark toplevel txn as timetravel as well */
+		SnapBuildAddCommittedTxn(builder, xid);
+	}
+
+	if (forced_timetravel || top_does_timetravel || sub_does_timetravel)
+	{
+		if (!TransactionIdIsValid(builder->xmax) ||
+			TransactionIdFollowsOrEquals(xmax, builder->xmax))
+		{
+			builder->xmax = xmax;
+			TransactionIdAdvance(builder->xmax);
+		}
+
+		if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
+			return;
+
+		/* decrease the snapshot builder's refcount of the old snapshot */
+		if (builder->snapshot)
+			SnapBuildSnapDecRefcount(builder->snapshot);
+
+		builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
+
+		/* refcount of the snapshot builder for the new snapshot */
+		SnapBuildSnapIncRefcount(builder->snapshot);
+
+		/* add a new SnapshotNow to all currently running transactions */
+		SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
+	}
+	else
+	{
+		/* record that we cannot export a general snapshot anymore */
+		builder->committed.includes_all_transactions = false;
+	}
+}
+
+
+/* -----------------------------------
+ * Snapshot building functions dealing with xlog records
+ * -----------------------------------
+ */
+void
+SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+	ReorderBufferTXN *txn;
+
+	if (builder->state < SNAPBUILD_CONSISTENT)
+	{
+		/* returns false if there's no point in performing cleanup just yet */
+		if (!SnapBuildFindSnapshot(builder, lsn, running))
+			return;
+	}
+	else
+	{
+		SnapBuildSerialize(builder, lsn);
+	}
+
+	/*
+	 * Update the range of interesting xids. We don't increase ->xmax here
+	 * because once we are in a consistent state we can do that ourselves, and
+	 * much more efficiently, since we only need to do it for catalog-modifying
+	 * transactions.
+	 */
+	builder->xmin = running->oldestRunningXid;
+
+	/*
+	 * xmax can be lower than xmin here because we only increase xmax when we
+	 * hit a transaction with catalog changes. While odd looking, it's correct
+	 * and actually more efficient this way since we hit fast paths in tqual.c.
+	 */
+
+	/* Remove transactions we don't need to keep track of anymore */
+	SnapBuildPurgeCommittedTxn(builder);
+
+	elog(DEBUG1, "xmin: %u, xmax: %u, oldestrunning: %u",
+		 builder->xmin, builder->xmax,
+		 running->oldestRunningXid);
+
+	/*
+	 * Increase the xmin stored in shared memory, so vacuum can work on tuples
+	 * we prevented from being pruned till now.
+	 */
+	IncreaseLogicalXminForSlot(lsn, running->oldestRunningXid);
+
+	/*
+	 * Also tell the slot where we can restart decoding from. We don't want to
+	 * do that after every commit because changing that implies an fsync of the
+	 * logical slot's state file, so we only do it every time we see a running
+	 * xacts record.
+	 *
+	 * Do so by looking for the oldest in progress transaction (determined by
+	 * the first LSN of any of its relevant records). Every transaction
+	 * remembers the last location we stored the snapshot to disk before its
+	 * beginning. That point is where we can restart from.
+	 */
+
+	/*
+	 * Can't know about a serialized snapshot's location if we're not
+	 * consistent
+	 */
+	if (builder->state < SNAPBUILD_CONSISTENT)
+		return;
+
+	txn = ReorderBufferGetOldestTXN(builder->reorder);
+
+	/*
+	 * oldest ongoing txn might have started when we didn't yet serialize
+	 * anything because we hadn't reached a consistent state yet.
+	 */
+	if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
+		IncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
+
+	/*
+	 * No in-progress transaction, can reuse the last serialized snapshot if we
+	 * have one.
+	 */
+	else if (txn == NULL &&
+			 builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
+			 builder->last_serialized_snapshot != InvalidXLogRecPtr)
+		IncreaseRestartDecodingForSlot(lsn, builder->last_serialized_snapshot);
+}
+
+
+/*
+ * Build the start of a snapshot that's capable of decoding the catalog. Helper
+ * function for SnapBuildProcessRunningXacts() while we're not yet consistent.
+ *
+ * Returns true if there is a point in performing internal maintenance/cleanup
+ * using the xl_running_xacts record.
+ */
+static bool
+SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+	/* ---
+	 * Build catalog decoding snapshot incrementally using information about
+	 * the currently running transactions. There are several ways to do that:
+	 *
+	 * a) There were no running transactions when the xl_running_xacts record
+	 *    was inserted, jump to CONSISTENT immediately. We might find such a
+	 *    state while waiting for b) or c).
+	 *
+	 * b) Wait for all toplevel transactions that were running to end. We
+	 *    simply track the number of in-progress toplevel transactions and
+	 *    lower it whenever one commits or aborts. When that number
+	 *    (builder->running.xcnt) reaches zero, we can go from FULL_SNAPSHOT to
+	 *    CONSISTENT.
+	 *	  NB: We need to search running.xip when seeing a transaction's end to
+	 *    make sure it's a toplevel transaction and it's been one of the
+	 *    initially running ones.
+	 *	  Interestingly, in contrast to HS this allows us not to care about
+	 *	  subtransactions - and by extension suboverflowed xl_running_xacts -
+	 *	  at all.
+	 *
+	 * c) This (in a previous run) or another decoding slot serialized a
+	 *    snapshot to disk that we can use.
+	 * ---
+	 */
+
+	/*
+	 * xl_running_xacts record is older than what we can use, we might not have
+	 * all necessary catalog rows anymore.
+	 */
+	if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
+		NormalTransactionIdPrecedes(running->oldestRunningXid,
+									builder->initial_xmin_horizon))
+	{
+		elog(LOG, "skipping snapshot at %X/%X due to initial xmin horizon of %u vs the snapshot's %u",
+			 (uint32) (lsn >> 32), (uint32) lsn,
+			 builder->initial_xmin_horizon, running->oldestRunningXid);
+		return true;
+	}
+
+	/*
+	 * a) No transactions were running, we can jump to consistent.
+	 *
+	 * NB: We might have already started to incrementally assemble a snapshot,
+	 * so we need to be careful to deal with that.
+	 */
+	if (running->xcnt == 0)
+	{
+		if (builder->transactions_after == InvalidXLogRecPtr ||
+			builder->transactions_after < lsn)
+			builder->transactions_after = lsn;
+
+		builder->xmin = running->oldestRunningXid;
+		builder->xmax = running->latestCompletedXid;
+		TransactionIdAdvance(builder->xmax);
+
+		Assert(TransactionIdIsNormal(builder->xmin));
+		Assert(TransactionIdIsNormal(builder->xmax));
+
+		/* no transactions running now */
+		builder->running.xcnt = 0;
+		builder->running.xmin = InvalidTransactionId;
+		builder->running.xmax = InvalidTransactionId;
+
+		/*
+		 * FIXME: abort everything we have stored about running transactions,
+		 * relevant e.g. after a crash.
+		 */
+		builder->state = SNAPBUILD_CONSISTENT;
+
+		elog(LOG, "found initial snapshot (xmin %u) due to running xacts with xcnt == 0",
+			 builder->xmin);
+
+		return false;
+	}
+	/* c) valid on disk state */
+	else if (SnapBuildRestore(builder, lsn))
+	{
+		/* there won't be any state to cleanup */
+		return false;
+	}
+
+	/*
+	 * b) first encounter of a usable xl_running_xacts record. If we had found
+	 * one earlier we would either track running transactions
+	 * (i.e. builder->running.xcnt != 0) or be consistent (this function
+	 * wouldn't get called).
+	 */
+	else if (!builder->running.xcnt)
+	{
+		/*
+		 * We only care about toplevel xids as those are the ones we definitely
+		 * see in the wal stream. As snapbuild.c tracks committed instead of
+		 * running transactions we don't need to know anything about
+		 * uncommitted subtransactions.
+		 */
+		builder->xmin = running->oldestRunningXid;
+		builder->xmax = running->latestCompletedXid;
+		TransactionIdAdvance(builder->xmax);
+
+		/* so we can safely use the faster comparisons */
+		Assert(TransactionIdIsNormal(builder->xmin));
+		Assert(TransactionIdIsNormal(builder->xmax));
+
+		builder->running.xcnt = running->xcnt;
+		builder->running.xcnt_space = running->xcnt;
+		builder->running.xip =
+			MemoryContextAlloc(builder->context,
+							builder->running.xcnt * sizeof(TransactionId));
+		memcpy(builder->running.xip, running->xids,
+			   builder->running.xcnt * sizeof(TransactionId));
+
+		/* sort so we can do a binary search */
+		qsort(builder->running.xip, builder->running.xcnt,
+			  sizeof(TransactionId), xidComparator);
+
+		builder->running.xmin = builder->running.xip[0];
+		builder->running.xmax = builder->running.xip[running->xcnt - 1];
+
+		/* makes comparisons cheaper later */
+		TransactionIdRetreat(builder->running.xmin);
+		TransactionIdAdvance(builder->running.xmax);
+
+		builder->state = SNAPBUILD_FULL_SNAPSHOT;
+
+		elog(LOG, "found initial snapshot (xmin %u) due to running xacts, %u xacts need to finish",
+			 builder->xmin, (uint32) builder->running.xcnt);
+
+		/* nothing could have built up so far */
+		return false;
+	}
+
+	/*
+	 * We already started to track running xacts and need to wait for all
+	 * in-progress ones to finish. We fall through to the normal processing of
+	 * records so incremental cleanup can be performed.
+	 */
+	return true;
+}
+
+
+/* -----------------------------------
+ * Snapshot serialization support
+ * -----------------------------------
+ */
+
+/*
+ * We store the current state of struct SnapBuild on disk in the following manner:
+ *
+ * struct SnapBuildOnDisk;
+ * TransactionId * running.xcnt_space;
+ * TransactionId * committed.xcnt; (*not xcnt_space*)
+ *
+ */
+typedef struct SnapBuildOnDisk
+{
+	uint32		magic;
+	/* how large is the SnapBuildOnDisk including all data in state */
+	Size		size;
+	SnapBuild	builder;
+
+	/* XXX: Should we store a CRC32? */
+
+	/* variable amount of TransactionId's */
+} SnapBuildOnDisk;
+
+#define SNAPBUILD_MAGIC 0x51A1E001
+
+/*
+ * Store/Load a snapshot from disk, depending on the snapshot builder's state.
+ *
+ * Supposed to be used by external (i.e. not snapbuild.c) code that just read a
+ * record that's a potential location for a serialized snapshot.
+ */
+void
+SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
+{
+	if (builder->state < SNAPBUILD_CONSISTENT)
+		SnapBuildRestore(builder, lsn);
+	else
+		SnapBuildSerialize(builder, lsn);
+}
+
+/*
+ * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
+ * been done by another decoding process.
+ */
+static void
+SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
+{
+	Size		needed_size;
+	SnapBuildOnDisk *ondisk;
+	char	   *ondisk_c;
+	int			fd;
+	char		tmppath[MAXPGPATH];
+	char		path[MAXPGPATH];
+	int			ret;
+	struct stat stat_buf;
+
+	needed_size = sizeof(SnapBuildOnDisk) +
+		sizeof(TransactionId) * builder->running.xcnt_space +
+		sizeof(TransactionId) * builder->committed.xcnt;
+
+	Assert(lsn != InvalidXLogRecPtr);
+	Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
+		   builder->last_serialized_snapshot <= lsn);
+
+	/*
+	 * no point in serializing if we cannot continue to work immediately after
+	 * restoring the snapshot
+	 */
+	if (builder->state < SNAPBUILD_CONSISTENT)
+		return;
+
+	/*
+	 * FIXME: Timeline handling/naming.
+	 */
+
+	/*
+	 * first check whether some other backend already has written the snapshot
+	 * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
+	 * as a valid state. Everything else is an unexpected error.
+	 */
+	sprintf(path, "pg_llog/snapshots/%X-%X.snap",
+			(uint32) (lsn >> 32), (uint32) lsn);
+
+	ret = stat(path, &stat_buf);
+
+	if (ret != 0 && errno != ENOENT)
+		ereport(ERROR, (errmsg("could not stat snapbuild state file %s", path)));
+	else if (ret == 0)
+	{
+		/*
+		 * somebody else has already serialized to this point, don't overwrite
+		 * but remember location, so we don't need to read old data again.
+		 *
+		 * FIXME: Is it safe to set this as restartpoint below? While we can
+		 * see the file it's not guaranteed to persist after a crash...
+		 */
+		builder->last_serialized_snapshot = lsn;
+		goto out;
+	}
+
+	/*
+	 * there is an obvious race condition here between the time we stat(2) the
+	 * file and us writing the file. But we rename the file into place
+	 * atomically and all files created need to contain the same data anyway,
+	 * so this is perfectly fine, although a bit of a resource waste. Locking
+	 * seems like pointless complication.
+	 */
+	elog(DEBUG1, "serializing snapshot to %s", path);
+
+	/* to make sure only we will write to this tempfile, include pid */
+	sprintf(tmppath, "pg_llog/snapshots/%X-%X.snap.%u.tmp",
+			(uint32) (lsn >> 32), (uint32) lsn, MyProcPid);
+
+	/*
+	 * Unlink the temporary file if it already exists; it can only be a
+	 * leftover from an earlier crash/error, since we won't enter this function
+	 * twice from within a single decoding slot/backend and the temporary file
+	 * contains the pid of the current process.
+	 */
+	if (unlink(tmppath) != 0 && errno != ENOENT)
+		ereport(ERROR, (errmsg("could not unlink old snapbuild state file %s", tmppath)));
+
+	ondisk = MemoryContextAllocZero(builder->context, needed_size);
+	ondisk_c = ((char *) ondisk) + sizeof(SnapBuildOnDisk);
+	ondisk->magic = SNAPBUILD_MAGIC;
+	ondisk->size = needed_size;
+
+	/* copy state via struct assignment */
+	ondisk->builder = *builder;
+
+	/* NULL-ify memory-only data */
+	ondisk->builder.context = NULL;
+	ondisk->builder.snapshot = NULL;
+	ondisk->builder.reorder = NULL;
+
+	/* copy running xacts */
+	memcpy(ondisk_c, builder->running.xip,
+		   sizeof(TransactionId) * builder->running.xcnt_space);
+	ondisk_c += sizeof(TransactionId) * builder->running.xcnt_space;
+
+	/* copy committed xacts */
+	memcpy(ondisk_c, builder->committed.xip,
+		   sizeof(TransactionId) * builder->committed.xcnt);
+	ondisk_c += sizeof(TransactionId) * builder->committed.xcnt;
+
+	/* we have valid data now, open tempfile and write it there */
+	fd = OpenTransientFile(tmppath,
+						   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+						   S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		ereport(ERROR, (errmsg("could not open snapbuild state file %s for writing: %m", tmppath)));
+
+	if ((write(fd, ondisk, needed_size)) != needed_size)
+	{
+		CloseTransientFile(fd);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to snapbuild state file \"%s\": %m",
+						tmppath)));
+	}
+
+	/*
+	 * fsync the file before renaming so that even if we crash after this we
+	 * have either a fully valid file or nothing.
+	 *
+	 * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
+	 * some noticeable overhead since it's performed synchronously during
+	 * decoding?
+	 */
+	if (pg_fsync(fd) != 0)
+	{
+		CloseTransientFile(fd);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync snapbuild state file \"%s\": %m",
+						tmppath)));
+	}
+
+	CloseTransientFile(fd);
+
+	/*
+	 * We may overwrite the work from some other backend, but that's ok, our
+	 * snapshot is valid as well.
+	 */
+	if (rename(tmppath, path) != 0)
+	{
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not rename snapbuild state file from \"%s\" to \"%s\": %m",
+						tmppath, path)));
+	}
+
+	/* make sure we persist */
+	fsync_fname(path, false);
+	fsync_fname("pg_llog/snapshots", true);
+
+	/*
+	 * now there's no way we lose the dumped state anymore, remember
+	 * serialization point.
+	 */
+	builder->last_serialized_snapshot = lsn;
+
+out:
+	ReorderBufferSetRestartPoint(builder->reorder,
+								 builder->last_serialized_snapshot);
+}
+
+/*
+ * Restore a snapshot into 'builder' if one has previously been stored at the
+ * location indicated by 'lsn'. Returns true if successful, false otherwise.
+ */
+static bool
+SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
+{
+	SnapBuildOnDisk ondisk;
+	int			fd;
+	char		path[MAXPGPATH];
+	Size		sz;
+
+	/* no point in loading a snapshot if we're already there */
+	if (builder->state == SNAPBUILD_CONSISTENT)
+		return false;
+
+	sprintf(path, "pg_llog/snapshots/%X-%X.snap",
+			(uint32) (lsn >> 32), (uint32) lsn);
+
+	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+
+	elog(LOG, "restoring snapbuild state from %s", path);
+
+	if (fd < 0 && errno == ENOENT)
+		return false;
+	else if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open snapbuild state file \"%s\": %m", path)));
+
+	elog(LOG, "really restoring from %s", path);
+
+	/* read statically sized portion of snapshot */
+	if (read(fd, &ondisk, sizeof(ondisk)) != sizeof(ondisk))
+	{
+		CloseTransientFile(fd);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read snapbuild file \"%s\": %m",
+						path)));
+	}
+
+	if (ondisk.magic != SNAPBUILD_MAGIC)
+		ereport(ERROR, (errmsg("snapbuild state file has wrong magic %u instead of %u",
+							   ondisk.magic, SNAPBUILD_MAGIC)));
+
+	/* restore running xact information */
+	sz = sizeof(TransactionId) * ondisk.builder.running.xcnt_space;
+	ondisk.builder.running.xip = MemoryContextAlloc(builder->context, sz);
+	if (read(fd, ondisk.builder.running.xip, sz) != sz)
+	{
+		CloseTransientFile(fd);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+		errmsg("could not read running xacts from snapbuild file \"%s\": %m",
+			   path)));
+	}
+
+	/* restore committed xact information */
+	sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
+	ondisk.builder.committed.xip = MemoryContextAlloc(builder->context, sz);
+	if (read(fd, ondisk.builder.committed.xip, sz) != sz)
+	{
+		CloseTransientFile(fd);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read committed xacts from snapbuild file \"%s\": %m",
+						path)));
+	}
+
+	CloseTransientFile(fd);
+
+	/*
+	 * ok, we now have a sensible snapshot here, figure out if it has more
+	 * information than we have.
+	 */
+
+	/*
+	 * We are only interested in consistent snapshots for now; comparing
+	 * whether one incomplete snapshot is more "advanced" than another seems
+	 * unnecessarily complex.
+	 */
+	if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
+		goto snapshot_not_interesting;
+
+	/*
+	 * Don't use a snapshot that requires an xmin that we cannot guarantee to
+	 * be available.
+	 */
+	if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
+		goto snapshot_not_interesting;
+
+	/*
+	 * XXX: transactions_after needs to be updated differently, to be checked
+	 * here
+	 */
+
+	/* ok, we think the snapshot is sensible, copy over everything important */
+	builder->xmin = ondisk.builder.xmin;
+	builder->xmax = ondisk.builder.xmax;
+	builder->state = ondisk.builder.state;
+
+	builder->committed.xcnt = ondisk.builder.committed.xcnt;
+	/* We only allocated/stored xcnt, not xcnt_space xids! */
+	/* don't overwrite preallocated xip, if we don't have anything here */
+	if (builder->committed.xcnt > 0)
+	{
+		pfree(builder->committed.xip);
+		builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
+		builder->committed.xip = ondisk.builder.committed.xip;
+	}
+	ondisk.builder.committed.xip = NULL;
+
+	builder->running.xcnt = ondisk.builder.running.xcnt;
+	if (builder->running.xip)
+		pfree(builder->running.xip);
+	builder->running.xcnt_space = ondisk.builder.running.xcnt_space;
+	builder->running.xip = ondisk.builder.running.xip;
+
+	/* our snapshot is not interesting anymore, build a new one */
+	if (builder->snapshot != NULL)
+	{
+		SnapBuildSnapDecRefcount(builder->snapshot);
+	}
+	builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId);
+	SnapBuildSnapIncRefcount(builder->snapshot);
+
+	ReorderBufferSetRestartPoint(builder->reorder, lsn);
+
+	Assert(builder->state == SNAPBUILD_CONSISTENT);
+	elog(LOG, "recovered initial snapshot (xmin %u) from disk", builder->xmin);
+
+	return true;
+
+snapshot_not_interesting:
+	if (ondisk.builder.running.xip != NULL)
+		pfree(ondisk.builder.running.xip);
+	if (ondisk.builder.committed.xip != NULL)
+		pfree(ondisk.builder.committed.xip);
+	return false;
+}
diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y
index 8c83780..0d64156 100644
--- a/src/backend/replication/repl_gram.y
+++ b/src/backend/replication/repl_gram.y
@@ -65,7 +65,7 @@ Node *replication_parse_result;
 }
 
 /* Non-keyword tokens */
-%token <str> SCONST
+%token <str> SCONST IDENT
 %token <uintval> UCONST
 %token <recptr> RECPTR
 
@@ -73,6 +73,9 @@ Node *replication_parse_result;
 %token K_BASE_BACKUP
 %token K_IDENTIFY_SYSTEM
 %token K_START_REPLICATION
+%token K_INIT_LOGICAL_REPLICATION
+%token K_START_LOGICAL_REPLICATION
+%token K_FREE_LOGICAL_REPLICATION
 %token K_TIMELINE_HISTORY
 %token K_LABEL
 %token K_PROGRESS
@@ -82,10 +85,13 @@ Node *replication_parse_result;
 %token K_TIMELINE
 
 %type <node>	command
-%type <node>	base_backup start_replication identify_system timeline_history
+%type <node>	base_backup start_replication start_logical_replication init_logical_replication free_logical_replication identify_system timeline_history
 %type <list>	base_backup_opt_list
 %type <defelt>	base_backup_opt
 %type <uintval>	opt_timeline
+%type <list>	plugin_options plugin_opt_list
+%type <defelt>	plugin_opt_elem
+%type <node>	plugin_opt_arg
 %%
 
 firstcmd: command opt_semicolon
@@ -102,6 +108,9 @@ command:
 			identify_system
 			| base_backup
 			| start_replication
+			| init_logical_replication
+			| start_logical_replication
+			| free_logical_replication
 			| timeline_history
 			;
 
@@ -186,6 +195,67 @@ opt_timeline:
 				| /* nothing */			{ $$ = 0; }
 			;
 
+init_logical_replication:
+			K_INIT_LOGICAL_REPLICATION IDENT IDENT
+				{
+					InitLogicalReplicationCmd *cmd;
+					cmd = makeNode(InitLogicalReplicationCmd);
+					cmd->name = $2;
+					cmd->plugin = $3;
+					$$ = (Node *) cmd;
+				}
+			;
+
+start_logical_replication:
+			K_START_LOGICAL_REPLICATION IDENT RECPTR plugin_options
+				{
+					StartLogicalReplicationCmd *cmd;
+					cmd = makeNode(StartLogicalReplicationCmd);
+					cmd->name = $2;
+					cmd->startpoint = $3;
+					cmd->options = $4;
+					$$ = (Node *) cmd;
+				}
+			;
+
+plugin_options:
+			'(' plugin_opt_list ')'			{ $$ = $2; }
+			| /* EMPTY */					{ $$ = NIL; }
+		;
+
+plugin_opt_list:
+			plugin_opt_elem
+				{
+					$$ = list_make1($1);
+				}
+			| plugin_opt_list ',' plugin_opt_elem
+				{
+					$$ = lappend($1, $3);
+				}
+		;
+
+plugin_opt_elem:
+			IDENT plugin_opt_arg
+				{
+					$$ = makeDefElem($1, $2);
+				}
+		;
+
+plugin_opt_arg:
+			SCONST							{ $$ = (Node *) makeString($1); }
+			| /* EMPTY */					{ $$ = NULL; }
+		;
+
+free_logical_replication:
+			K_FREE_LOGICAL_REPLICATION IDENT
+				{
+					FreeLogicalReplicationCmd *cmd;
+					cmd = makeNode(FreeLogicalReplicationCmd);
+					cmd->name = $2;
+					$$ = (Node *) cmd;
+				}
+			;
+
 /*
  * TIMELINE_HISTORY %d
  */
@@ -205,6 +275,7 @@ timeline_history:
 					$$ = (Node *) cmd;
 				}
 			;
+
 %%
 
 #include "repl_scanner.c"
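
For illustration, with the grammar above a client on a replication connection
could issue commands along these lines (the slot name, plugin name, option and
LSN are invented placeholders; in practice the LSN would be whatever
INIT_LOGICAL_REPLICATION reported as its consistent point):

    INIT_LOGICAL_REPLICATION "myslot" "myplugin";
    START_LOGICAL_REPLICATION "myslot" 0/190001B0 (include_xids 'on');
    FREE_LOGICAL_REPLICATION "myslot";

Double-quoted names are scanned by the new double-quote rules added to
repl_scanner.l below and returned as IDENT, option values are ordinary SCONST
strings, and the option list may be omitted entirely.
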
diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l
index 3d930f1..2b0f2ff 100644
--- a/src/backend/replication/repl_scanner.l
+++ b/src/backend/replication/repl_scanner.l
@@ -16,6 +16,7 @@
 #include "postgres.h"
 
 #include "utils/builtins.h"
+#include "parser/scansup.h"
 
 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
 #undef fprintf
@@ -48,7 +49,7 @@ static void addlitchar(unsigned char ychar);
 %option warn
 %option prefix="replication_yy"
 
-%x xq
+%x xq xd
 
 /* Extended quote
  * xqdouble implements embedded quote, ''''
@@ -57,12 +58,26 @@ xqstart			{quote}
 xqdouble		{quote}{quote}
 xqinside		[^']+
 
+/* Double quote
+ * Allows embedded spaces and other special characters into identifiers.
+ */
+dquote			\"
+xdstart			{dquote}
+xdstop			{dquote}
+xddouble		{dquote}{dquote}
+xdinside		[^"]+
+
 digit			[0-9]+
 hexdigit		[0-9A-Za-z]+
 
 quote			'
 quotestop		{quote}
 
+ident_start		[A-Za-z\200-\377_]
+ident_cont		[A-Za-z\200-\377_0-9\$]
+
+identifier		{ident_start}{ident_cont}*
+
 %%
 
 BASE_BACKUP			{ return K_BASE_BACKUP; }
@@ -74,9 +89,14 @@ PROGRESS			{ return K_PROGRESS; }
 WAL			{ return K_WAL; }
 TIMELINE			{ return K_TIMELINE; }
 START_REPLICATION	{ return K_START_REPLICATION; }
+INIT_LOGICAL_REPLICATION	{ return K_INIT_LOGICAL_REPLICATION; }
+START_LOGICAL_REPLICATION	{ return K_START_LOGICAL_REPLICATION; }
+FREE_LOGICAL_REPLICATION	{ return K_FREE_LOGICAL_REPLICATION; }
 TIMELINE_HISTORY	{ return K_TIMELINE_HISTORY; }
 ","				{ return ','; }
 ";"				{ return ';'; }
+"("				{ return '('; }
+")"				{ return ')'; }
 
 [\n]			;
 [\t]			;
@@ -100,20 +120,49 @@ TIMELINE_HISTORY	{ return K_TIMELINE_HISTORY; }
 					BEGIN(xq);
 					startlit();
 				}
+
 <xq>{quotestop}	{
 					yyless(1);
 					BEGIN(INITIAL);
 					yylval.str = litbufdup();
 					return SCONST;
 				}
-<xq>{xqdouble} {
+
+<xq>{xqdouble}	{
 					addlitchar('\'');
 				}
+
 <xq>{xqinside}  {
 					addlit(yytext, yyleng);
 				}
 
-<xq><<EOF>>		{ yyerror("unterminated quoted string"); }
+{xdstart}		{
+					BEGIN(xd);
+					startlit();
+				}
+
+<xd>{xdstop}	{
+					int len;
+					yyless(1);
+					BEGIN(INITIAL);
+					yylval.str = litbufdup();
+					len = strlen(yylval.str);
+					truncate_identifier(yylval.str, len, true);
+					return IDENT;
+				}
+
+<xd>{xdinside}  {
+					addlit(yytext, yyleng);
+				}
+
+{identifier}	{
+					int len = strlen(yytext);
+
+					yylval.str = downcase_truncate_identifier(yytext, len, true);
+					return IDENT;
+				}
+
+<xq,xd><<EOF>>	{ yyerror("unterminated quoted string"); }
 
 
 <<EOF>>			{
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index 413f0b9..e73f566 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -1137,7 +1137,7 @@ XLogWalRcvSendHSFeedback(bool immed)
 	 * everything else has been checked.
 	 */
 	if (hot_standby_feedback)
-		xmin = GetOldestXmin(true, false);
+		xmin = GetOldestXmin(true, true, false, false);
 	else
 		xmin = InvalidTransactionId;
 
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index b00a91a..2187d96 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -45,9 +45,8 @@
 
 #include "access/timeline.h"
 #include "access/transam.h"
-#include "access/xlog_internal.h"
 #include "access/xact.h"
-
+#include "access/xlog_internal.h"
 #include "catalog/pg_type.h"
 #include "commands/dbcommands.h"
 #include "funcapi.h"
@@ -56,6 +55,10 @@
 #include "miscadmin.h"
 #include "nodes/replnodes.h"
 #include "replication/basebackup.h"
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
+#include "replication/snapbuild.h"
 #include "replication/syncrep.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
@@ -157,6 +160,9 @@ static bool ping_sent = false;
 static bool streamingDoneSending;
 static bool streamingDoneReceiving;
 
+/* Has the sender caught up, i.e. sent all WAL that is currently available? */
+static bool		WalSndCaughtUp = false;
+
 /* Flags set by signal handlers for later service in main loop */
 static volatile sig_atomic_t got_SIGHUP = false;
 static volatile sig_atomic_t walsender_ready_to_stop = false;
@@ -169,24 +175,42 @@ static volatile sig_atomic_t walsender_ready_to_stop = false;
  */
 static volatile sig_atomic_t replication_active = false;
 
+/* XXX: state used for logical decoding in this walsender */
+static MemoryContext decoding_ctx = NULL;
+static MemoryContext old_decoding_ctx = NULL;
+
+static LogicalDecodingContext *logical_decoding_ctx = NULL;
+static XLogRecPtr  logical_startptr = InvalidXLogRecPtr;
+
 /* Signal handlers */
 static void WalSndSigHupHandler(SIGNAL_ARGS);
 static void WalSndXLogSendHandler(SIGNAL_ARGS);
 static void WalSndLastCycleHandler(SIGNAL_ARGS);
 
 /* Prototypes for private functions */
-static void WalSndLoop(void);
+typedef void (*WalSndSendData)(void);
+static void WalSndLoop(WalSndSendData send_data);
 static void InitWalSenderSlot(void);
 static void WalSndKill(int code, Datum arg);
-static void XLogSend(bool *caughtup);
+static void XLogSendPhysical(void);
+static void XLogSendLogical(void);
+static void WalSndDone(WalSndSendData send_data);
 static XLogRecPtr GetStandbyFlushRecPtr(void);
 static void IdentifySystem(void);
 static void StartReplication(StartReplicationCmd *cmd);
+static void InitLogicalReplication(InitLogicalReplicationCmd *cmd);
+static void StartLogicalReplication(StartLogicalReplicationCmd *cmd);
+static void FreeLogicalReplication(FreeLogicalReplicationCmd *cmd);
 static void ProcessStandbyMessage(void);
 static void ProcessStandbyReplyMessage(void);
 static void ProcessStandbyHSFeedbackMessage(void);
 static void ProcessRepliesIfAny(void);
 static void WalSndKeepalive(bool requestReply);
+static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid);
+static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid);
+static void XLogRead(char *buf, XLogRecPtr startptr, Size count);
+
+
 
 
 /* Initialize walsender process before entering the main command loop */
@@ -247,14 +271,13 @@ IdentifySystem(void)
 	char		tli[11];
 	char		xpos[MAXFNAMELEN];
 	XLogRecPtr	logptr;
-	char*        dbname = NULL;
+	char	   *dbname = NULL;
 
 	/*
 	 * Reply with a result set with one row, four columns. First col is system
 	 * ID, second is timeline ID, third is current xlog location and the fourth
 	 * contains the database name if we are connected to one.
 	 */
-
 	snprintf(sysid, sizeof(sysid), UINT64_FORMAT,
 			 GetSystemIdentifier());
 
@@ -308,22 +331,22 @@ IdentifySystem(void)
 	pq_sendint(&buf, 0, 2);		/* format code */
 
 	/* third field */
-	pq_sendstring(&buf, "xlogpos");
-	pq_sendint(&buf, 0, 4);
-	pq_sendint(&buf, 0, 2);
-	pq_sendint(&buf, TEXTOID, 4);
-	pq_sendint(&buf, -1, 2);
-	pq_sendint(&buf, 0, 4);
-	pq_sendint(&buf, 0, 2);
+	pq_sendstring(&buf, "xlogpos");	/* col name */
+	pq_sendint(&buf, 0, 4);		/* table oid */
+	pq_sendint(&buf, 0, 2);		/* attnum */
+	pq_sendint(&buf, TEXTOID, 4);		/* type oid */
+	pq_sendint(&buf, -1, 2);		/* typlen */
+	pq_sendint(&buf, 0, 4);		/* typmod */
+	pq_sendint(&buf, 0, 2);		/* format code */
 
 	/* fourth field */
-	pq_sendstring(&buf, "dbname");
-	pq_sendint(&buf, 0, 4);
-	pq_sendint(&buf, 0, 2);
-	pq_sendint(&buf, TEXTOID, 4);
-	pq_sendint(&buf, -1, 2);
-	pq_sendint(&buf, 0, 4);
-	pq_sendint(&buf, 0, 2);
+	pq_sendstring(&buf, "dbname");	/* col name */
+	pq_sendint(&buf, 0, 4);		/* table oid */
+	pq_sendint(&buf, 0, 2);		/* attnum */
+	pq_sendint(&buf, TEXTOID, 4);		/* type oid */
+	pq_sendint(&buf, -1, 2);		/* typlen */
+	pq_sendint(&buf, 0, 4);		/* typmod */
+	pq_sendint(&buf, 0, 2);		/* format code */
 	pq_endmessage(&buf);
 
 	/* Send a DataRow message */
@@ -335,9 +358,16 @@ IdentifySystem(void)
 	pq_sendbytes(&buf, (char *) tli, strlen(tli));
 	pq_sendint(&buf, strlen(xpos), 4);	/* col3 len */
 	pq_sendbytes(&buf, (char *) xpos, strlen(xpos));
-	pq_sendint(&buf, strlen(dbname), 4);	/* col4 len */
-	pq_sendbytes(&buf, (char *) dbname, strlen(dbname));
-
+	/* send NULL if not connected to a database */
+	if (dbname)
+	{
+		pq_sendint(&buf, strlen(dbname), 4);	/* col4 len */
+		pq_sendbytes(&buf, (char *) dbname, strlen(dbname));
+	}
+	else
+	{
+		pq_sendint(&buf, -1, 4);	/* col4 len */
+	}
 	pq_endmessage(&buf);
 }
 
@@ -586,7 +616,7 @@ StartReplication(StartReplicationCmd *cmd)
 		/* Main loop of walsender */
 		replication_active = true;
 
-		WalSndLoop();
+		WalSndLoop(XLogSendPhysical);
 
 		replication_active = false;
 		if (walsender_ready_to_stop)
@@ -653,6 +683,497 @@ StartReplication(StartReplicationCmd *cmd)
 	pq_puttextmessage('C', "START_STREAMING");
 }
 
+static int
+replay_read_page(XLogReaderState* state, XLogRecPtr targetPagePtr, int reqLen,
+				 XLogRecPtr targetRecPtr, char* cur_page, TimeLineID *pageTLI)
+{
+	XLogRecPtr flushptr;
+	int		count;
+
+	flushptr = WalSndWaitForWal(targetPagePtr + reqLen);
+
+	/* more than one block available */
+	if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
+		count = XLOG_BLCKSZ;
+	/* not enough data there */
+	else if (targetPagePtr + reqLen > flushptr)
+		return -1;
+	/* part of the page available */
+	else
+		count = flushptr - targetPagePtr;
+
+	/* FIXME: more sensible/efficient implementation */
+	XLogRead(cur_page, targetPagePtr, XLOG_BLCKSZ);
+
+	return count;
+}
+
+/*
+ * Initialize logical replication and wait for an initial consistent point to
+ * start sending changes from.
+ */
+static void
+InitLogicalReplication(InitLogicalReplicationCmd *cmd)
+{
+	const char *slot_name;
+	StringInfoData buf;
+	char		xpos[MAXFNAMELEN];
+	const char *snapshot_name = NULL;
+	LogicalDecodingContext *ctx;
+	XLogRecPtr startptr;
+
+	CheckLogicalReplicationRequirements();
+
+	Assert(!MyLogicalDecodingSlot);
+
+	/* XXX apply sanity checking to slot name? */
+	LogicalDecodingAcquireFreeSlot(cmd->name, cmd->plugin);
+
+	Assert(MyLogicalDecodingSlot);
+
+	decoding_ctx = AllocSetContextCreate(TopMemoryContext,
+										 "decoding context",
+										 ALLOCSET_DEFAULT_MINSIZE,
+										 ALLOCSET_DEFAULT_INITSIZE,
+										 ALLOCSET_DEFAULT_MAXSIZE);
+	old_decoding_ctx = MemoryContextSwitchTo(decoding_ctx);
+
+	/* setup state for XLogReadPage */
+	sendTimeLineIsHistoric = false;
+	sendTimeLine = ThisTimeLineID;
+
+	initStringInfo(&output_message);
+	ctx = CreateLogicalDecodingContext(MyLogicalDecodingSlot, false, InvalidXLogRecPtr,
+									   NIL,	replay_read_page,
+									   WalSndPrepareWrite, WalSndWriteData);
+
+	MemoryContextSwitchTo(old_decoding_ctx);
+
+	startptr = MyLogicalDecodingSlot->restart_decoding;
+
+	elog(WARNING, "Initiating logical replication from %X/%X",
+		 (uint32)(startptr >> 32), (uint32)startptr);
+
+	for (;;)
+	{
+		XLogRecord *record;
+		XLogRecordBuffer buf;
+		char *err = NULL;
+
+		/* the read_page callback waits for new WAL */
+		record = XLogReadRecord(ctx->reader, startptr, &err);
+		/* xlog record was invalid */
+		if (err)
+			elog(ERROR, "%s", err);
+
+		/* read up from last position next time round */
+		startptr = InvalidXLogRecPtr;
+
+		Assert(record);
+
+		buf.origptr = ctx->reader->ReadRecPtr;
+		buf.endptr = ctx->reader->EndRecPtr;
+		buf.record = *record;
+		buf.record_data = XLogRecGetData(record);
+		DecodeRecordIntoReorderBuffer(ctx, &buf);
+
+		/* only continue till we found a consistent spot */
+		if (LogicalDecodingContextReady(ctx))
+		{
+			/* export plain, importable, snapshot to the user */
+			snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder);
+			break;
+		}
+	}
+
+	MyLogicalDecodingSlot->confirmed_flush = ctx->reader->EndRecPtr;
+	slot_name = NameStr(MyLogicalDecodingSlot->name);
+	snprintf(xpos, sizeof(xpos), "%X/%X",
+			 (uint32) (MyLogicalDecodingSlot->confirmed_flush >> 32),
+			 (uint32) MyLogicalDecodingSlot->confirmed_flush);
+
+	pq_beginmessage(&buf, 'T');
+	pq_sendint(&buf, 4, 2);		/* 4 fields */
+
+	/* first field */
+	pq_sendstring(&buf, "replication_id");	/* col name */
+	pq_sendint(&buf, 0, 4);		/* table oid */
+	pq_sendint(&buf, 0, 2);		/* attnum */
+	pq_sendint(&buf, TEXTOID, 4);		/* type oid */
+	pq_sendint(&buf, -1, 2);	/* typlen */
+	pq_sendint(&buf, 0, 4);		/* typmod */
+	pq_sendint(&buf, 0, 2);		/* format code */
+
+	pq_sendstring(&buf, "consistent_point");	/* col name */
+	pq_sendint(&buf, 0, 4);		/* table oid */
+	pq_sendint(&buf, 0, 2);		/* attnum */
+	pq_sendint(&buf, TEXTOID, 4);		/* type oid */
+	pq_sendint(&buf, -1, 2);	/* typlen */
+	pq_sendint(&buf, 0, 4);		/* typmod */
+	pq_sendint(&buf, 0, 2);		/* format code */
+
+	pq_sendstring(&buf, "snapshot_name");	/* col name */
+	pq_sendint(&buf, 0, 4);		/* table oid */
+	pq_sendint(&buf, 0, 2);		/* attnum */
+	pq_sendint(&buf, TEXTOID, 4);		/* type oid */
+	pq_sendint(&buf, -1, 2);	/* typlen */
+	pq_sendint(&buf, 0, 4);		/* typmod */
+	pq_sendint(&buf, 0, 2);		/* format code */
+
+	pq_sendstring(&buf, "plugin");	/* col name */
+	pq_sendint(&buf, 0, 4);		/* table oid */
+	pq_sendint(&buf, 0, 2);		/* attnum */
+	pq_sendint(&buf, TEXTOID, 4);		/* type oid */
+	pq_sendint(&buf, -1, 2);	/* typlen */
+	pq_sendint(&buf, 0, 4);		/* typmod */
+	pq_sendint(&buf, 0, 2);		/* format code */
+
+	pq_endmessage(&buf);
+
+	/* Send a DataRow message */
+	pq_beginmessage(&buf, 'D');
+	pq_sendint(&buf, 4, 2);		/* # of columns */
+
+	/* replication_id */
+	pq_sendint(&buf, strlen(slot_name), 4); /* col1 len */
+	pq_sendbytes(&buf, slot_name, strlen(slot_name));
+
+	/* consistent wal location */
+	pq_sendint(&buf, strlen(xpos), 4); /* col2 len */
+	pq_sendbytes(&buf, xpos, strlen(xpos));
+
+	/* snapshot name */
+	pq_sendint(&buf, strlen(snapshot_name), 4); /* col3 len */
+	pq_sendbytes(&buf, snapshot_name, strlen(snapshot_name));
+
+	/* plugin */
+	pq_sendint(&buf, strlen(cmd->plugin), 4); /* col4 len */
+	pq_sendbytes(&buf, cmd->plugin, strlen(cmd->plugin));
+
+	pq_endmessage(&buf);
+
+	/*
+	 * release active status again, START_LOGICAL_REPLICATION will reacquire it
+	 */
+	LogicalDecodingReleaseSlot();
+}
+
+/*
+ * Load previously initiated logical slot and prepare for sending data (via
+ * WalSndLoop).
+ */
+static void
+StartLogicalReplication(StartLogicalReplicationCmd *cmd)
+{
+	StringInfoData buf;
+	XLogRecPtr confirmed_flush;
+
+	elog(WARNING, "Starting logical replication from %X/%X",
+		 (uint32)(cmd->startpoint >> 32), (uint32)cmd->startpoint);
+
+	/* make sure that our requirements are still fulfilled */
+	CheckLogicalReplicationRequirements();
+
+	Assert(!MyLogicalDecodingSlot);
+
+	LogicalDecodingReAcquireSlot(cmd->name);
+
+	if (am_cascading_walsender && !RecoveryInProgress())
+	{
+		ereport(LOG,
+				(errmsg("terminating walsender process to force cascaded standby to update timeline and reconnect")));
+		walsender_ready_to_stop = true;
+	}
+
+	WalSndSetState(WALSNDSTATE_CATCHUP);
+
+	/* Send a CopyBothResponse message, and start streaming */
+	pq_beginmessage(&buf, 'W');
+	pq_sendbyte(&buf, 0);
+	pq_sendint(&buf, 0, 2);
+	pq_endmessage(&buf);
+	pq_flush();
+
+	/* setup state for XLogReadPage */
+	sendTimeLineIsHistoric = false;
+	sendTimeLine = ThisTimeLineID;
+
+	confirmed_flush = MyLogicalDecodingSlot->confirmed_flush;
+
+	Assert(confirmed_flush != InvalidXLogRecPtr);
+
+	/* continue from last position */
+	if (cmd->startpoint == InvalidXLogRecPtr)
+		cmd->startpoint = MyLogicalDecodingSlot->confirmed_flush;
+	else if (cmd->startpoint > MyLogicalDecodingSlot->confirmed_flush)
+		elog(ERROR, "cannot stream from %X/%X, minimum is %X/%X",
+			 (uint32)(cmd->startpoint >> 32), (uint32)cmd->startpoint,
+			 (uint32)(confirmed_flush >> 32), (uint32)confirmed_flush);
+
+	/*
+	 * Initialize position to the last ack'ed one, then the xlog records begin
+	 * to be shipped from that position.
+	 */
+	logical_decoding_ctx = CreateLogicalDecodingContext(
+		MyLogicalDecodingSlot, false, cmd->startpoint, cmd->options,
+		replay_read_page, WalSndPrepareWrite, WalSndWriteData);
+
+	/*
+	 * XXX: For feedback purposes it would be nicer to set sentPtr to
+	 * cmd->startpoint, but we use it to know where to read xlog in the main
+	 * loop...
+	 */
+	sentPtr = MyLogicalDecodingSlot->restart_decoding;
+	logical_startptr = sentPtr;
+
+	/* Also update the start position status in shared memory */
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalSnd *walsnd = MyWalSnd;
+
+		SpinLockAcquire(&walsnd->mutex);
+		walsnd->sentPtr = MyLogicalDecodingSlot->restart_decoding;
+		SpinLockRelease(&walsnd->mutex);
+	}
+
+	elog(LOG, "starting to decode from %X/%X, replay %X/%X",
+		 (uint32)(MyWalSnd->sentPtr >> 32), (uint32)MyWalSnd->sentPtr,
+		 (uint32)(cmd->startpoint >> 32), (uint32)cmd->startpoint);
+
+	replication_active = true;
+
+	SyncRepInitConfig();
+
+	/* Main loop of walsender */
+	WalSndLoop(XLogSendLogical);
+
+	LogicalDecodingReleaseSlot();
+
+	replication_active = false;
+	if (walsender_ready_to_stop)
+		proc_exit(0);
+	WalSndSetState(WALSNDSTATE_STARTUP);
+
+	/* Get out of COPY mode (CommandComplete). */
+	EndCommand("COPY 0", DestRemote);
+}
+
+/*
+ * Free permanent state by a now inactive but defined logical slot.
+ */
+static void
+FreeLogicalReplication(FreeLogicalReplicationCmd *cmd)
+{
+	CheckLogicalReplicationRequirements();
+	LogicalDecodingFreeSlot(cmd->name);
+	EndCommand("FREE_LOGICAL_REPLICATION", DestRemote);
+}
+
+/*
+ * LogicalDecodingContext 'prepare_write' callback.
+ *
+ * Prepare a write into a StringInfo.
+ *
+ * Don't do anything lasting in here; it's quite possible that nothing will be
+ * done with the data.
+ */
+static void
+WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid)
+{
+	AssertVariableIsOfType(&WalSndPrepareWrite, LogicalOutputPluginWriterPrepareWrite);
+
+	resetStringInfo(ctx->out);
+
+	pq_sendbyte(ctx->out, 'w');
+	pq_sendint64(ctx->out, lsn);	/* dataStart */
+	/* XXX: overwrite when data is assembled */
+	pq_sendint64(ctx->out, lsn);	/* walEnd */
+	/* XXX: gather that value later just as it's done in XLogSendPhysical */
+	pq_sendint64(ctx->out, 0 /*GetCurrentIntegerTimestamp() */);/* sendtime */
+}
+
+/*
+ * LogicalDecodingContext 'write' callback.
+ *
+ * Actually write out data previously prepared by WalSndPrepareWrite to the
+ * network, taking as long as needed but processing replies from the other
+ * side while doing so.
+ */
+static void
+WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid)
+{
+	AssertVariableIsOfType(&WalSndWriteData, LogicalOutputPluginWriterWrite);
+
+	/* output previously gathered data in a CopyData packet */
+	pq_putmessage_noblock('d', ctx->out->data, ctx->out->len);
+
+	/* fast path */
+	/* Try to flush pending output to the client */
+	if (pq_flush_if_writable() != 0)
+		return;
+
+	if (!pq_is_send_pending())
+		return;
+
+	for (;;)
+	{
+		int			wakeEvents;
+		long		sleeptime = 10000;		/* 10s */
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (!PostmasterIsAlive())
+			exit(1);
+
+		/* Process any requests or signals received recently */
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+			SyncRepInitConfig();
+		}
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Check for input from the client */
+		ProcessRepliesIfAny();
+
+		/* Clear any already-pending wakeups */
+		ResetLatch(&MyWalSnd->latch);
+
+		/* Try to flush pending output to the client */
+		if (pq_flush_if_writable() != 0)
+			break;
+
+		/* If we finished clearing the buffered data, we're done here. */
+		if (!pq_is_send_pending())
+			break;
+
+		/*
+		 * Note we don't set a timeout here.  It would be pointless, because
+		 * if the socket is not writable there's not much we can do elsewhere
+		 * anyway.
+		 */
+		wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
+			WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
+
+		ImmediateInterruptOK = true;
+		CHECK_FOR_INTERRUPTS();
+		WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents,
+						  MyProcPort->sock, sleeptime);
+		ImmediateInterruptOK = false;
+	}
+
+	/* reactivate latch so WalSndLoop knows to continue */
+	SetLatch(&MyWalSnd->latch);
+}
+
+/*
+ * Wait until all WAL up to 'loc' has been flushed to disk so it can be safely read.
+ */
+XLogRecPtr
+WalSndWaitForWal(XLogRecPtr loc)
+{
+	int			wakeEvents;
+	XLogRecPtr  flushptr;
+
+	/* fast path if everything is there already */
+	/*
+	 * XXX: introduce RecentFlushPtr to avoid acquiring the spinlock in the
+	 * fast path case where we already know we have enough WAL available.
+	 */
+	flushptr = GetFlushRecPtr();
+	if (loc <= flushptr)
+		return flushptr;
+
+	for (;;)
+	{
+		long		sleeptime = 10000;		/* 10 s */
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (!PostmasterIsAlive())
+			exit(1);
+
+		/* Process any requests or signals received recently */
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+			SyncRepInitConfig();
+		}
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Check for input from the client */
+		ProcessRepliesIfAny();
+
+		/* Clear any already-pending wakeups */
+		ResetLatch(&MyWalSnd->latch);
+
+		/* Update our idea of flushed position. */
+		flushptr = GetFlushRecPtr();
+
+		/* If postmaster asked us to stop, don't wait here anymore */
+		if (walsender_ready_to_stop)
+			break;
+
+		/* check whether we're done */
+		if (loc <= flushptr)
+			break;
+
+		/* Determine time until replication timeout */
+		if (wal_sender_timeout > 0)
+		{
+			if (!ping_sent)
+			{
+				TimestampTz timeout;
+
+				/*
+				 * If half of wal_sender_timeout has lapsed without receiving
+				 * any reply from standby, send a keep-alive message to standby
+				 * requesting an immediate reply.
+				 */
+				timeout = TimestampTzPlusMilliseconds(last_reply_timestamp,
+													  wal_sender_timeout / 2);
+				if (GetCurrentTimestamp() >= timeout)
+				{
+					WalSndKeepalive(true);
+					ping_sent = true;
+					/* Try to flush pending output to the client */
+					if (pq_flush_if_writable() != 0)
+						break;
+				}
+			}
+
+			sleeptime = 1 + (wal_sender_timeout / 10);
+		}
+
+		wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
+			WL_SOCKET_READABLE | WL_TIMEOUT;
+
+		ImmediateInterruptOK = true;
+		CHECK_FOR_INTERRUPTS();
+		WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents,
+						  MyProcPort->sock, sleeptime);
+		ImmediateInterruptOK = false;
+
+		/*
+		 * The equivalent code in WalSndLoop also checks that the replication
+		 * timeout hasn't been exceeded.  We don't do that here.  XXX explain
+		 * why.
+		 */
+	}
+
+	/* reactivate latch so WalSndLoop knows to continue */
+	SetLatch(&MyWalSnd->latch);
+	return flushptr;
+}
+
 /*
  * Execute an incoming replication command.
  */
@@ -664,6 +1185,12 @@ exec_replication_command(const char *cmd_string)
 	MemoryContext cmd_context;
 	MemoryContext old_context;
 
+	/*
+	 * INIT_LOGICAL_REPLICATION exports a snapshot until the next command
+	 * arrives. Clean up the old stuff if there's anything.
+	 */
+	SnapBuildClearExportedSnapshot();
+
 	elog(DEBUG1, "received replication command: %s", cmd_string);
 
 	CHECK_FOR_INTERRUPTS();
@@ -695,6 +1222,18 @@ exec_replication_command(const char *cmd_string)
 			StartReplication((StartReplicationCmd *) cmd_node);
 			break;
 
+		case T_InitLogicalReplicationCmd:
+			InitLogicalReplication((InitLogicalReplicationCmd *) cmd_node);
+			break;
+
+		case T_StartLogicalReplicationCmd:
+			StartLogicalReplication((StartLogicalReplicationCmd *) cmd_node);
+			break;
+
+		case T_FreeLogicalReplicationCmd:
+			FreeLogicalReplication((FreeLogicalReplicationCmd *) cmd_node);
+			break;
+
 		case T_BaseBackupCmd:
 			SendBaseBackup((BaseBackupCmd *) cmd_node);
 			break;
@@ -904,6 +1443,12 @@ ProcessStandbyReplyMessage(void)
 		SpinLockRelease(&walsnd->mutex);
 	}
 
+	/*
+	 * Advance our local xmin horizon when the client confirmed a flush.
+	 */
+	if (MyLogicalDecodingSlot && flushPtr != InvalidXLogRecPtr)
+		LogicalConfirmReceivedLocation(flushPtr);
+
 	if (!am_cascading_walsender)
 		SyncRepReleaseWaiters();
 }
@@ -988,10 +1533,8 @@ ProcessStandbyHSFeedbackMessage(void)
 
 /* Main loop of walsender process that streams the WAL over Copy messages. */
 static void
-WalSndLoop(void)
+WalSndLoop(WalSndSendData send_data)
 {
-	bool		caughtup = false;
-
 	/*
 	 * Allocate buffers that will be used for each outgoing and incoming
 	 * message.  We do this just once to reduce palloc overhead.
@@ -1043,21 +1586,21 @@ WalSndLoop(void)
 
 		/*
 		 * If we don't have any pending data in the output buffer, try to send
-		 * some more.  If there is some, we don't bother to call XLogSend
+		 * some more.  If there is some, we don't bother to call send_data
 		 * again until we've flushed it ... but we'd better assume we are not
 		 * caught up.
 		 */
 		if (!pq_is_send_pending())
-			XLogSend(&caughtup);
+			send_data();
 		else
-			caughtup = false;
+			WalSndCaughtUp = false;
 
 		/* Try to flush pending output to the client */
 		if (pq_flush_if_writable() != 0)
 			goto send_failure;
 
 		/* If nothing remains to be sent right now ... */
-		if (caughtup && !pq_is_send_pending())
+		if (WalSndCaughtUp && !pq_is_send_pending())
 		{
 			/*
 			 * If we're in catchup state, move to streaming.  This is an
@@ -1083,29 +1626,17 @@ WalSndLoop(void)
 			 * the walsender is not sure which.
 			 */
 			if (walsender_ready_to_stop)
-			{
-				/* ... let's just be real sure we're caught up ... */
-				XLogSend(&caughtup);
-				if (caughtup && sentPtr == MyWalSnd->flush &&
-					!pq_is_send_pending())
-				{
-					/* Inform the standby that XLOG streaming is done */
-					EndCommand("COPY 0", DestRemote);
-					pq_flush();
-
-					proc_exit(0);
-				}
-			}
+				WalSndDone(send_data);
 		}
 
 		/*
 		 * We don't block if not caught up, unless there is unsent data
 		 * pending in which case we'd better block until the socket is
-		 * write-ready.  This test is only needed for the case where XLogSend
+		 * write-ready.  This test is only needed for the case where send_data
 		 * loaded a subset of the available data but then pq_flush_if_writable
 		 * flushed it all --- we should immediately try to send more.
 		 */
-		if ((caughtup && !streamingDoneSending) || pq_is_send_pending())
+		if ((WalSndCaughtUp && !streamingDoneSending) || pq_is_send_pending())
 		{
 			TimestampTz timeout = 0;
 			long		sleeptime = 10000;		/* 10 s */
@@ -1434,15 +1965,17 @@ retry:
 }
 
 /*
+ * Send out the WAL in its normal physical/stored form.
+ *
  * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
  * but not yet sent to the client, and buffer it in the libpq output
  * buffer.
  *
- * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
- * *caughtup is set to false.
+ * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
+ * otherwise WalSndCaughtUp is set to false.
  */
 static void
-XLogSend(bool *caughtup)
+XLogSendPhysical(void)
 {
 	XLogRecPtr	SendRqstPtr;
 	XLogRecPtr	startptr;
@@ -1451,7 +1984,7 @@ XLogSend(bool *caughtup)
 
 	if (streamingDoneSending)
 	{
-		*caughtup = true;
+		WalSndCaughtUp = true;
 		return;
 	}
 
@@ -1568,7 +2101,7 @@ XLogSend(bool *caughtup)
 		pq_putmessage_noblock('c', NULL, 0);
 		streamingDoneSending = true;
 
-		*caughtup = true;
+		WalSndCaughtUp = true;
 
 		elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
 			 (uint32) (sendTimeLineValidUpto >> 32), (uint32) sendTimeLineValidUpto,
@@ -1580,7 +2113,7 @@ XLogSend(bool *caughtup)
 	Assert(sentPtr <= SendRqstPtr);
 	if (SendRqstPtr <= sentPtr)
 	{
-		*caughtup = true;
+		WalSndCaughtUp = true;
 		return;
 	}
 
@@ -1604,15 +2137,15 @@ XLogSend(bool *caughtup)
 	{
 		endptr = SendRqstPtr;
 		if (sendTimeLineIsHistoric)
-			*caughtup = false;
+			WalSndCaughtUp = false;
 		else
-			*caughtup = true;
+			WalSndCaughtUp = true;
 	}
 	else
 	{
 		/* round down to page boundary. */
 		endptr -= (endptr % XLOG_BLCKSZ);
-		*caughtup = false;
+		WalSndCaughtUp = false;
 	}
 
 	nbytes = endptr - startptr;
@@ -1673,6 +2206,96 @@ XLogSend(bool *caughtup)
 }
 
 /*
+ * Send out the WAL after it has been decoded into a logical format by the
+ * output plugin specified in INIT_LOGICAL_REPLICATION.
+ */
+static void
+XLogSendLogical(void)
+{
+	XLogRecord *record;
+	char	   *errm;
+
+	if (decoding_ctx == NULL)
+	{
+		decoding_ctx = AllocSetContextCreate(TopMemoryContext,
+											 "decoding context",
+											 ALLOCSET_DEFAULT_MINSIZE,
+											 ALLOCSET_DEFAULT_INITSIZE,
+											 ALLOCSET_DEFAULT_MAXSIZE);
+	}
+
+	record = XLogReadRecord(logical_decoding_ctx->reader, logical_startptr, &errm);
+	logical_startptr = InvalidXLogRecPtr;
+
+	/* xlog record was invalid */
+	if (errm != NULL)
+		elog(ERROR, "%s", errm);
+
+	if (record != NULL)
+	{
+		XLogRecordBuffer buf;
+
+		buf.origptr = logical_decoding_ctx->reader->ReadRecPtr;
+		buf.endptr = logical_decoding_ctx->reader->EndRecPtr;
+		buf.record = *record;
+		buf.record_data = XLogRecGetData(record);
+
+		old_decoding_ctx = MemoryContextSwitchTo(decoding_ctx);
+
+		DecodeRecordIntoReorderBuffer(logical_decoding_ctx, &buf);
+
+		MemoryContextSwitchTo(old_decoding_ctx);
+
+		/*
+		 * If the record we just read is at or beyond the flushed point, then
+		 * we're caught up.
+		 */
+		WalSndCaughtUp =
+			logical_decoding_ctx->reader->EndRecPtr >= GetFlushRecPtr();
+	}
+	else
+		/*
+		 * The xlogreader returned nothing and no error was reported, so we
+		 * must have caught up with the end of WAL.
+		 */
+		WalSndCaughtUp = true;
+
+	/* Update shared memory status */
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalSnd *walsnd = MyWalSnd;
+
+		SpinLockAcquire(&walsnd->mutex);
+		walsnd->sentPtr = logical_decoding_ctx->reader->ReadRecPtr;
+		SpinLockRelease(&walsnd->mutex);
+	}
+}
+
+/*
+ * The sender is caught up, so we can go away for shutdown processing
+ * to finish normally.  (This should only be called when the shutdown
+ * signal has been received from postmaster.)
+ *
+ * Note that if while doing this we determine that there's still more
+ * data to send, this function will return control to the caller.
+ */
+static void
+WalSndDone(WalSndSendData send_data)
+{
+	/* ... let's just be real sure we're caught up ... */
+	send_data();
+
+	if (WalSndCaughtUp && sentPtr == MyWalSnd->flush &&
+		!pq_is_send_pending())
+	{
+		/* Inform the standby that XLOG streaming is done */
+		EndCommand("COPY 0", DestRemote);
+		pq_flush();
+
+		proc_exit(0);
+	}
+}
+
+/*
  * Returns the latest point in WAL that has been safely flushed to disk, and
  * can be sent to the standby. This should only be called when in recovery,
  * ie. we're streaming to a cascaded standby.
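
To illustrate the walsender side of INIT_LOGICAL_REPLICATION: it blocks until
a consistent point has been reached and then answers with a single-row result
set.  The values below are invented placeholders; only the column layout is
fixed by the code above:

     replication_id | consistent_point | snapshot_name | plugin
    ----------------+------------------+---------------+----------
     myslot         | 0/190001B0       | 000003F1-1    | myplugin

The snapshot_name is an exported-snapshot identifier (built via the now
exported ExportSnapshot machinery) that a regular connection should be able to
import with SET TRANSACTION SNAPSHOT to read the initial table contents, and
consistent_point becomes the slot's confirmed_flush position, i.e. the default
starting point for START_LOGICAL_REPLICATION.
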
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index a0b741b..71d8f04 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -27,6 +27,7 @@
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
+#include "replication/logical.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -124,6 +125,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 		size = add_size(size, ProcSignalShmemSize());
 		size = add_size(size, CheckpointerShmemSize());
 		size = add_size(size, AutoVacuumShmemSize());
+		size = add_size(size, LogicalDecodingShmemSize());
 		size = add_size(size, WalSndShmemSize());
 		size = add_size(size, WalRcvShmemSize());
 		size = add_size(size, BTreeShmemSize());
@@ -230,6 +232,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 	ProcSignalShmemInit();
 	CheckpointerShmemInit();
 	AutoVacuumShmemInit();
+	LogicalDecodingShmemInit();
 	WalSndShmemInit();
 	WalRcvShmemInit();
 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index c2f86ff..11aa1f5 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -51,6 +51,9 @@
 #include "access/xact.h"
 #include "access/twophase.h"
 #include "miscadmin.h"
+#include "replication/logical.h"
+#include "replication/walsender.h"
+#include "replication/walsender_private.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
 #include "storage/spin.h"
@@ -1141,16 +1144,18 @@ TransactionIdIsActive(TransactionId xid)
  * GetOldestXmin() move backwards, with no consequences for data integrity.
  */
 TransactionId
-GetOldestXmin(bool allDbs, bool ignoreVacuum)
+GetOldestXmin(bool allDbs, bool ignoreVacuum, bool systable, bool alreadyLocked)
 {
 	ProcArrayStruct *arrayP = procArray;
 	TransactionId result;
 	int			index;
+	volatile TransactionId logical_xmin = InvalidTransactionId;
 
 	/* Cannot look for individual databases during recovery */
 	Assert(allDbs || !RecoveryInProgress());
 
-	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	if (!alreadyLocked)
+		LWLockAcquire(ProcArrayLock, LW_SHARED);
 
 	/*
 	 * We initialize the MIN() calculation with latestCompletedXid + 1. This
@@ -1197,6 +1202,10 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
 		}
 	}
 
+	/* fetch into volatile var while ProcArrayLock is held */
+	if (max_logical_slots > 0)
+		logical_xmin = LogicalDecodingCtl->xmin;
+
 	if (RecoveryInProgress())
 	{
 		/*
@@ -1205,7 +1214,8 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
 		 */
 		TransactionId kaxmin = KnownAssignedXidsGetOldestXmin();
 
-		LWLockRelease(ProcArrayLock);
+		if (!alreadyLocked)
+			LWLockRelease(ProcArrayLock);
 
 		if (TransactionIdIsNormal(kaxmin) &&
 			TransactionIdPrecedes(kaxmin, result))
@@ -1213,10 +1223,8 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
 	}
 	else
 	{
-		/*
-		 * No other information needed, so release the lock immediately.
-		 */
-		LWLockRelease(ProcArrayLock);
+		if (!alreadyLocked)
+			LWLockRelease(ProcArrayLock);
 
 		/*
 		 * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age,
@@ -1237,6 +1245,15 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
 			result = FirstNormalTransactionId;
 	}
 
+	/*
+	 * After the locks have been released and vacuum_defer_cleanup_age has
+	 * been applied, check whether we need to back up further to make logical
+	 * decoding possible.
+	 */
+	if (systable &&
+		TransactionIdIsValid(logical_xmin) &&
+		NormalTransactionIdPrecedes(logical_xmin, result))
+		result = logical_xmin;
+
 	return result;
 }
 
@@ -1290,7 +1307,9 @@ GetMaxSnapshotSubxidCount(void)
  *			older than this are known not running any more.
  *		RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
  *			running transactions, except those running LAZY VACUUM).  This is
- *			the same computation done by GetOldestXmin(true, true).
+ *			the same computation done by GetOldestXmin(true, true, ...).
+ *		RecentGlobalDataXmin: the global xmin for non-catalog tables
+ *			>= RecentGlobalXmin
  *
  * Note: this function should probably not be called with an argument that's
  * not statically allocated (see xip allocation below).
@@ -1306,6 +1325,7 @@ GetSnapshotData(Snapshot snapshot)
 	int			count = 0;
 	int			subcount = 0;
 	bool		suboverflowed = false;
+	volatile TransactionId logical_xmin = InvalidTransactionId;
 
 	Assert(snapshot != NULL);
 
@@ -1483,8 +1503,14 @@ GetSnapshotData(Snapshot snapshot)
 			suboverflowed = true;
 	}
 
+
+	/* fetch into volatile var while ProcArrayLock is held */
+	if (max_logical_slots > 0)
+		logical_xmin = LogicalDecodingCtl->xmin;
+
 	if (!TransactionIdIsValid(MyPgXact->xmin))
 		MyPgXact->xmin = TransactionXmin = xmin;
+
 	LWLockRelease(ProcArrayLock);
 
 	/*
@@ -1499,6 +1525,17 @@ GetSnapshotData(Snapshot snapshot)
 	RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
 	if (!TransactionIdIsNormal(RecentGlobalXmin))
 		RecentGlobalXmin = FirstNormalTransactionId;
+
+	/* Non-catalog tables can be vacuumed if older than this xid */
+	RecentGlobalDataXmin = RecentGlobalXmin;
+
+	/*
+	 * Peg the global xmin to the one required for logical decoding, if necessary.
+	 */
+	if (TransactionIdIsNormal(logical_xmin) &&
+		NormalTransactionIdPrecedes(logical_xmin, RecentGlobalXmin))
+		RecentGlobalXmin = logical_xmin;
+
 	RecentXmin = xmin;
 
 	snapshot->xmin = xmin;
@@ -1599,9 +1636,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid)
  * Similar to GetSnapshotData but returns more information. We include
  * all PGXACTs with an assigned TransactionId, even VACUUM processes.
  *
- * We acquire XidGenLock, but the caller is responsible for releasing it.
- * This ensures that no new XIDs enter the proc array until the caller has
- * WAL-logged this snapshot, and releases the lock.
+ * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
+ * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc
+ * array until the caller has WAL-logged this snapshot, and releases the
+ * lock. Acquiring ProcArrayLock ensures that no transactions commit until the
+ * lock is released.
  *
  * The returned data structure is statically allocated; caller should not
  * modify it, and must not assume it is valid past the next call.
@@ -1736,6 +1775,12 @@ GetRunningTransactionData(void)
 		}
 	}
 
+	/*
+	 * It's important *not* to track decoding tasks here, because snapbuild.c
+	 * uses ->oldestRunningXid to manage its xmin. If they were included here,
+	 * the initial value could never increase.
+	 */
+
 	CurrentRunningXacts->xcnt = count - subcount;
 	CurrentRunningXacts->subxcnt = subcount;
 	CurrentRunningXacts->subxid_overflow = suboverflowed;
@@ -1743,13 +1788,12 @@ GetRunningTransactionData(void)
 	CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
 	CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
 
-	/* We don't release XidGenLock here, the caller is responsible for that */
-	LWLockRelease(ProcArrayLock);
-
 	Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
 	Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
 	Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
 
+	/* We don't release the locks here, the caller is responsible for that */
+
 	return CurrentRunningXacts;
 }
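
The extended GetOldestXmin() signature can be read as follows; the calls below
are purely illustrative and the callers are hypothetical:

    /* horizon for pruning user data; the logical decoding xmin is ignored */
    TransactionId data_xmin = GetOldestXmin(true, true, false, false);

    /* horizon for catalog data; held back to the logical decoding xmin */
    TransactionId catalog_xmin = GetOldestXmin(true, true, true, false);

    /*
     * A caller that already holds ProcArrayLock in shared mode passes
     * alreadyLocked = true; the function then neither acquires nor releases
     * the lock.
     */

The first two arguments keep their old meaning (allDbs and ignoreVacuum), the
third selects whether the logical decoding xmin should be honoured, and the
fourth tells the function that ProcArrayLock is already held.
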
 
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 97da1a0..5f74c3e 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -879,8 +879,23 @@ LogStandbySnapshot(void)
 	 * record we write, because standby will open up when it sees this.
 	 */
 	running = GetRunningTransactionData();
+
+	/*
+	 * GetRunningTransactionData() acquired ProcArrayLock, we must release
+	 * it. We can do that before inserting the WAL record because
+	 * ProcArrayApplyRecoveryInfo can recheck the commit status using the
+	 * clog. If we're doing logical replication we can't do that though, so
+	 * hold the lock for a moment longer.
+	 */
+	if (wal_level < WAL_LEVEL_LOGICAL)
+		LWLockRelease(ProcArrayLock);
+
 	recptr = LogCurrentRunningXacts(running);
 
+	/* Release lock if we kept it longer ... */
+	if (wal_level >= WAL_LEVEL_LOGICAL)
+		LWLockRelease(ProcArrayLock);
+
 	/* GetRunningTransactionData() acquired XidGenLock, we must release it */
 	LWLockRelease(XidGenLock);
 
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index bfe7d78..015970a 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -512,7 +512,7 @@ RegisterSnapshotInvalidation(Oid dbId, Oid relId)
  * Only the local caches are flushed; this does not transmit the message
  * to other backends.
  */
-static void
+void
 LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
 {
 	if (msg->id >= 0)
@@ -596,7 +596,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
  *		since that tells us we've lost some shared-inval messages and hence
  *		don't know what needs to be invalidated.
  */
-static void
+void
 InvalidateSystemCaches(void)
 {
 	int			i;
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 44dd0d2..5d304ce 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1601,6 +1601,10 @@ RelationIdGetRelation(Oid relationId)
 		return rd;
 	}
 
+	/* keep system relations up to date, even during timetravel */
+	if (IsSystemRelationId(relationId))
+		SuspendDecodingSnapshots();
+
 	/*
 	 * no reldesc in the cache, so have RelationBuildDesc() build one and add
 	 * it.
@@ -1608,6 +1612,10 @@ RelationIdGetRelation(Oid relationId)
 	rd = RelationBuildDesc(relationId, true);
 	if (RelationIsValid(rd))
 		RelationIncrementReferenceCount(rd);
+
+	if (IsSystemRelationId(relationId))
+		UnSuspendDecodingSnapshots();
+
 	return rd;
 }
 
@@ -1729,6 +1737,10 @@ RelationReloadIndexInfo(Relation relation)
 		return;
 	}
 
+	/* keep system relations up to date, even during timetravel */
+	if (IsSystemRelation(relation))
+		SuspendDecodingSnapshots();
+
 	/*
 	 * Read the pg_class row
 	 *
@@ -1796,6 +1808,9 @@ RelationReloadIndexInfo(Relation relation)
 
 	/* Okay, now it's valid again */
 	relation->rd_isvalid = true;
+
+	if (IsSystemRelation(relation))
+		UnSuspendDecodingSnapshots();
 }
 
 /*
@@ -1977,6 +1992,10 @@ RelationClearRelation(Relation relation, bool rebuild)
 		bool		keep_tupdesc;
 		bool		keep_rules;
 
+		/* keep system relations up to date, even during timetravel */
+		if (IsSystemRelation(relation))
+			SuspendDecodingSnapshots();
+
 		/* Build temporary entry, but don't link it into hashtable */
 		newrel = RelationBuildDesc(save_relid, false);
 		if (newrel == NULL)
@@ -2046,6 +2065,9 @@ RelationClearRelation(Relation relation, bool rebuild)
 
 		/* And now we can throw away the temporary entry */
 		RelationDestroyRelation(newrel);
+
+		if (IsSystemRelation(relation))
+			UnSuspendDecodingSnapshots();
 	}
 }
 
@@ -3551,7 +3573,10 @@ RelationGetIndexList(Relation relation)
 					Form_pg_attribute attr;
 					/* internal column, like oid */
 					if (attno <= 0)
-						continue;
+					{
+						found = false;
+						break;
+					}
 
 					attr = relation->rd_att->attrs[attno - 1];
 					if (!attr->attnotnull)
@@ -3839,17 +3864,26 @@ RelationGetIndexPredicate(Relation relation)
  * be bms_free'd when not needed anymore.
  */
 Bitmapset *
-RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs)
+RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
 {
 	Bitmapset  *indexattrs;
-	Bitmapset  *uindexattrs;
+	Bitmapset  *uindexattrs; /* unique keys */
+	Bitmapset  *cindexattrs; /* best candidate key */
 	List	   *indexoidlist;
 	ListCell   *l;
 	MemoryContext oldcxt;
 
 	/* Quick exit if we already computed the result. */
 	if (relation->rd_indexattr != NULL)
-		return bms_copy(keyAttrs ? relation->rd_keyattr : relation->rd_indexattr);
+		switch(attrKind)
+		{
+			case INDEX_ATTR_BITMAP_CANDIDATE_KEY:
+				return bms_copy(relation->rd_ckeyattr);
+			case INDEX_ATTR_BITMAP_KEY:
+				return bms_copy(relation->rd_keyattr);
+			case INDEX_ATTR_BITMAP_ALL:
+				return bms_copy(relation->rd_indexattr);
+		}
 
 	/* Fast path if definitely no indexes */
 	if (!RelationGetForm(relation)->relhasindex)
@@ -3876,13 +3910,16 @@ RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs)
 	 */
 	indexattrs = NULL;
 	uindexattrs = NULL;
+	cindexattrs = NULL;
 	foreach(l, indexoidlist)
 	{
 		Oid			indexOid = lfirst_oid(l);
 		Relation	indexDesc;
 		IndexInfo  *indexInfo;
 		int			i;
-		bool		isKey;
+		bool		isCKey;		/* candidate or primary key */
+		bool		isKey;		/* key member */
+
 
 		indexDesc = index_open(indexOid, AccessShareLock);
 
@@ -3894,6 +3931,8 @@ RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs)
 			indexInfo->ii_Expressions == NIL &&
 			indexInfo->ii_Predicate == NIL;
 
+		isCKey = indexOid == relation->rd_primary;
+
 		/* Collect simple attribute references */
 		for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
 		{
@@ -3903,6 +3942,11 @@ RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs)
 			{
 				indexattrs = bms_add_member(indexattrs,
 							   attrnum - FirstLowInvalidHeapAttributeNumber);
+
+				if (isCKey)
+					cindexattrs = bms_add_member(cindexattrs,
+												 attrnum - FirstLowInvalidHeapAttributeNumber);
+
 				if (isKey)
 					uindexattrs = bms_add_member(uindexattrs,
 							   attrnum - FirstLowInvalidHeapAttributeNumber);
@@ -3924,10 +3968,21 @@ RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs)
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 	relation->rd_indexattr = bms_copy(indexattrs);
 	relation->rd_keyattr = bms_copy(uindexattrs);
+	relation->rd_ckeyattr = bms_copy(cindexattrs);
 	MemoryContextSwitchTo(oldcxt);
 
 	/* We return our original working copy for caller to play with */
-	return keyAttrs ? uindexattrs : indexattrs;
+	switch(attrKind)
+	{
+		case INDEX_ATTR_BITMAP_CANDIDATE_KEY:
+			return cindexattrs;
+		case INDEX_ATTR_BITMAP_KEY:
+			return uindexattrs;
+		case INDEX_ATTR_BITMAP_ALL:
+			return indexattrs;
+		default:
+			elog(ERROR, "unknown attrKind %u", attrKind);
+	}
 }
 
 /*
@@ -4902,3 +4957,49 @@ unlink_initfile(const char *initfilename)
 			elog(LOG, "could not remove cache file \"%s\": %m", initfilename);
 	}
 }
+
+bool
+RelationIsDoingTimetravelInternal(Relation relation)
+{
+	Assert(wal_level >= WAL_LEVEL_LOGICAL);
+
+	if (!RelationNeedsWAL(relation))
+		return false;
+
+	/*
+	 * XXX: Doing this test instead of using IsSystemNamespace has the
+	 * advantage of classifying a catalog relation's toast tables as
+	 * timetravel relations as well. This is safe since even an OID wraparound
+	 * will preserve this property (c.f. GetNewObjectId()).
+	 */
+	if (IsSystemRelation(relation))
+		return true;
+
+	/*
+	 * Also log relevant data if we want the table to behave as a catalog
+	 * table, although it's not a system-provided one.
+	 * XXX: we need to make sure both the relation and its toast relation have
+	 * the flag set!
+	 */
+	if (RelationIsTreatedAsCatalogTable(relation))
+		return true;
+
+	return false;
+}
+
+bool
+RelationIsLogicallyLoggedInternal(Relation relation)
+{
+	Assert(wal_level >= WAL_LEVEL_LOGICAL);
+	if (!RelationNeedsWAL(relation))
+		return false;
+	/*
+	 * XXX: In addition to the above comment, we could decide to always log
+	 * data even for real system catalogs, although the benefits of that seem
+	 * unclear.
+	 */
+	if (IsSystemRelation(relation))
+		return false;
+
+	return true;
+}
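
A rough usage sketch for the extended RelationGetIndexAttrBitmap() interface;
"rel" stands for any already-opened Relation and is purely illustrative:

    /* all columns used by any index on the relation */
    Bitmapset *allattrs  = RelationGetIndexAttrBitmap(rel, INDEX_ATTR_BITMAP_ALL);

    /* columns that are part of unique keys */
    Bitmapset *keyattrs  = RelationGetIndexAttrBitmap(rel, INDEX_ATTR_BITMAP_KEY);

    /* columns of the best candidate key (e.g. the primary key) */
    Bitmapset *ckeyattrs = RelationGetIndexAttrBitmap(rel, INDEX_ATTR_BITMAP_CANDIDATE_KEY);

    bms_free(allattrs);
    bms_free(keyattrs);
    bms_free(ckeyattrs);

The returned bitmaps are the caller's copies and may be bms_free'd when no
longer needed, as before.
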
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3107f9c..4a81018 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -57,6 +57,7 @@
 #include "postmaster/postmaster.h"
 #include "postmaster/syslogger.h"
 #include "postmaster/walwriter.h"
+#include "replication/logical.h"
 #include "replication/syncrep.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
@@ -2072,6 +2073,17 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		/* see max_connections */
+		{"max_logical_slots", PGC_POSTMASTER, REPLICATION_SENDING,
+			gettext_noop("Sets the maximum number of simultaneously defined WAL decoding slots."),
+			NULL
+		},
+		&max_logical_slots,
+		0, 0, MAX_BACKENDS /*?*/,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"wal_sender_timeout", PGC_SIGHUP, REPLICATION_SENDING,
 			gettext_noop("Sets the maximum time to wait for WAL replication."),
 			NULL,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d69a02b..b04291c 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -161,7 +161,7 @@
 
 # - Settings -
 
-#wal_level = minimal			# minimal, archive, or hot_standby
+#wal_level = minimal			# minimal, archive, logical or hot_standby
 					# (change requires restart)
 #fsync = on				# turns forced synchronization on or off
 #synchronous_commit = on		# synchronization level;
@@ -208,11 +208,18 @@
 
 # Set these on the master and on any standby that will send replication data.
 
-#max_wal_senders = 0		# max number of walsender processes
+#max_wal_senders = 0		# max number of walsender processes, including
+				# both physical and logical replication senders.
 				# (change requires restart)
 #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
 #wal_sender_timeout = 60s	# in milliseconds; 0 disables
 
+#max_logical_slots = 0		# max number of logical replication sender
+				# and receiver processes. Logical senders
+				# (but not receivers) also consume a
+				# max_wal_senders slot.
+				# (change requires restart)
+
 # - Master Server -
 
 # These settings are ignored on a standby server.
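
Putting the GUCs together, a primary that should allow logical decoding would
be configured roughly like this (the numbers are arbitrary examples):

    wal_level = logical		# log enough information for decoding
    max_wal_senders = 4		# physical and logical senders combined
    max_logical_slots = 4	# decoding slots; requires restart

plus the usual pg_hba.conf replication entries for the connecting roles.
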
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 584d70c..f63bafa 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -69,7 +69,7 @@
  */
 static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC};
 static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC};
-static SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC};
+SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC};
 
 /* Pointers to valid snapshots */
 static Snapshot CurrentSnapshot = NULL;
@@ -86,13 +86,14 @@ static bool CatalogSnapshotStale = true;
  * for the convenience of TransactionIdIsInProgress: even in bootstrap
  * mode, we don't want it to say that BootstrapTransactionId is in progress.
  *
- * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no
+ * RecentGlobal(Data)?Xmin is initialized to InvalidTransactionId, to ensure that no
  * one tries to use a stale value.	Readers should ensure that it has been set
  * to something else before using it.
  */
 TransactionId TransactionXmin = FirstNormalTransactionId;
 TransactionId RecentXmin = FirstNormalTransactionId;
 TransactionId RecentGlobalXmin = InvalidTransactionId;
+TransactionId RecentGlobalDataXmin = InvalidTransactionId;
 
 /*
  * Elements of the active snapshot stack.
@@ -796,7 +797,7 @@ AtEOXact_Snapshot(bool isCommit)
  *		Returns the token (the file name) that can be used to import this
  *		snapshot.
  */
-static char *
+char *
 ExportSnapshot(Snapshot snapshot)
 {
 	TransactionId topXid;
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index ed66c49..28ce805 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -62,6 +62,8 @@
 #include "access/xact.h"
 #include "storage/bufmgr.h"
 #include "storage/procarray.h"
+#include "utils/builtins.h"
+#include "utils/combocid.h"
 #include "utils/tqual.h"
 
 
@@ -70,9 +72,17 @@ SnapshotData SnapshotSelfData = {HeapTupleSatisfiesSelf};
 SnapshotData SnapshotAnyData = {HeapTupleSatisfiesAny};
 SnapshotData SnapshotToastData = {HeapTupleSatisfiesToast};
 
+static Snapshot TimetravelSnapshot;
+/* (table, ctid) => (cmin, cmax) mapping during timetravel */
+static HTAB *tuplecid_data = NULL;
+static int timetravel_suspended = 0;
+
+
 /* local functions */
 static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
-
+static bool FailsSatisfies(HeapTuple htup, Snapshot snapshot, Buffer buffer);
+static bool RedirectSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
+								 Buffer buffer);
 
 /*
  * SetHintBits()
@@ -1490,3 +1500,261 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
 	 */
 	return true;
 }
+
+/*
+ * check whether the transaction id 'xid' is in the pre-sorted array 'xip'.
+ */
+static bool
+TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
+{
+	return bsearch(&xid, xip, num,
+	               sizeof(TransactionId), xidComparator) != NULL;
+}
+
+/*
+ * See the comments for HeapTupleSatisfiesMVCC for the semantics this function
+ * obeys.
+ *
+ * Only usable on tuples from catalog tables!
+ *
+ * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support
+ * reading catalog pages which couldn't have been created in an older version.
+ *
+ * We don't set any hint bits in here as it seems unlikely to be beneficial:
+ * those should already be set by normal access, and it seems too dangerous to
+ * do so because the semantics of setting them during timetravel are more
+ * complicated than when dealing "only" with the present.
+ */
+bool
+HeapTupleSatisfiesMVCCDuringDecoding(HeapTuple htup, Snapshot snapshot,
+                                     Buffer buffer)
+{
+	HeapTupleHeader tuple = htup->t_data;
+	TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+	TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple);
+
+	Assert(ItemPointerIsValid(&htup->t_self));
+	Assert(htup->t_tableOid != InvalidOid);
+
+	/* inserting transaction aborted */
+	if (tuple->t_infomask & HEAP_XMIN_INVALID)
+	{
+		Assert(!TransactionIdDidCommit(xmin));
+		return false;
+	}
+	/* check if it's one of our txids, the toplevel xid is also in there */
+	else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt))
+	{
+		CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple);
+		CommandId cmax = InvalidCommandId;
+
+		/*
+		 * If another transaction deleted this tuple or if cmin/cmax is stored
+		 * in a combocid we need to look up the actual values externally. We
+		 * need to do so in the deleted case because the deletion will have
+		 * overwritten the cmin value when setting cmax (cf. combocid.c).
+		 */
+		if ((!(tuple->t_infomask & HEAP_XMAX_INVALID) &&
+			 !TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt)) ||
+			tuple->t_infomask & HEAP_COMBOCID
+			)
+		{
+			bool resolved;
+
+			resolved = ResolveCminCmaxDuringDecoding(tuplecid_data, htup,
+													 buffer, &cmin, &cmax);
+
+			if (!resolved)
+				elog(ERROR, "could not resolve cmin/cmax of catalog tuple");
+		}
+
+		Assert(cmin != InvalidCommandId);
+
+		if (cmin >= snapshot->curcid)
+			return false;	/* inserted after scan started */
+	}
+	/* committed before our xmin horizon. Do a normal visibility check. */
+	else if (TransactionIdPrecedes(xmin, snapshot->xmin))
+	{
+		Assert(!(tuple->t_infomask & HEAP_XMIN_COMMITTED &&
+				 !TransactionIdDidCommit(xmin)));
+
+		/* check for hint bit first, consult clog afterwards */
+		if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED) &&
+			!TransactionIdDidCommit(xmin))
+			return false;
+	}
+	/* beyond our xmax horizon, i.e. invisible */
+	else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax))
+	{
+		return false;
+	}
+	/* check if it's a committed transaction in [xmin, xmax) */
+	else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt))
+	{
+	}
+	/*
+	 * none of the above: the inserting xid is in [xmin, xmax) but hasn't
+	 * committed, i.e. the tuple is invisible.
+	 */
+	else
+	{
+		return false;
+	}
+
+	/* at this point we know xmin is visible, go on to check xmax */
+
+	/* why should those be in catalog tables? */
+	Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+
+	/* xid invalid or aborted */
+	if (tuple->t_infomask & HEAP_XMAX_INVALID)
+		return true;
+	/* locked tuples are always visible */
+	else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+		return true;
+	/* check if it's one of our txids, the toplevel xid is also in there */
+	else if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt))
+	{
+		CommandId cmin;
+		CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple);
+
+		/* Lookup actual cmin/cmax values */
+		if (tuple->t_infomask & HEAP_COMBOCID)
+		{
+			bool resolved;
+
+			resolved = ResolveCminCmaxDuringDecoding(tuplecid_data, htup,
+													 buffer, &cmin, &cmax);
+
+			if (!resolved)
+				elog(ERROR, "could not resolve combocid to cmax");
+		}
+
+		Assert(cmax != InvalidCommandId);
+
+		if (cmax >= snapshot->curcid)
+			return true;	/* deleted after scan started */
+		else
+			return false;	/* deleted before scan started */
+	}
+	/* below xmin horizon, normal transaction state is valid */
+	else if (TransactionIdPrecedes(xmax, snapshot->xmin))
+	{
+		Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED &&
+				 !TransactionIdDidCommit(xmax)));
+
+		/* check hint bit first */
+		if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
+			return false;
+
+		/* check clog */
+		return !TransactionIdDidCommit(xmax);
+	}
+	/* above xmax horizon, we cannot possibly see the deleting transaction */
+	else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax))
+		return true;
+	/* xmax is between [xmin, xmax), check known committed array */
+	else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt))
+		return false;
+	/* xmax is between [xmin, xmax), but known not to have committed yet */
+	else
+		return true;
+}
+
+/*
+ * Set up a snapshot that replaces the normal catalog snapshots, allowing
+ * catalog access to behave just like it did at a certain point in the past.
+ *
+ * Needed for after-the-fact WAL decoding.
+ */
+void
+SetupDecodingSnapshots(Snapshot timetravel_snapshot, HTAB *tuplecids)
+{
+	/* prevent recursively setting up decoding snapshots */
+	Assert(CatalogSnapshotData.satisfies != RedirectSatisfiesMVCC);
+
+	CatalogSnapshotData.satisfies = RedirectSatisfiesMVCC;
+	/* make sure normal snapshots aren't used */
+	SnapshotSelfData.satisfies = FailsSatisfies;
+	SnapshotAnyData.satisfies = FailsSatisfies;
+	SnapshotToastData.satisfies = FailsSatisfies;
+
+	/* setup the timetravel snapshot */
+	TimetravelSnapshot = timetravel_snapshot;
+
+	/* setup (cmin, cmax) lookup hash */
+	tuplecid_data = tuplecids;
+
+	timetravel_suspended = 0;
+}
+
+
+/*
+ * Make catalog snapshots behave normally again.
+ */
+void
+RevertFromDecodingSnapshots(void)
+{
+	Assert(timetravel_suspended == 0);
+
+	TimetravelSnapshot = NULL;
+	tuplecid_data = NULL;
+
+	/* restore normal, non-timetravel, snapshot behavior */
+	CatalogSnapshotData.satisfies = HeapTupleSatisfiesMVCC;
+	SnapshotSelfData.satisfies = HeapTupleSatisfiesSelf;
+	SnapshotAnyData.satisfies = HeapTupleSatisfiesAny;
+	SnapshotToastData.satisfies = HeapTupleSatisfiesToast;
+	timetravel_suspended = 0;
+}
+
+/*
+ * Disable catalog snapshot timetravel and fall back to normal access, while
+ * keeping re-enabling cheap. This is useful for accessing catalog entries
+ * which must stay up to date, like the pg_class entries of system relations.
+ *
+ * Can be called several times in a nested fashion since several of its
+ * callers suspend timetravel access at several code levels.
+ */
+void
+SuspendDecodingSnapshots(void)
+{
+	timetravel_suspended++;
+}
+
+/*
+ * Re-enable timetravel after a previous SuspendDecodingSnapshots() call.
+ */
+void
+UnSuspendDecodingSnapshots(void)
+{
+	Assert(timetravel_suspended > 0);
+	timetravel_suspended--;
+}
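For illustration only, a hypothetical caller (not part of this patch) that needs an
up-to-date catalog lookup while a decoding snapshot is installed could bracket the
access like this; the calls only adjust a counter, so they nest safely across several
code levels, and whether syscache lookups go through the catalog snapshot here is an
assumption of the sketch:

	/* hypothetical caller; 'relid' is assumed to be a valid pg_class OID */
	HeapTuple	tup;

	SuspendDecodingSnapshots();

	/* this lookup should see the present catalog state, not the timetravel one */
	tup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
	if (HeapTupleIsValid(tup))
		ReleaseSysCache(tup);

	UnSuspendDecodingSnapshots();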
+
+/*
+ * Error out if a normal snapshot is used. That is neither legal nor expected
+ * during timetravel, so this is just extra assurance.
+ */
+static bool
+FailsSatisfies(HeapTuple htup, Snapshot snapshot, Buffer buffer)
+{
+	elog(ERROR, "normal snapshots cannot be used during timetravel access");
+	return false;
+}
+
+
+/*
+ * Call the replacement SatisfiesMVCC routine with the required snapshot data.
+ */
+static bool
+RedirectSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, Buffer buffer)
+{
+	Assert(TimetravelSnapshot != NULL);
+	if (timetravel_suspended > 0)
+		return HeapTupleSatisfiesMVCC(htup, snapshot, buffer);
+	return HeapTupleSatisfiesMVCCDuringDecoding(htup, TimetravelSnapshot,
+	                                            buffer);
+}
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index f66f530..a887035 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -193,7 +193,9 @@ const char *subdirs[] = {
 	"base/1",
 	"pg_tblspc",
 	"pg_stat",
-	"pg_stat_tmp"
+	"pg_stat_tmp",
+	"pg_llog",
+	"pg_llog/snapshots"
 };
 
 
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index fde483a..8c6cf24 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -77,6 +77,8 @@ wal_level_str(WalLevel wal_level)
 			return "archive";
 		case WAL_LEVEL_HOT_STANDBY:
 			return "hot_standby";
+		case WAL_LEVEL_LOGICAL:
+			return "logical";
 	}
 	return _("unrecognized wal_level");
 }
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 4381778..42f3e6b 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -55,6 +55,18 @@
 #define XLOG_HEAP2_VISIBLE		0x40
 #define XLOG_HEAP2_MULTI_INSERT 0x50
 #define XLOG_HEAP2_LOCK_UPDATED 0x60
+#define XLOG_HEAP2_NEW_CID		0x70
+
+/*
+ * xl_heap_* ->flag values
+ */
+/* PD_ALL_VISIBLE was cleared */
+#define XLOG_HEAP_ALL_VISIBLE_CLEARED		(1<<0)
+/* PD_ALL_VISIBLE was cleared in the 2nd page */
+#define XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED	(1<<1)
+#define XLOG_HEAP_CONTAINS_OLD_TUPLE		(1<<2)
+#define XLOG_HEAP_CONTAINS_OLD_KEY			(1<<3)
+#define XLOG_HEAP_CONTAINS_NEW_TUPLE		(1<<4)
 
 /*
  * All what we need to find changed tuple
@@ -78,10 +90,10 @@ typedef struct xl_heap_delete
 	xl_heaptid	target;			/* deleted tuple id */
 	TransactionId xmax;			/* xmax of the deleted tuple */
 	uint8		infobits_set;	/* infomask bits */
-	bool		all_visible_cleared;	/* PD_ALL_VISIBLE was cleared */
+	uint8		flags;
 } xl_heap_delete;
 
-#define SizeOfHeapDelete	(offsetof(xl_heap_delete, all_visible_cleared) + sizeof(bool))
+#define SizeOfHeapDelete	(offsetof(xl_heap_delete, flags) + sizeof(uint8))
 
 /*
  * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted
@@ -100,15 +112,23 @@ typedef struct xl_heap_header
 
 #define SizeOfHeapHeader	(offsetof(xl_heap_header, t_hoff) + sizeof(uint8))
 
+typedef struct xl_heap_header_len
+{
+	uint16      t_len;
+	xl_heap_header header;
+} xl_heap_header_len;
+
+#define SizeOfHeapHeaderLen	(offsetof(xl_heap_header_len, header) + SizeOfHeapHeader)
+
 /* This is what we need to know about insert */
 typedef struct xl_heap_insert
 {
 	xl_heaptid	target;			/* inserted tuple id */
-	bool		all_visible_cleared;	/* PD_ALL_VISIBLE was cleared */
+	uint8		flags;
 	/* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */
 } xl_heap_insert;
 
-#define SizeOfHeapInsert	(offsetof(xl_heap_insert, all_visible_cleared) + sizeof(bool))
+#define SizeOfHeapInsert	(offsetof(xl_heap_insert, flags) + sizeof(uint8))
 
 /*
  * This is what we need to know about a multi-insert. The record consists of
@@ -120,7 +140,7 @@ typedef struct xl_heap_multi_insert
 {
 	RelFileNode node;
 	BlockNumber blkno;
-	bool		all_visible_cleared;
+	uint8		flags;
 	uint16		ntuples;
 	OffsetNumber offsets[1];
 
@@ -147,13 +167,12 @@ typedef struct xl_heap_update
 	TransactionId old_xmax;		/* xmax of the old tuple */
 	TransactionId new_xmax;		/* xmax of the new tuple */
 	ItemPointerData newtid;		/* new inserted tuple id */
-	uint8		old_infobits_set;		/* infomask bits to set on old tuple */
-	bool		all_visible_cleared;	/* PD_ALL_VISIBLE was cleared */
-	bool		new_all_visible_cleared;		/* same for the page of newtid */
+	uint8		old_infobits_set;	/* infomask bits to set on old tuple */
+	uint8		flags;
 	/* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */
 } xl_heap_update;
 
-#define SizeOfHeapUpdate	(offsetof(xl_heap_update, new_all_visible_cleared) + sizeof(bool))
+#define SizeOfHeapUpdate	(offsetof(xl_heap_update, flags) + sizeof(uint8))
 
 /*
  * This is what we need to know about vacuum page cleanup/redirect
@@ -261,6 +280,28 @@ typedef struct xl_heap_visible
 
 #define SizeOfHeapVisible (offsetof(xl_heap_visible, cutoff_xid) + sizeof(TransactionId))
 
+typedef struct xl_heap_new_cid
+{
+	/*
+	 * store toplevel xid so we don't have to merge cids from different
+	 * transactions
+	 */
+	TransactionId top_xid;
+	CommandId cmin;
+	CommandId cmax;
+	/*
+	 * we don't really need the combocid, but struct padding makes it free and
+	 * it's useful for debugging.
+	 */
+	CommandId combocid;
+	/*
+	 * Store the relfilenode/ctid pair to facilitate lookups.
+	 */
+	xl_heaptid target;
+} xl_heap_new_cid;
+
+#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target) + SizeOfHeapTid)
+
 extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
 									   TransactionId *latestRemovedXid);
 
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 23a41fd..8452ec5 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -63,6 +63,11 @@
 	(AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
 	(int32) ((id1) - (id2)) < 0)
 
+/* compare two XIDs already known to be normal; this is a macro for speed */
+#define NormalTransactionIdFollows(id1, id2) \
+	(AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
+	(int32) ((id1) - (id2)) > 0)
+
 /* ----------
  *		Object ID (OID) zero is InvalidOid.
  *
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 835f6ac..96502ce 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -215,6 +215,7 @@ extern TransactionId GetCurrentTransactionId(void);
 extern TransactionId GetCurrentTransactionIdIfAny(void);
 extern TransactionId GetStableLatestTransactionId(void);
 extern SubTransactionId GetCurrentSubTransactionId(void);
+extern void MarkCurrentTransactionIdLoggedIfAny(void);
 extern bool SubTransactionIsActive(SubTransactionId subxid);
 extern CommandId GetCurrentCommandId(bool used);
 extern TimestampTz GetCurrentTransactionStartTimestamp(void);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 002862c..7415a26 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -197,7 +197,8 @@ typedef enum WalLevel
 {
 	WAL_LEVEL_MINIMAL = 0,
 	WAL_LEVEL_ARCHIVE,
-	WAL_LEVEL_HOT_STANDBY
+	WAL_LEVEL_HOT_STANDBY,
+	WAL_LEVEL_LOGICAL
 } WalLevel;
 extern int	wal_level;
 
@@ -210,9 +211,12 @@ extern int	wal_level;
  */
 #define XLogIsNeeded() (wal_level >= WAL_LEVEL_ARCHIVE)
 
-/* Do we need to WAL-log information required only for Hot Standby? */
+/* Do we need to WAL-log information required only for Hot Standby and logical replication? */
 #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_HOT_STANDBY)
 
+/* Do we need to WAL-log information required only for logical replication? */
+#define XLogLogicalInfoActive() (wal_level >= WAL_LEVEL_LOGICAL)
+
 #ifdef WAL_DEBUG
 extern bool XLOG_DEBUG;
 #endif
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index 3829ce2..fdc8cc2 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -19,6 +19,7 @@
 #ifndef XLOGREADER_H
 #define XLOGREADER_H
 
+#include "access/xlog.h"
 #include "access/xlog_internal.h"
 
 typedef struct XLogReaderState XLogReaderState;
@@ -108,10 +109,20 @@ struct XLogReaderState
 	char	   *errormsg_buf;
 };
 
-/* Get a new XLogReader */
+
 extern XLogReaderState *XLogReaderAllocate(XLogPageReadCB pagereadfunc,
 				   void *private_data);
 
+
+typedef struct XLogRecordBuffer
+{
+	XLogRecPtr origptr;
+	XLogRecPtr endptr;
+	XLogRecord record;
+	char *record_data;
+} XLogRecordBuffer;
+
+
 /* Free an XLogReader */
 extern void XLogReaderFree(XLogReaderState *state);
 
diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h
index 44b6f38..a96ed69 100644
--- a/src/include/catalog/catalog.h
+++ b/src/include/catalog/catalog.h
@@ -23,6 +23,7 @@ extern ForkNumber forkname_to_number(char *forkName);
 extern char *GetDatabasePath(Oid dbNode, Oid spcNode);
 
 
+extern bool IsSystemRelationId(Oid relid);
 extern bool IsSystemRelation(Relation relation);
 extern bool IsToastRelation(Relation relation);
 
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index f03dd0b..cf9c143 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2621,6 +2621,8 @@ DATA(insert OID = 2022 (  pg_stat_get_activity			PGNSP PGUID 12 1 100 0 0 f f f
 DESCR("statistics: information about currently active backends");
 DATA(insert OID = 3099 (  pg_stat_get_wal_senders	PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{23,25,25,25,25,25,23,25}" "{o,o,o,o,o,o,o,o}" "{pid,state,sent_location,write_location,flush_location,replay_location,sync_priority,sync_state}" _null_ pg_stat_get_wal_senders _null_ _null_ _null_ ));
 DESCR("statistics: information about currently active replication");
+DATA(insert OID = 3457 (  pg_stat_get_logical_decoding_slots	PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{25,25,26,16,28,25}" "{o,o,o,o,o,o}" "{slot_name,plugin,database,active,xmin,restart_decoding_lsn}" _null_ pg_stat_get_logical_decoding_slots _null_ _null_ _null_ ));
+DESCR("statistics: information about logical replication slots currently in use");
 DATA(insert OID = 2026 (  pg_backend_pid				PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 23 "" _null_ _null_ _null_ _null_ pg_backend_pid _null_ _null_ _null_ ));
 DESCR("statistics: current backend PID");
 DATA(insert OID = 1937 (  pg_stat_get_backend_pid		PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 23 "23" _null_ _null_ _null_ _null_ pg_stat_get_backend_pid _null_ _null_ _null_ ));
@@ -4725,6 +4727,10 @@ DESCR("SP-GiST support for quad tree over range");
 DATA(insert OID = 3473 (  spg_range_quad_leaf_consistent	PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2281 2281" _null_ _null_ _null_ _null_  spg_range_quad_leaf_consistent _null_ _null_ _null_ ));
 DESCR("SP-GiST support for quad tree over range");
 
+DATA(insert OID = 3779 (  init_logical_replication PGNSP PGUID 12 1 0 0 0 f f f f f f v 2 0 2249 "19 19" "{19,19,25,25}" "{i,i,o,o}" "{slotname,plugin,slotname,xlog_position}" _null_ init_logical_replication _null_ _null_ _null_ ));
+DESCR("set up a logical replication slot");
+DATA(insert OID = 3780 (  stop_logical_replication PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 23 "19" _null_ _null_ _null_ _null_ stop_logical_replication _null_ _null_ _null_ ));
+DESCR("stop logical replication");
 
 /* event triggers */
 DATA(insert OID = 3566 (  pg_event_trigger_dropped_objects		PGNSP PGUID 12 10 100 0 0 f f f f t t s 0 0 2249 "" "{26,26,23,25,25,25,25}" "{o,o,o,o,o,o,o}" "{classid, objid, objsubid, object_type, schema_name, object_name, object_identity}" _null_ pg_event_trigger_dropped_objects _null_ _null_ _null_ ));
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 08bec25..66b8263 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -156,7 +156,7 @@ extern void vac_update_relstats(Relation relation,
 					TransactionId frozenxid,
 					MultiXactId minmulti);
 extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age,
-					  bool sharedRel,
+					  bool sharedRel, bool catalogRel,
 					  TransactionId *oldestXmin,
 					  TransactionId *freezeLimit,
 					  TransactionId *freezeTableLimit,
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 78368c6..360f98c 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -409,6 +409,9 @@ typedef enum NodeTag
 	T_IdentifySystemCmd,
 	T_BaseBackupCmd,
 	T_StartReplicationCmd,
+	T_InitLogicalReplicationCmd,
+	T_StartLogicalReplicationCmd,
+	T_FreeLogicalReplicationCmd,
 	T_TimeLineHistoryCmd,
 
 	/*
diff --git a/src/include/nodes/replnodes.h b/src/include/nodes/replnodes.h
index 85b4544..3da8d40 100644
--- a/src/include/nodes/replnodes.h
+++ b/src/include/nodes/replnodes.h
@@ -52,6 +52,41 @@ typedef struct StartReplicationCmd
 
 
 /* ----------------------
+ *		INIT_LOGICAL_REPLICATION command
+ * ----------------------
+ */
+typedef struct InitLogicalReplicationCmd
+{
+	NodeTag		type;
+	char       *name;
+	char       *plugin;
+} InitLogicalReplicationCmd;
+
+
+/* ----------------------
+ *		START_LOGICAL_REPLICATION command
+ * ----------------------
+ */
+typedef struct StartLogicalReplicationCmd
+{
+	NodeTag		type;
+	char       *name;
+	XLogRecPtr	startpoint;
+	List       *options;
+} StartLogicalReplicationCmd;
+
+/* ----------------------
+ *		FREE_LOGICAL_REPLICATION command
+ * ----------------------
+ */
+typedef struct FreeLogicalReplicationCmd
+{
+	NodeTag		type;
+	char       *name;
+} FreeLogicalReplicationCmd;
+
+
+/* ----------------------
  *		TIMELINE_HISTORY command
  * ----------------------
  */
diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h
new file mode 100644
index 0000000..dd3f2ca
--- /dev/null
+++ b/src/include/replication/decode.h
@@ -0,0 +1,20 @@
+/*-------------------------------------------------------------------------
+ * decode.h
+ *	   PostgreSQL WAL to logical transformation
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DECODE_H
+#define DECODE_H
+
+#include "access/xlogreader.h"
+#include "replication/reorderbuffer.h"
+#include "replication/logical.h"
+
+void DecodeRecordIntoReorderBuffer(LogicalDecodingContext *ctx,
+							  XLogRecordBuffer *buf);
+
+#endif
diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h
new file mode 100644
index 0000000..971180b
--- /dev/null
+++ b/src/include/replication/logical.h
@@ -0,0 +1,198 @@
+/*-------------------------------------------------------------------------
+ * logical.h
+ *	   PostgreSQL WAL to logical transformation
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOGICAL_H
+#define LOGICAL_H
+
+#include "access/xlog.h"
+#include "access/xlogreader.h"
+#include "replication/output_plugin.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+
+/*
+ * Shared memory state of a single logical decoding slot
+ */
+typedef struct LogicalDecodingSlot
+{
+	/* lock, on same cacheline as effective_xmin */
+	slock_t		mutex;
+
+	/* on-disk xmin, updated first */
+	TransactionId xmin;
+
+	/* in-memory xmin, updated after syncing to disk */
+	TransactionId effective_xmin;
+
+	/* is this slot defined */
+	bool		in_use;
+
+	/* is somebody streaming out changes for this slot */
+	bool		active;
+
+	/* have we been aborted while ->active */
+	bool		aborted;
+
+	/* ----
+	 * If we shut down, crash, or whatever: where do we have to restart
+	 * decoding from to
+	 * a) find a valid & ready snapshot
+	 * b) reassemble the complete content of all in-progress xacts
+	 * ----
+	 */
+	XLogRecPtr	restart_decoding;
+
+	/*
+	 * Last location up to which we know the client has confirmed safe receipt
+	 * of data. No earlier data can be decoded after a restart/crash.
+	 */
+	XLogRecPtr	confirmed_flush;
+
+	/* ----
+	 * When the client has confirmed flushes >= candidate_lsn we can
+	 * a) advance the pegged xmin to candidate_xmin
+	 * b) advance restart_decoding to candidate_restart_decoding (less WAL to keep)
+	 * ----
+	 */
+	XLogRecPtr	candidate_lsn;
+	TransactionId candidate_xmin;
+	XLogRecPtr	candidate_restart_decoding;
+
+	/* database the slot is active on */
+	Oid			database;
+
+	/* slot identifier */
+	NameData	name;
+
+	/* plugin name */
+	NameData	plugin;
+} LogicalDecodingSlot;
+
+/*
+ * Shared memory control area for all of logical decoding
+ */
+typedef struct LogicalDecodingCtlData
+{
+	/*
+	 * Xmin across all logical slots.
+	 *
+	 * Protected by ProcArrayLock.
+	 */
+	TransactionId xmin;
+
+	LogicalDecodingSlot logical_slots[FLEXIBLE_ARRAY_MEMBER];
+} LogicalDecodingCtlData;
+
+/*
+ * Pointers to shared memory
+ */
+extern LogicalDecodingCtlData *LogicalDecodingCtl;
+extern LogicalDecodingSlot *MyLogicalDecodingSlot;
+
+struct LogicalDecodingContext;
+
+typedef void (*LogicalOutputPluginWriterWrite) (
+										   struct LogicalDecodingContext *lr,
+															XLogRecPtr Ptr,
+															TransactionId xid
+);
+
+typedef LogicalOutputPluginWriterWrite LogicalOutputPluginWriterPrepareWrite;
+
+/*
+ * Output plugin callbacks
+ */
+typedef struct OutputPluginCallbacks
+{
+	LogicalDecodeInitCB init_cb;
+	LogicalDecodeBeginCB begin_cb;
+	LogicalDecodeChangeCB change_cb;
+	LogicalDecodeCommitCB commit_cb;
+	LogicalDecodeCleanupCB cleanup_cb;
+} OutputPluginCallbacks;
+
+typedef struct LogicalDecodingContext
+{
+	struct XLogReaderState *reader;
+	struct LogicalDecodingSlot *slot;
+	struct ReorderBuffer *reorder;
+	struct SnapBuild *snapshot_builder;
+
+	struct OutputPluginCallbacks callbacks;
+
+	bool		stop_after_consistent;
+
+	/*
+	 * User specified options
+	 */
+	List	   *output_plugin_options;
+
+	/*
+	 * User-provided callbacks for writing/streaming out data.
+	 */
+	LogicalOutputPluginWriterPrepareWrite prepare_write;
+	LogicalOutputPluginWriterWrite write;
+
+	/*
+	 * Output buffer.
+	 */
+	StringInfo	out;
+
+	/*
+	 * Private data pointer for the creator of the logical decoding context.
+	 */
+	void	   *owner_private;
+
+	/*
+	 * Private data pointer of the output plugin.
+	 */
+	void	   *output_plugin_private;
+
+	/*
+	 * Private data pointer for the data writer.
+	 */
+	void	   *output_writer_private;
+} LogicalDecodingContext;
+
+/* GUCs */
+extern PGDLLIMPORT int max_logical_slots;
+
+extern Size LogicalDecodingShmemSize(void);
+extern void LogicalDecodingShmemInit(void);
+
+extern void LogicalDecodingAcquireFreeSlot(const char *name, const char *plugin);
+extern void LogicalDecodingReleaseSlot(void);
+extern void LogicalDecodingReAcquireSlot(const char *name);
+extern void LogicalDecodingFreeSlot(const char *name);
+
+extern void ComputeLogicalXmin(void);
+
+/* change logical xmin */
+extern void IncreaseLogicalXminForSlot(XLogRecPtr lsn, TransactionId xmin);
+
+/* change recovery restart location */
+extern void IncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn);
+
+extern void LogicalConfirmReceivedLocation(XLogRecPtr lsn);
+
+extern void CheckLogicalReplicationRequirements(void);
+
+extern void StartupLogicalReplication(XLogRecPtr checkPointRedo);
+
+extern LogicalDecodingContext *CreateLogicalDecodingContext(
+							 LogicalDecodingSlot *slot,
+							 bool is_init,
+							 XLogRecPtr	start_lsn,
+							 List *output_plugin_options,
+							 XLogPageReadCB read_page,
+						 LogicalOutputPluginWriterPrepareWrite prepare_write,
+							 LogicalOutputPluginWriterWrite do_write);
+extern bool LogicalDecodingContextReady(LogicalDecodingContext *ctx);
+extern void FreeLogicalDecodingContext(LogicalDecodingContext *ctx);
+
+#endif
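As a rough sketch of how a caller might drive this API (the slot name, the
my_read_page/my_prepare_write/my_write callbacks, the start_lsn handling, and the
assumption that LogicalDecodingReAcquireSlot attaches to an existing slot are all
placeholders or assumptions, not definitions from this patch):

	LogicalDecodingContext *ctx;
	XLogRecPtr	start_lsn = InvalidXLogRecPtr;	/* placeholder start position */

	CheckLogicalReplicationRequirements();

	/* attach to an already-initialized slot and build the decoding state */
	LogicalDecodingReAcquireSlot("myslot");
	ctx = CreateLogicalDecodingContext(MyLogicalDecodingSlot,
									   false,	/* is_init: slot already set up */
									   start_lsn,
									   NIL,		/* output plugin options */
									   my_read_page,
									   my_prepare_write,
									   my_write);

	/* read WAL through ctx->reader and feed the records to the decoder ... */

	FreeLogicalDecodingContext(ctx);
	LogicalDecodingReleaseSlot();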
diff --git a/src/include/replication/logicalfuncs.h b/src/include/replication/logicalfuncs.h
new file mode 100644
index 0000000..d6fd19c
--- /dev/null
+++ b/src/include/replication/logicalfuncs.h
@@ -0,0 +1,21 @@
+/*-------------------------------------------------------------------------
+ * logicalfuncs.h
+ *	   PostgreSQL WAL to logical transformation support functions
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOGICALFUNCS_H
+#define LOGICALFUNCS_H
+
+#include "replication/logical.h"
+
+extern int logical_read_local_xlog_page(XLogReaderState *state,
+							 XLogRecPtr targetPagePtr,
+							 int reqLen, XLogRecPtr targetRecPtr,
+							 char *cur_page, TimeLineID *pageTLI);
+
+extern Datum pg_stat_get_logical_decoding_slots(PG_FUNCTION_ARGS);
+
+#endif
diff --git a/src/include/replication/output_plugin.h b/src/include/replication/output_plugin.h
new file mode 100644
index 0000000..a9fcc2d
--- /dev/null
+++ b/src/include/replication/output_plugin.h
@@ -0,0 +1,70 @@
+/*-------------------------------------------------------------------------
+ * output_plugin.h
+ *	   PostgreSQL Logical Decode Plugin Interface
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef OUTPUT_PLUGIN_H
+#define OUTPUT_PLUGIN_H
+
+#include "replication/reorderbuffer.h"
+
+struct LogicalDecodingContext;
+
+/*
+ * Callback that gets called in a user-defined plugin.
+ * ctx->output_plugin_private can be set to some private data.
+ *
+ * "is_init" will be set to "true" if the decoding slot just got defined. When
+ * the same slot is used from there on, it will be "false".
+ *
+ * Gets looked up via the library symbol pg_decode_init.
+ */
+typedef void (*LogicalDecodeInitCB) (
+										  struct LogicalDecodingContext *ctx,
+												 bool is_init
+);
+
+/*
+ * Callback called for every BEGIN of a successful transaction.
+ *
+ * Gets looked up via the library symbol pg_decode_begin_txn.
+ */
+typedef void (*LogicalDecodeBeginCB) (
+											 struct LogicalDecodingContext *,
+												  ReorderBufferTXN *txn);
+
+/*
+ * Callback for every individual change in a successful transaction.
+ *
+ * Gets looked up via the library symbol pg_decode_change.
+ */
+typedef void (*LogicalDecodeChangeCB) (
+											 struct LogicalDecodingContext *,
+												   ReorderBufferTXN *txn,
+												   Relation relation,
+												   ReorderBufferChange *change
+);
+
+/*
+ * Called for every COMMIT of a successful transaction.
+ *
+ * Gets looked up via the library symbol pg_decode_commit_txn.
+ */
+typedef void (*LogicalDecodeCommitCB) (
+											 struct LogicalDecodingContext *,
+												   ReorderBufferTXN *txn,
+												   XLogRecPtr commit_lsn);
+
+/*
+ * Called to cleanup the state of an output plugin.
+ *
+ * Gets looked up via the library symbol pg_decode_cleanup.
+ */
+typedef void (*LogicalDecodeCleanupCB) (
+											  struct LogicalDecodingContext *
+);
+
+#endif   /* OUTPUT_PLUGIN_H */
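To make the callback contract concrete, a minimal output plugin could look roughly
like the sketch below. The symbol names are the ones documented above; everything
else (the text format, the use of appendStringInfo, the empty init/cleanup bodies)
is illustrative only, not a definitive plugin implementation:

	#include "postgres.h"

	#include "fmgr.h"
	#include "lib/stringinfo.h"
	#include "replication/logical.h"
	#include "replication/output_plugin.h"
	#include "utils/rel.h"

	PG_MODULE_MAGIC;

	extern void pg_decode_init(LogicalDecodingContext *ctx, bool is_init);
	extern void pg_decode_begin_txn(LogicalDecodingContext *ctx,
									ReorderBufferTXN *txn);
	extern void pg_decode_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
								 Relation relation, ReorderBufferChange *change);
	extern void pg_decode_commit_txn(LogicalDecodingContext *ctx,
									 ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
	extern void pg_decode_cleanup(LogicalDecodingContext *ctx);

	void
	pg_decode_init(LogicalDecodingContext *ctx, bool is_init)
	{
		/* allocate and stash plugin private state here if needed */
		ctx->output_plugin_private = NULL;
	}

	void
	pg_decode_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
	{
		ctx->prepare_write(ctx, txn->first_lsn, txn->xid);
		appendStringInfo(ctx->out, "BEGIN %u", txn->xid);
		ctx->write(ctx, txn->first_lsn, txn->xid);
	}

	void
	pg_decode_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
					 Relation relation, ReorderBufferChange *change)
	{
		const char *kind = "?";

		switch (change->action)
		{
			case REORDER_BUFFER_CHANGE_INSERT:
				kind = "INSERT";
				break;
			case REORDER_BUFFER_CHANGE_UPDATE:
				kind = "UPDATE";
				break;
			case REORDER_BUFFER_CHANGE_DELETE:
				kind = "DELETE";
				break;
		}

		ctx->prepare_write(ctx, change->lsn, txn->xid);
		appendStringInfo(ctx->out, "table %s: %s",
						 RelationGetRelationName(relation), kind);
		ctx->write(ctx, change->lsn, txn->xid);
	}

	void
	pg_decode_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
						 XLogRecPtr commit_lsn)
	{
		ctx->prepare_write(ctx, commit_lsn, txn->xid);
		appendStringInfo(ctx->out, "COMMIT %u", txn->xid);
		ctx->write(ctx, commit_lsn, txn->xid);
	}

	void
	pg_decode_cleanup(LogicalDecodingContext *ctx)
	{
		/* free whatever pg_decode_init allocated, if anything */
	}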
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
new file mode 100644
index 0000000..7a4e046
--- /dev/null
+++ b/src/include/replication/reorderbuffer.h
@@ -0,0 +1,342 @@
+/*
+ * reorderbuffer.h
+ *
+ * PostgreSQL logical replay buffer management
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * src/include/replication/reorderbuffer.h
+ */
+#ifndef REORDERBUFFER_H
+#define REORDERBUFFER_H
+
+#include "access/htup_details.h"
+#include "utils/hsearch.h"
+#include "utils/rel.h"
+
+#include "lib/ilist.h"
+
+#include "storage/sinval.h"
+
+#include "utils/snapshot.h"
+
+/* an individual tuple, stored in one chunk of memory */
+typedef struct ReorderBufferTupleBuf
+{
+	/* position in preallocated list */
+	slist_node	node;
+
+	/* tuple, stored sequentially */
+	HeapTupleData tuple;
+	HeapTupleHeaderData header;
+	char		data[MaxHeapTupleSize];
+} ReorderBufferTupleBuf;
+
+/* types of the change passed to a 'change' callback */
+enum ReorderBufferChangeType
+{
+	REORDER_BUFFER_CHANGE_INSERT,
+	REORDER_BUFFER_CHANGE_UPDATE,
+	REORDER_BUFFER_CHANGE_DELETE
+};
+
+/*
+ * a single 'change', can be an insert (with one tuple), an update (old, new),
+ * or a delete (old).
+ *
+ * The same struct is also used internally for other purposes but that should
+ * never be visible outside reorderbuffer.c.
+ */
+typedef struct ReorderBufferChange
+{
+	XLogRecPtr	lsn;
+
+	/* type of change */
+	union
+	{
+		enum ReorderBufferChangeType action;
+		/* do not leak internal enum values to the outside */
+		int			action_internal;
+	};
+
+	/*
+	 * Context data for the change; which part of the union is valid depends
+	 * on action/action_internal.
+	 */
+	union
+	{
+		/* old, new tuples when action == *_INSERT|UPDATE|DELETE */
+		struct
+		{
+			/* relation that has been changed */
+			RelFileNode relnode;
+			/* valid for DELETE || UPDATE */
+			ReorderBufferTupleBuf *oldtuple;
+			/* valid for INSERT || UPDATE */
+			ReorderBufferTupleBuf *newtuple;
+		};
+
+		/* new snapshot */
+		Snapshot	snapshot;
+
+		/* new command id for existing snapshot in a catalog changing tx */
+		CommandId	command_id;
+
+		/* new cid mapping for catalog changing transaction */
+		struct
+		{
+			RelFileNode node;
+			ItemPointerData tid;
+			CommandId	cmin;
+			CommandId	cmax;
+			CommandId	combocid;
+		}			tuplecid;
+	};
+
+	/*
+	 * While in use this is how a change is linked into a transaction's list
+	 * of changes, otherwise it's part of the preallocated list.
+	 */
+	dlist_node	node;
+} ReorderBufferChange;
+
+typedef struct ReorderBufferTXN
+{
+	/*
+	 * The transaction's transaction id, can be a toplevel or sub xid.
+	 */
+	TransactionId xid;
+
+	/*
+	 * LSN of the first data-carrying WAL record with knowledge about this
+	 * xid. This is allowed to *not* be the first record adorned with this
+	 * xid, if the previous records aren't relevant for logical decoding.
+	 */
+	XLogRecPtr	first_lsn;
+
+	/* ----
+	 * LSN of the record that led this xact to be committed or
+	 * aborted. This can be a
+	 * * plain commit record
+	 * * plain commit record, of a parent transaction
+	 * * prepared transaction commit
+	 * * plain abort record
+	 * * prepared transaction abort
+	 * * error during decoding
+	 * ----
+	 */
+	XLogRecPtr	final_lsn;
+
+	/*
+	 * LSN pointing to the end of the commit record + 1.
+	 */
+	XLogRecPtr	end_lsn;
+
+	/*
+	 * LSN of the last point at which snapshot information resides, so we can
+	 * restart decoding from there and fully recover this transaction from
+	 * WAL.
+	 */
+	XLogRecPtr	restart_decoding_lsn;
+
+	/*
+	 * Base snapshot or NULL.
+	 */
+	Snapshot	base_snapshot;
+
+	/* did the TX have catalog changes */
+	bool		does_timetravel;
+
+	/*
+	 * Do we know this is a subxact?
+	 */
+	bool		is_known_as_subxact;
+
+	/*
+	 * How many ReorderBufferChange's do we have in this txn.
+	 *
+	 * Changes in subtransactions are *not* included but tracked separately.
+	 */
+	Size		nentries;
+
+	/*
+	 * How many of the above entries are stored in memory in contrast to being
+	 * spilled to disk.
+	 */
+	Size		nentries_mem;
+
+	/*
+	 * List of ReorderBufferChange structs, including new Snapshots and new
+	 * CommandIds
+	 */
+	dlist_head	changes;
+
+	/*
+	 * List of (relation, ctid) => (cmin, cmax) mappings for catalog tuples.
+	 * Those are always assigned to the toplevel transaction. (Keep track of
+	 * #entries to create a hash of the right size)
+	 */
+	dlist_head	tuplecids;
+	size_t		ntuplecids;
+
+	/*
+	 * On-demand built hash for looking up the above values.
+	 */
+	HTAB	   *tuplecid_hash;
+
+	/*
+	 * Hash containing (potentially partial) toast entries. NULL if no toast
+	 * tuples have been found for the current change.
+	 */
+	HTAB	   *toast_hash;
+
+	/*
+	 * non-hierarchical list of subtransactions that are *not* aborted. Only
+	 * used in toplevel transactions.
+	 */
+	dlist_head	subtxns;
+	size_t		nsubtxns;
+
+	/* ---
+	 * Position in one of three lists:
+	 * * list of subtransactions if we are *known* to be subxact
+	 * * list of toplevel xacts (can be an as-yet unknown subxact)
+	 * * list of preallocated ReorderBufferTXNs
+	 * ---
+	 */
+	dlist_node	node;
+
+	/*
+	 * Stored cache invalidations. This is not a linked list because we get
+	 * all the invalidations at once.
+	 */
+	SharedInvalidationMessage *invalidations;
+	size_t		ninvalidations;
+
+} ReorderBufferTXN;
+
+/* so we can define the callbacks used inside struct ReorderBuffer itself */
+typedef struct ReorderBuffer ReorderBuffer;
+
+/* change callback signature */
+typedef void (*ReorderBufferApplyChangeCB) (
+														ReorderBuffer *rb,
+														ReorderBufferTXN *txn,
+														Relation relation,
+												ReorderBufferChange *change);
+
+/* begin callback signature */
+typedef void (*ReorderBufferBeginCB) (
+												  ReorderBuffer *rb,
+												  ReorderBufferTXN *txn);
+
+/* commit callback signature */
+typedef void (*ReorderBufferCommitCB) (
+												   ReorderBuffer *rb,
+												   ReorderBufferTXN *txn,
+												   XLogRecPtr commit_lsn);
+
+struct ReorderBuffer
+{
+	/*
+	 * xid => ReorderBufferTXN lookup table
+	 */
+	HTAB	   *by_txn;
+
+	/*
+	 * Transactions that could be a toplevel xact, ordered by LSN of the first
+	 * record bearing that xid.
+	 */
+	dlist_head	toplevel_by_lsn;
+
+	/*
+	 * one-entry sized cache for by_txn. Very frequently the same txn gets
+	 * looked up over and over again.
+	 */
+	TransactionId by_txn_last_xid;
+	ReorderBufferTXN *by_txn_last_txn;
+
+	/*
+	 * Callbacks to be called when a transaction commits.
+	 */
+	ReorderBufferBeginCB begin;
+	ReorderBufferApplyChangeCB apply_change;
+	ReorderBufferCommitCB commit;
+
+	/*
+	 * Pointer that will be passed untouched to the callbacks.
+	 */
+	void	   *private_data;
+
+	/*
+	 * Private memory context.
+	 */
+	MemoryContext context;
+
+	/*
+	 * Data structure slab cache.
+	 *
+	 * We allocate/deallocate some structures very frequently; to avoid that
+	 * overhead we cache some unused ones here.
+	 *
+	 * The maximum number of cached entries is controlled by const variables
+	 * at the top of reorderbuffer.c.
+	 */
+
+	/* cached ReorderBufferTXNs */
+	dlist_head	cached_transactions;
+	Size		nr_cached_transactions;
+
+	/* cached ReorderBufferChanges */
+	dlist_head	cached_changes;
+	Size		nr_cached_changes;
+
+	/* cached ReorderBufferTupleBufs */
+	slist_head	cached_tuplebufs;
+	Size		nr_cached_tuplebufs;
+
+	XLogRecPtr	current_restart_decoding_lsn;
+
+	/* buffer for disk<->memory conversions */
+	char	   *outbuf;
+	Size		outbufsize;
+};
+
+
+ReorderBuffer *ReorderBufferAllocate(void);
+void		ReorderBufferFree(ReorderBuffer *);
+
+ReorderBufferTupleBuf *ReorderBufferGetTupleBuf(ReorderBuffer *);
+void		ReorderBufferReturnTupleBuf(ReorderBuffer *, ReorderBufferTupleBuf *tuple);
+ReorderBufferChange *ReorderBufferGetChange(ReorderBuffer *);
+void		ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *);
+
+void		ReorderBufferQueueChange(ReorderBuffer *, TransactionId, XLogRecPtr lsn, ReorderBufferChange *);
+void		ReorderBufferCommit(ReorderBuffer *, TransactionId,
+								XLogRecPtr commit_lsn, XLogRecPtr end_lsn);
+void		ReorderBufferAssignChild(ReorderBuffer *, TransactionId, TransactionId, XLogRecPtr commit_lsn);
+void		ReorderBufferCommitChild(ReorderBuffer *, TransactionId, TransactionId,
+									 XLogRecPtr commit_lsn, XLogRecPtr end_lsn);
+void		ReorderBufferAbort(ReorderBuffer *, TransactionId, XLogRecPtr lsn);
+
+void		ReorderBufferSetBaseSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap);
+void		ReorderBufferAddSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap);
+void ReorderBufferAddNewCommandId(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+							 CommandId cid);
+void ReorderBufferAddNewTupleCids(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+							 RelFileNode node, ItemPointerData pt,
+						 CommandId cmin, CommandId cmax, CommandId combocid);
+void ReorderBufferAddInvalidations(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+							  Size nmsgs, SharedInvalidationMessage *msgs);
+bool		ReorderBufferIsXidKnown(ReorderBuffer *, TransactionId xid);
+void		ReorderBufferXidSetTimetravel(ReorderBuffer *, TransactionId xid, XLogRecPtr lsn);
+bool		ReorderBufferXidDoesTimetravel(ReorderBuffer *, TransactionId xid);
+bool		ReorderBufferXidHasBaseSnapshot(ReorderBuffer *, TransactionId xid);
+
+ReorderBufferTXN *ReorderBufferGetOldestTXN(ReorderBuffer *);
+
+void		ReorderBufferSetRestartPoint(ReorderBuffer *, XLogRecPtr ptr);
+
+void		ReorderBufferStartup(void);
+
+#endif
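As a rough illustration of the intended call pattern (a simplified sketch, not the
actual decode.c code; 'rb', 'xid', 'lsn' and 'target_node' are assumed to come from
the caller's WAL record): the decoder takes a change from the slab cache, fills it
in, and queues it under the record's xid; the reorder buffer then owns the change
and any tuple buffers attached to it.

	/* hypothetical sketch: queue an INSERT into a ReorderBuffer */
	ReorderBufferChange *change;
	ReorderBufferTupleBuf *tuple;

	change = ReorderBufferGetChange(rb);
	change->action = REORDER_BUFFER_CHANGE_INSERT;
	change->relnode = target_node;		/* RelFileNode taken from the WAL record */

	tuple = ReorderBufferGetTupleBuf(rb);
	/* ... copy the tuple data from the WAL record into tuple->tuple here ... */
	change->newtuple = tuple;
	change->oldtuple = NULL;

	ReorderBufferQueueChange(rb, xid, lsn, change);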
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h
new file mode 100644
index 0000000..7a4a217
--- /dev/null
+++ b/src/include/replication/snapbuild.h
@@ -0,0 +1,81 @@
+/*-------------------------------------------------------------------------
+ *
+ * snapbuild.h
+ *	  Exports from replication/logical/snapbuild.c.
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * src/include/replication/snapbuild.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SNAPBUILD_H
+#define SNAPBUILD_H
+
+#include "access/xlogdefs.h"
+#include "utils/snapmgr.h"
+
+typedef enum
+{
+	/*
+	 * Initial state, we can't do much yet.
+	 */
+	SNAPBUILD_START,
+
+	/*
+	 * We have collected enough information to decode tuples in transactions
+	 * that started after this.
+	 *
+	 * Once we reach this state we start to collect changes. We cannot apply
+	 * them yet because they might be based on transactions that were still
+	 * running when we reached this state.
+	 */
+	SNAPBUILD_FULL_SNAPSHOT,
+
+	/*
+	 * Found a point after reaching SNAPBUILD_FULL_SNAPSHOT where all
+	 * transactions that were running at that point have finished. Until we
+	 * reach such a point we hold off calling any commit callbacks.
+	 */
+	SNAPBUILD_CONSISTENT
+} SnapBuildState;
+
+/* forward declare so we don't have to expose the struct to the public */
+struct SnapBuild;
+typedef struct SnapBuild SnapBuild;
+
+/* forward declare so we don't have to include xlogreader.h */
+struct XLogRecordBuffer;
+struct ReorderBuffer;
+
+extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *cache,
+						  TransactionId xmin_horizon, XLogRecPtr start_lsn);
+extern void FreeSnapshotBuilder(SnapBuild *cache);
+
+extern void SnapBuildSnapDecRefcount(Snapshot snap);
+
+extern const char *SnapBuildExportSnapshot(SnapBuild *snapstate);
+extern void SnapBuildClearExportedSnapshot(void);
+
+extern SnapBuildState SnapBuildCurrentState(SnapBuild *snapstate);
+
+extern bool SnapBuildXactNeedsSkip(SnapBuild *snapstate, XLogRecPtr ptr);
+
+/* don't want to include heapam_xlog.h */
+struct xl_heap_new_cid;
+struct xl_running_xacts;
+
+extern void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn,
+							   TransactionId xid, int nsubxacts,
+							   TransactionId *subxacts);
+extern void SnapBuildAbortTxn(SnapBuild *builder, TransactionId xid,
+							  int nsubxacts, TransactionId *subxacts);
+extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid,
+								   XLogRecPtr lsn);
+extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
+								   XLogRecPtr lsn, struct xl_heap_new_cid *cid);
+extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn,
+										 struct xl_running_xacts *running);
+extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn);
+
+#endif   /* SNAPBUILD_H */
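To illustrate how the states above are meant to be consumed (a hedged sketch of a
hypothetical decode loop; 'builder', 'rb', 'xid', 'lsn', 'commit_lsn' and 'end_lsn'
are assumed to come from the surrounding code, and the exact gating used by decode.c
may differ):

	/* hold off replaying a transaction until a consistent snapshot exists */
	if (SnapBuildCurrentState(builder) == SNAPBUILD_CONSISTENT &&
		!SnapBuildXactNeedsSkip(builder, lsn))
		ReorderBufferCommit(rb, xid, commit_lsn, end_lsn);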
diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h
index 7eaa21b..daae320 100644
--- a/src/include/replication/walsender_private.h
+++ b/src/include/replication/walsender_private.h
@@ -66,6 +66,7 @@ typedef struct WalSnd
 
 extern WalSnd *MyWalSnd;
 
+
 /* There is one WalSndCtl struct for the whole database cluster */
 typedef struct
 {
@@ -93,7 +94,6 @@ typedef struct
 
 extern WalSndCtlData *WalSndCtl;
 
-
 extern void WalSndSetState(WalSndState state);
 
 /*
@@ -108,4 +108,8 @@ extern void replication_scanner_finish(void);
 
 extern Node *replication_parse_result;
 
+/* logical wal sender data gathering functions */
+extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
+
+
 #endif   /* _WALSENDER_PRIVATE_H */
diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h
index e0eb184..75c56a9 100644
--- a/src/include/storage/itemptr.h
+++ b/src/include/storage/itemptr.h
@@ -116,6 +116,9 @@ typedef ItemPointerData *ItemPointer;
 /*
  * ItemPointerCopy
  *		Copies the contents of one disk item pointer to another.
+ *
+ * Should there ever be padding in an ItemPointer this would need to be handled
+ * differently as it's used as a hash key.
  */
 #define ItemPointerCopy(fromPointer, toPointer) \
 ( \
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 39415a3..a33d6cf 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -80,6 +80,7 @@ typedef enum LWLockId
 	OldSerXidLock,
 	SyncRepLock,
 	BackgroundWorkerLock,
+	LogicalReplicationCtlLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index c5f58b4..744317e 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -50,7 +50,7 @@ extern RunningTransactions GetRunningTransactionData(void);
 
 extern bool TransactionIdIsInProgress(TransactionId xid);
 extern bool TransactionIdIsActive(TransactionId xid);
-extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum);
+extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum, bool systable, bool alreadyLocked);
 extern TransactionId GetOldestActiveTransactionId(void);
 
 extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h
index 7e70e57..5448818 100644
--- a/src/include/storage/sinval.h
+++ b/src/include/storage/sinval.h
@@ -147,4 +147,6 @@ extern void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs
 									 int nmsgs, bool RelcacheInitFileInval,
 									 Oid dbid, Oid tsid);
 
+extern void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg);
+
 #endif   /* SINVAL_H */
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index 6fd6e1e..5424912 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -64,4 +64,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
 
 extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
 
+extern void InvalidateSystemCaches(void);
 #endif   /* INVAL_H */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 0281b4b..6a4d2d5 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -104,6 +104,7 @@ typedef struct RelationData
 	List	   *rd_indexlist;	/* list of OIDs of indexes on relation */
 	Bitmapset  *rd_indexattr;	/* identifies columns used in indexes */
 	Bitmapset  *rd_keyattr;		/* cols that can be ref'd by foreign keys */
+	Bitmapset  *rd_ckeyattr;	/* cols that are part of a candidate key */
 	Oid			rd_oidindex;	/* OID of unique index on OID, if any */
 	LockInfoData rd_lockInfo;	/* lock mgr's info for locking relation */
 	RuleLock   *rd_rules;		/* rewrite rules */
@@ -221,6 +222,7 @@ typedef struct StdRdOptions
 	AutoVacOpts autovacuum;		/* autovacuum-related options */
 	bool		security_barrier;		/* for views */
 	int			check_option_offset;	/* for views */
+	bool        treat_as_catalog_table; /* treat as timetravelable table */
 } StdRdOptions;
 
 #define HEAP_MIN_FILLFACTOR			10
@@ -290,6 +292,15 @@ typedef struct StdRdOptions
 			"cascaded") == 0 : false)
 
 /*
+ * RelationIsTreatedAsCatalogTable
+ *		Returns whether the relation should be treated as a catalog table
+ *		from the point of view of logical decoding.
+ */
+#define RelationIsTreatedAsCatalogTable(relation)	\
+	((relation)->rd_options ?				\
+	 ((StdRdOptions *) (relation)->rd_options)->treat_as_catalog_table : false)
+
+/*
  * RelationIsValid
  *		True iff relation descriptor is valid.
  */
@@ -441,7 +452,6 @@ typedef struct StdRdOptions
 	((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \
 	 !(relation)->rd_islocaltemp)
 
-
 /*
  * RelationIsScannable
  *		Currently can only be false for a materialized view which has not been
@@ -458,6 +468,24 @@ typedef struct StdRdOptions
  */
 #define RelationIsPopulated(relation) ((relation)->rd_rel->relispopulated)
 
+/*
+ * RelationIsDoingTimetravel
+ *		True if we need to log enough information to provide timetravel access
+ */
+#define RelationIsDoingTimetravel(relation) \
+	(wal_level >= WAL_LEVEL_LOGICAL && \
+	 RelationIsDoingTimetravelInternal(relation))
+
+/*
+ * RelationIsLogicallyLogged
+ *		True if we need to log enough information to extract the data from the WAL stream
+ */
+#define RelationIsLogicallyLogged(relation) \
+	(wal_level >= WAL_LEVEL_LOGICAL && \
+	 RelationIsLogicallyLoggedInternal(relation))
+
+extern bool RelationIsDoingTimetravelInternal(Relation relation);
+extern bool RelationIsLogicallyLoggedInternal(Relation relation);
 
 /* routines in utils/cache/relcache.c */
 extern void RelationIncrementReferenceCount(Relation rel);
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 8ac2549..cfeded8 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -41,7 +41,16 @@ extern List *RelationGetIndexList(Relation relation);
 extern Oid	RelationGetOidIndex(Relation relation);
 extern List *RelationGetIndexExpressions(Relation relation);
 extern List *RelationGetIndexPredicate(Relation relation);
-extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs);
+
+typedef enum IndexAttrBitmapKind {
+	INDEX_ATTR_BITMAP_ALL,
+	INDEX_ATTR_BITMAP_KEY,
+	INDEX_ATTR_BITMAP_CANDIDATE_KEY
+}  IndexAttrBitmapKind;
+
+extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation,
+											 IndexAttrBitmapKind keyAttrs);
+
 extern void RelationGetExclusionInfo(Relation indexRelation,
 						 Oid **operators,
 						 Oid **procs,
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index 81a286c..2187f58 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -23,6 +23,7 @@ extern bool FirstSnapshotSet;
 extern TransactionId TransactionXmin;
 extern TransactionId RecentXmin;
 extern TransactionId RecentGlobalXmin;
+extern TransactionId RecentGlobalDataXmin;
 
 extern Snapshot GetTransactionSnapshot(void);
 extern Snapshot GetLatestSnapshot(void);
@@ -53,4 +54,6 @@ extern bool XactHasExportedSnapshots(void);
 extern void DeleteAllExportedSnapshotFiles(void);
 extern bool ThereAreNoPriorRegisteredSnapshots(void);
 
+extern char *ExportSnapshot(Snapshot snapshot);
+
 #endif   /* SNAPMGR_H */
diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h
index 19f56e4..cd3f880 100644
--- a/src/include/utils/tqual.h
+++ b/src/include/utils/tqual.h
@@ -22,6 +22,7 @@
 extern PGDLLIMPORT SnapshotData SnapshotSelfData;
 extern PGDLLIMPORT SnapshotData SnapshotAnyData;
 extern PGDLLIMPORT SnapshotData SnapshotToastData;
+extern PGDLLIMPORT SnapshotData CatalogSnapshotData;
 
 #define SnapshotSelf		(&SnapshotSelfData)
 #define SnapshotAny			(&SnapshotAnyData)
@@ -37,7 +38,8 @@ extern PGDLLIMPORT SnapshotData SnapshotToastData;
 
 /* This macro encodes the knowledge of which snapshots are MVCC-safe */
 #define IsMVCCSnapshot(snapshot)  \
-	((snapshot)->satisfies == HeapTupleSatisfiesMVCC)
+	((snapshot)->satisfies == HeapTupleSatisfiesMVCC || \
+	 (snapshot)->satisfies == HeapTupleSatisfiesMVCCDuringDecoding)
 
 /*
  * HeapTupleSatisfiesVisibility
@@ -86,4 +88,21 @@ extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer,
 					 uint16 infomask, TransactionId xid);
 extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple);
 
+/* Support for catalog timetravel */
+struct HTAB;
+extern bool HeapTupleSatisfiesMVCCDuringDecoding(HeapTuple htup,
+                                                 Snapshot snapshot, Buffer buffer);
+extern void SetupDecodingSnapshots(Snapshot snapshot_now, struct HTAB *tuplecids);
+extern void RevertFromDecodingSnapshots(void);
+extern void SuspendDecodingSnapshots(void);
+extern void UnSuspendDecodingSnapshots(void);
+
+/*
+ * To avoid leaking too much knowledge about reorderbuffer implementation
+ * details this is implemented in reorderbuffer.c not tqual.c.
+ */
+extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, HeapTuple htup,
+										  Buffer buffer,
+										  CommandId *cmin, CommandId *cmax);
+
 #endif   /* TQUAL_H */
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 8f24c51..d49e499 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1679,6 +1679,13 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
                                  |     pg_stat_get_db_conflict_bufferpin(d.oid) AS confl_bufferpin,                                                                                                                                               +
                                  |     pg_stat_get_db_conflict_startup_deadlock(d.oid) AS confl_deadlock                                                                                                                                          +
                                  |    FROM pg_database d;
+ pg_stat_logical_decoding        |  SELECT l.slot_name,                                                                                                                                                                                           +
+                                 |     l.plugin,                                                                                                                                                                                                  +
+                                 |     l.database,                                                                                                                                                                                                +
+                                 |     l.active,                                                                                                                                                                                                  +
+                                 |     l.xmin,                                                                                                                                                                                                    +
+                                 |     l.restart_decoding_lsn                                                                                                                                                                                     +
+                                 |    FROM pg_stat_get_logical_decoding_slots() l(slot_name, plugin, database, active, xmin, restart_decoding_lsn);
  pg_stat_replication             |  SELECT s.pid,                                                                                                                                                                                                 +
                                  |     s.usesysid,                                                                                                                                                                                                +
                                  |     u.rolname AS usename,                                                                                                                                                                                      +
@@ -2142,7 +2149,7 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
                                  |    FROM tv;
  tvvmv                           |  SELECT tvvm.grandtot                                                                                                                                                                                          +
                                  |    FROM tvvm;
-(64 rows)
+(65 rows)
 
 SELECT tablename, rulename, definition FROM pg_rules
 	ORDER BY tablename, rulename;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index b20eb0d..648caa0 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -621,6 +621,7 @@ Form_pg_ts_template
 Form_pg_type
 Form_pg_user_mapping
 FormatNode
+FreeLogicalReplicationCmd
 FromCharDateMode
 FromExpr
 FuncCall
@@ -791,6 +792,7 @@ IdentifySystemCmd
 IncrementVarSublevelsUp_context
 Index
 IndexArrayKeyInfo
+IndexAttrBitmapKind
 IndexBuildCallback
 IndexBuildResult
 IndexBulkDeleteCallback
@@ -818,6 +820,7 @@ IndxInfo
 InfoItem
 InhInfo
 InhOption
+InitLogicalReplicationCmd
 InheritableSocket
 InlineCodeBlock
 InsertStmt
@@ -937,6 +940,17 @@ LockTupleMode
 LockingClause
 LogOpts
 LogStmtLevel
+LogicalDecodeBeginCB
+LogicalDecodeChangeCB
+LogicalDecodeCleanupCB
+LogicalDecodeCommitCB
+LogicalDecodeInitCB
+LogicalDecodingCheckpointData
+LogicalDecodingContext
+LogicalDecodingCtlData
+LogicalDecodingSlot
+LogicalOutputPluginWriterPrepareWrite
+LogicalOutputPluginWriterWrite
 LogicalTape
 LogicalTapeSet
 MAGIC
@@ -1050,6 +1064,7 @@ OprInfo
 OprProofCacheEntry
 OprProofCacheKey
 OutputContext
+OutputPluginCallbacks
 OverrideSearchPath
 OverrideStackEntry
 PACE_HEADER
@@ -1464,6 +1479,21 @@ Relids
 RelocationBufferInfo
 RenameStmt
 ReopenPtr
+ReorderBuffer
+ReorderBufferApplyChangeCB
+ReorderBufferBeginCB
+ReorderBufferChange
+ReorderBufferChangeTypeInternal
+ReorderBufferCommitCB
+ReorderBufferDiskChange
+ReorderBufferIterTXNEntry
+ReorderBufferIterTXNState
+ReorderBufferToastEnt
+ReorderBufferTupleBuf
+ReorderBufferTupleCidEnt
+ReorderBufferTupleCidKey
+ReorderBufferTXN
+ReorderBufferTXNByIdEnt
 ReplaceVarsFromTargetList_context
 ReplaceVarsNoMatchOption
 ResTarget
@@ -1518,6 +1548,8 @@ SID_NAME_USE
 SISeg
 SMgrRelation
 SMgrRelationData
+SnapBuildAction
+SnapBuildState
 SOCKADDR
 SOCKET
 SPELL
@@ -1609,6 +1641,8 @@ SlruSharedData
 Snapshot
 SnapshotData
 SnapshotSatisfiesFunc
+Snapstate
+SnapstateOnDisk
 SockAddr
 Sort
 SortBy
@@ -1651,6 +1685,7 @@ StandardChunkHeader
 StartBlobPtr
 StartBlobsPtr
 StartDataPtr
+StartLogicalReplicationCmd
 StartReplicationCmd
 StartupPacket
 StatEntry
@@ -1874,6 +1909,7 @@ WalRcvData
 WalRcvState
 WalSnd
 WalSndCtlData
+WalSndSendData
 WalSndState
 WholeRowVarExprState
 WindowAgg
@@ -1925,6 +1961,7 @@ XLogReaderState
 XLogRecData
 XLogRecPtr
 XLogRecord
+XLogRecordBuffer
 XLogSegNo
 XLogSource
 XLogwrtResult
@@ -2347,6 +2384,7 @@ symbol
 tablespaceinfo
 teReqs
 teSection
+TestDecodingData
 temp_tablespaces_extra
 text
 timeKEY
@@ -2419,11 +2457,13 @@ xl_heap_cleanup_info
 xl_heap_delete
 xl_heap_freeze
 xl_heap_header
+xl_heap_header_len
 xl_heap_inplace
 xl_heap_insert
 xl_heap_lock
 xl_heap_lock_updated
 xl_heap_multi_insert
+xl_heap_new_cid
 xl_heap_newpage
 xl_heap_update
 xl_heap_visible
-- 
1.8.4.21.g992c386.dirty

