From 2a1e6241e3fa90fa04219c7b215f4c5f2faacba0 Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikolaev@gmail.com>
Date: Wed, 12 May 2021 22:55:50 +0300
Subject: [PATCH v3 1/4] code

---
 src/backend/access/gist/gistget.c        | 43 +++++++++++++++++--
 src/backend/access/gist/gistxlog.c       | 15 +++++++
 src/backend/access/hash/hash.c           |  4 +-
 src/backend/access/hash/hash_xlog.c      | 17 ++++++++
 src/backend/access/hash/hashsearch.c     | 18 ++++++--
 src/backend/access/hash/hashutil.c       | 33 ++++++++++++++-
 src/backend/access/heap/heapam.c         | 42 ++++++++++++++-----
 src/backend/access/heap/heapam_handler.c |  5 ++-
 src/backend/access/index/genam.c         | 20 ++++-----
 src/backend/access/index/indexam.c       | 26 ++++++++----
 src/backend/access/nbtree/nbtinsert.c    | 22 +++++++---
 src/backend/access/nbtree/nbtree.c       |  4 +-
 src/backend/access/nbtree/nbtsearch.c    | 14 ++++++-
 src/backend/access/nbtree/nbtutils.c     | 33 ++++++++++++++-
 src/backend/access/nbtree/nbtxlog.c      | 16 +++++++
 src/backend/access/table/tableam.c       |  4 +-
 src/backend/access/transam/rmgr.c        |  4 +-
 src/backend/access/transam/xlogutils.c   |  6 +++
 src/backend/storage/buffer/bufmgr.c      | 53 ++++++++++++++++++++++++
 src/bin/pg_rewind/parsexlog.c            |  2 +-
 src/bin/pg_waldump/rmgrdesc.c            |  2 +-
 src/include/access/gist.h                |  5 +++
 src/include/access/gistxlog.h            |  1 +
 src/include/access/hash.h                |  2 +
 src/include/access/hash_xlog.h           |  1 +
 src/include/access/heapam.h              |  2 +-
 src/include/access/nbtree.h              |  2 +
 src/include/access/nbtxlog.h             |  1 +
 src/include/access/relscan.h             | 15 +++++--
 src/include/access/rmgr.h                |  2 +-
 src/include/access/rmgrlist.h            | 46 ++++++++++----------
 src/include/access/tableam.h             | 14 ++++---
 src/include/access/xlog_internal.h       |  4 ++
 src/include/storage/bufmgr.h             | 11 +++++
 34 files changed, 399 insertions(+), 90 deletions(-)

diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index c8f7e781c6..10bb60aff4 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/genam.h"
 #include "access/gist_private.h"
 #include "access/relscan.h"
@@ -49,6 +50,7 @@ gistkillitems(IndexScanDesc scan)
 	Assert(so->curBlkno != InvalidBlockNumber);
 	Assert(!XLogRecPtrIsInvalid(so->curPageLSN));
 	Assert(so->killedItems != NULL);
+	Assert(so->numKilled > 0);
 
 	buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
 	if (!BufferIsValid(buffer))
@@ -62,8 +64,13 @@ gistkillitems(IndexScanDesc scan)
 	 * If page LSN differs it means that the page was modified since the last
 	 * read. killedItems could be not valid so LP_DEAD hints applying is not
 	 * safe.
+	 *
+	 * Another case - standby was promoted after start of current transaction.
+	 * It is not required for correctness, but it is better to just skip
+	 * everything.
 	 */
-	if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
+	if ((BufferGetLSNAtomic(buffer) != so->curPageLSN) ||
+			(scan->xactStartedInRecovery && !RecoveryInProgress()))
 	{
 		UnlockReleaseBuffer(buffer);
 		so->numKilled = 0;		/* reset counter */
@@ -71,6 +78,20 @@ gistkillitems(IndexScanDesc scan)
 	}
 
 	Assert(GistPageIsLeaf(page));
+	if (GistPageHasLpSafeOnStandby(page) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like server was promoted some time ago,
+		 * clear the flag just for accuracy. */
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+	else if (!GistPageHasLpSafeOnStandby(page) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by primary. We need to clear them,
+		 * and allow standby to set own. */
+		mask_lp_flags(page);
+		pg_memory_barrier();
+		GistMarkPageHasLpSafeOnStandby(page);
+	}
 
 	/*
 	 * Mark all killedItems as dead. We need no additional recheck, because,
@@ -338,6 +359,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	OffsetNumber maxoff;
 	OffsetNumber i;
 	MemoryContext oldcxt;
+	bool ignore_killed_tuples;
 
 	Assert(!GISTSearchItemIsHeap(*pageItem));
 
@@ -412,6 +434,15 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	 * check all tuples on page
 	 */
 	maxoff = PageGetMaxOffsetNumber(page);
+	/*
+	 * Check whether is it allowed to see LP_DEAD bits - always true for primary,
+	 * on secondary we should avoid flags that were set by primary.
+	 * In case of promotion xactStartedInRecovery may still be equal
+	 * to true on primary so, old standby-safe bits are used (case of old
+	 * transaction in promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+									GistPageHasLpSafeOnStandby(page);
 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
 		ItemId		iid = PageGetItemId(page, i);
@@ -424,7 +455,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 		 * If the scan specifies not to return killed tuples, then we treat a
 		 * killed tuple as not passing the qual.
 		 */
-		if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+		if (ignore_killed_tuples && ItemIdIsDead(iid))
 			continue;
 
 		it = (IndexTuple) PageGetItem(page, iid);
@@ -651,7 +682,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 		{
 			if (so->curPageData < so->nPageData)
 			{
-				if (scan->kill_prior_tuple && so->curPageData > 0)
+				if (scan->kill_prior_tuple && so->curPageData > 0 &&
+					(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+						scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 				{
 
 					if (so->killedItems == NULL)
@@ -688,7 +721,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 			 */
 			if (scan->kill_prior_tuple
 				&& so->curPageData > 0
-				&& so->curPageData == so->nPageData)
+				&& so->curPageData == so->nPageData
+				&& (XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+						scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 			{
 
 				if (so->killedItems == NULL)
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 6464cb9281..210044e512 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -451,6 +451,20 @@ gist_xlog_cleanup(void)
 	MemoryContextDelete(opCtx);
 }
 
+/*
+ * Mask a Gist page that LP_DEAD bits are not safe for the standby.
+ */
+void
+gist_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+
+	if (GistPageIsLeaf(page))
+	{
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+}
+
 /*
  * Mask a Gist page before running consistency checks on it.
  */
@@ -459,6 +473,7 @@ gist_mask(char *pagedata, BlockNumber blkno)
 {
 	Page		page = (Page) pagedata;
 
+	gist_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 0752fb38a9..339a6bf8b7 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -295,8 +295,10 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 	{
 		/*
 		 * Check to see if we should kill the previously-fetched tuple.
+		 * If the tuple is marked as dead but with min LSN - treat it as alive.
 		 */
-		if (scan->kill_prior_tuple)
+		if (scan->kill_prior_tuple &&
+				XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn))
 		{
 			/*
 			 * Yes, so remember it for later. (We'll deal with all such tuples
diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c
index af35a991fc..093315e8c4 100644
--- a/src/backend/access/hash/hash_xlog.c
+++ b/src/backend/access/hash/hash_xlog.c
@@ -1101,6 +1101,22 @@ hash_redo(XLogReaderState *record)
 	}
 }
 
+/*
+ * Mask a hash page that LP_DEAD bits are not safe for the standby.
+ */
+void
+hash_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+	int			pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
+
+	if (pagetype == LH_BUCKET_PAGE || pagetype == LH_OVERFLOW_PAGE)
+	{
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
  * Mask a hash page before performing consistency checks on it.
  */
@@ -1111,6 +1127,7 @@ hash_mask(char *pagedata, BlockNumber blkno)
 	HashPageOpaque opaque;
 	int			pagetype;
 
+	hash_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 2ffa28e8f7..2ea8ecd173 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -612,9 +612,21 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 	IndexTuple	itup;
 	int			itemIndex;
 	OffsetNumber maxoff;
+	bool		ignore_killed_tuples;
+	HashPageOpaque bucket_opaque;
 
 	maxoff = PageGetMaxOffsetNumber(page);
+	bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 
+	/*
+	 * Check whether is it allowed to see LP_DEAD bits - always true for primary,
+	 * on secondary we should avoid flags that were set by primary.
+	 * In case of promotion xactStartedInRecovery may still be equal
+	 * to true on primary so, old standby-safe bits are used (case of old
+	 * transaction in promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+									H_LP_SAFE_ON_STANDBY(bucket_opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -632,8 +644,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberNext(offnum);	/* move forward */
 				continue;
@@ -678,8 +689,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberPrev(offnum);	/* move back */
 				continue;
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 519872850e..fa8453660d 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/hash.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -547,6 +548,7 @@ _hash_kill_items(IndexScanDesc scan)
 	int			numKilled = so->numKilled;
 	int			i;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		havePin = false;
 
 	Assert(so->numKilled > 0);
@@ -559,6 +561,15 @@ _hash_kill_items(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * Standby was promoted after start of current transaction. It is not
+	 * required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	blkno = so->currPos.currPage;
 	if (HashScanPosIsPinned(so->currPos))
 	{
@@ -577,6 +588,23 @@ _hash_kill_items(IndexScanDesc scan)
 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (H_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like server was promoted some time ago,
+		 * clear the flag just for accuracy. */
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!H_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. We need to clear them,
+		 * and allow standby to set own. */
+		mask_lp_flags(page);
+		pg_memory_barrier();
+		opaque->hasho_flag |= LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -596,7 +624,7 @@ _hash_kill_items(IndexScanDesc scan)
 			{
 				/* found the item */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -611,6 +639,9 @@ _hash_kill_items(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(buf, true);
 	}
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index ba36da2b83..8ed4064151 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1698,9 +1698,11 @@ heap_fetch(Relation relation,
  * the tuple here, in addition to updating *tid.  If no match is found, the
  * contents of this buffer on return are undefined.
  *
- * If all_dead is not NULL, we check non-visible tuples to see if they are
- * globally dead; *all_dead is set true if all members of the HOT chain
- * are vacuumable, false if not.
+ * If deadness is not NULL, we check non-visible tuples to see if they
+ * are globally dead; *all_dead is set true if all members of the HOT chain
+ * are vacuumable, false if not. Also, *latest_removed_xid is set to the
+ * latest removed xid in a HOT chain, if known. *page_lsn is set to current page
+ * LSN value.
  *
  * Unlike heap_fetch, the caller must already have pin and (at least) share
  * lock on the buffer; it is still pinned/locked at exit.  Also unlike
@@ -1709,7 +1711,7 @@ heap_fetch(Relation relation,
 bool
 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 					   Snapshot snapshot, HeapTuple heapTuple,
-					   bool *all_dead, bool first_call)
+					   TupleDeadnessData *deadness, bool first_call)
 {
 	Page		dp = (Page) BufferGetPage(buffer);
 	TransactionId prev_xmax = InvalidTransactionId;
@@ -1721,8 +1723,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 	GlobalVisState *vistest = NULL;
 
 	/* If this is not the first call, previous call returned a (live!) tuple */
-	if (all_dead)
-		*all_dead = first_call;
+	if (deadness)
+	{
+		deadness->all_dead = first_call;
+		deadness->latest_removed_xid = InvalidTransactionId;
+		deadness->page_lsn = PageGetLSN(dp);
+	}
 
 	blkno = ItemPointerGetBlockNumber(tid);
 	offnum = ItemPointerGetOffsetNumber(tid);
@@ -1755,6 +1761,13 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				at_chain_start = false;
 				continue;
 			}
+			/*
+			 * Even if all items are dead we are not sure about latest_removed_xid
+			 * value. In theory, some newer items of the chain could be vacuumed
+			 * while older are not (pure paranoia, probably).
+			 */
+			if (deadness)
+				deadness->latest_removed_xid = InvalidTransactionId;
 			/* else must be end of chain */
 			break;
 		}
@@ -1804,8 +1817,11 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				ItemPointerSetOffsetNumber(tid, offnum);
 				PredicateLockTID(relation, &heapTuple->t_self, snapshot,
 								 HeapTupleHeaderGetXmin(heapTuple->t_data));
-				if (all_dead)
-					*all_dead = false;
+				if (deadness)
+				{
+					deadness->all_dead = false;
+					deadness->latest_removed_xid = InvalidTransactionId;
+				}
 				return true;
 			}
 		}
@@ -1819,13 +1835,19 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 		 * Note: if you change the criterion here for what is "dead", fix the
 		 * planner's get_actual_variable_range() function to match.
 		 */
-		if (all_dead && *all_dead)
+		if (deadness && deadness->all_dead)
 		{
 			if (!vistest)
 				vistest = GlobalVisTestFor(relation);
 
 			if (!HeapTupleIsSurelyDead(heapTuple, vistest))
-				*all_dead = false;
+			{
+				deadness->all_dead = false;
+				deadness->latest_removed_xid = InvalidTransactionId;
+			}
+			else
+				HeapTupleHeaderAdvanceLatestRemovedXid(heapTuple->t_data,
+											&deadness->latest_removed_xid);
 		}
 
 		/*
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 7a9a640989..b57c40b0c9 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -115,7 +115,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 						 ItemPointer tid,
 						 Snapshot snapshot,
 						 TupleTableSlot *slot,
-						 bool *call_again, bool *all_dead)
+						 bool *call_again,
+						 TupleDeadnessData *deadness)
 {
 	IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
 	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
@@ -147,7 +148,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 											hscan->xs_cbuf,
 											snapshot,
 											&bslot->base.tupdata,
-											all_dead,
+											deadness,
 											!*call_again);
 	bslot->base.tupdata.t_self = *tid;
 	LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 0aa26b448b..1818901577 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -106,18 +106,18 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
 	scan->xs_want_itup = false; /* may be set later */
 
 	/*
-	 * During recovery we ignore killed tuples and don't bother to kill them
-	 * either. We do this because the xmin on the primary node could easily be
-	 * later than the xmin on the standby node, so that what the primary
-	 * thinks is killed is supposed to be visible on standby. So for correct
-	 * MVCC for queries during recovery we must ignore these hints and check
-	 * all tuples. Do *not* set ignore_killed_tuples to true when running in a
-	 * transaction that was started during recovery. xactStartedInRecovery
-	 * should not be altered by index AMs.
-	 */
+	 * For correct MVCC for queries during recovery, we could use index LP_DEAD
+	 * bits as on the primary. But index AM should consider that it is possible
+	 * to receive such bits as part of FPI. The xmin on the primary node could
+	 * easily be later than the xmin on the standby node, so that what the
+	 * primary thinks is killed is supposed to be visible on standby.
+	 *
+	 * So for correct MVCC for queries during recovery we must mask these FPI
+	 * hints and check all tuples until standby-safe hints are set.
+	*/
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
-	scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
 
 	scan->opaque = NULL;
 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 5e22479b7a..c1b1802b1a 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -309,6 +309,7 @@ index_rescan(IndexScanDesc scan,
 		table_index_fetch_reset(scan->xs_heapfetch);
 
 	scan->kill_prior_tuple = false; /* for safety */
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
@@ -386,6 +387,7 @@ index_restrpos(IndexScanDesc scan)
 		table_index_fetch_reset(scan->xs_heapfetch);
 
 	scan->kill_prior_tuple = false; /* for safety */
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrestrpos(scan);
@@ -534,6 +536,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 
 	/* Reset kill flag immediately for safety */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	/* If we're out of index entries, we're done */
@@ -574,12 +577,18 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 bool
 index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 {
-	bool		all_dead = false;
-	bool		found;
+	TupleDeadnessData			deadness;
+	IndexLpDeadAllowedResult	kill_allowed;
+	bool						found;
+
+	deadness.all_dead = false;
+	deadness.latest_removed_xid = InvalidTransactionId;
+	deadness.page_lsn = InvalidXLogRecPtr;
 
 	found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid,
 									scan->xs_snapshot, slot,
-									&scan->xs_heap_continue, &all_dead);
+									&scan->xs_heap_continue,
+									&deadness);
 
 	if (found)
 		pgstat_count_heap_fetch(scan->indexRelation);
@@ -587,13 +596,11 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 	/*
 	 * If we scanned a whole HOT chain and found only dead tuples, tell index
 	 * AM to kill its entry for that TID (this will take effect in the next
-	 * amgettuple call, in index_getnext_tid).  We do not do this when in
-	 * recovery because it may violate MVCC to do so.  See comments in
-	 * RelationGetIndexScan().
+	 * amgettuple call, in index_getnext_tid). We do this when in
+	 * recovery only in certain conditions because it may violate MVCC.
 	 */
-	if (!scan->xactStartedInRecovery)
-		scan->kill_prior_tuple = all_dead;
-
+	kill_allowed = IsIndexLpDeadAllowed(&deadness, &scan->kill_prior_tuple_min_lsn);
+	scan->kill_prior_tuple = (kill_allowed != INDEX_LP_DEAD_NOT_OK);
 	return found;
 }
 
@@ -667,6 +674,7 @@ index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
 
 	/* just make sure this is false... */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 
 	/*
 	 * have the am's getbitmap proc do all the work.
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 6ac205c98e..caea430f5d 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -15,6 +15,7 @@
 
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
 #include "access/transam.h"
@@ -502,7 +503,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 			if (inposting || !ItemIdIsDead(curitemid))
 			{
 				ItemPointerData htid;
-				bool		all_dead = false;
+				TupleDeadnessData deadness;
+
+				deadness.all_dead = false;
+				deadness.latest_removed_xid = InvalidTransactionId;
+				deadness.page_lsn = InvalidXLogRecPtr;
 
 				if (!inposting)
 				{
@@ -556,7 +561,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 */
 				else if (table_index_fetch_tuple_check(heapRel, &htid,
 													   &SnapshotDirty,
-													   &all_dead))
+													   &deadness))
 				{
 					TransactionId xwait;
 
@@ -670,8 +675,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 													RelationGetRelationName(rel))));
 					}
 				}
-				else if (all_dead && (!inposting ||
-									  (prevalldead &&
+				else if (deadness.all_dead && (!inposting ||
+											   (prevalldead &&
 									   curposti == BTreeTupleGetNPosting(curitup) - 1)))
 				{
 					/*
@@ -679,6 +684,13 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 					 * all posting list TIDs) is dead to everyone, so mark the
 					 * index entry killed.
 					 */
+					Assert(!RecoveryInProgress());
+					if (P_LP_SAFE_ON_STANDBY(opaque))
+					{
+						/* Seems like server was promoted some time ago,
+						 * clear the flag just for accuracy. */
+						opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+					}
 					ItemIdMarkDead(curitemid);
 					opaque->btpo_flags |= BTP_HAS_GARBAGE;
 
@@ -696,7 +708,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 * Remember if posting list tuple has even a single HOT chain
 				 * whose members are not all dead
 				 */
-				if (!all_dead && inposting)
+				if (!deadness.all_dead && inposting)
 					prevalldead = false;
 			}
 		}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1360ab80c1..927c62415c 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -244,7 +244,9 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 			/*
 			 * Check to see if we should kill the previously-fetched tuple.
 			 */
-			if (scan->kill_prior_tuple)
+			if (scan->kill_prior_tuple &&
+				(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+					scan->kill_prior_tuple_min_lsn < so->currPos.lsn))
 			{
 				/*
 				 * Yes, remember it for later. (We'll deal with all such
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index d1177d8772..f7e57a6610 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1516,6 +1516,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		ignore_killed_tuples;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1569,6 +1570,15 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Check whether is it allowed to see LP_DEAD bits - always true for primary,
+	 * on secondary we should avoid flags that were set by primary.
+	 * In case of promotion xactStartedInRecovery may still be equal
+	 * to true on primary so, old standby-safe bits are used (case of old
+	 * transaction in promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+										P_LP_SAFE_ON_STANDBY(opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1585,7 +1595,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * If the scan specifies not to return killed tuples, then we
 			 * treat a killed tuple as not passing the qual
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				offnum = OffsetNumberNext(offnum);
 				continue;
@@ -1685,7 +1695,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * uselessly advancing to the page to the left.  This is similar
 			 * to the high key optimization used by forward scans.
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				Assert(offnum >= P_FIRSTDATAKEY(opaque));
 				if (offnum > P_FIRSTDATAKEY(opaque))
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index d524310723..df5179a9d8 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -17,6 +17,7 @@
 
 #include <time.h>
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -1725,6 +1726,7 @@ _bt_killitems(IndexScanDesc scan)
 	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
 
 	Assert(BTScanPosIsValid(so->currPos));
@@ -1735,6 +1737,15 @@ _bt_killitems(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * Standby was promoted after start of current transaction. It is not
+	 * required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	if (BTScanPosIsPinned(so->currPos))
 	{
 		/*
@@ -1771,6 +1782,23 @@ _bt_killitems(IndexScanDesc scan)
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (P_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like server was promoted some time ago,
+		 * clear the flag just for accuracy. */
+		opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!P_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by primary. We need to clear them,
+		 * and allow standby to set own. */
+		mask_lp_flags(page);
+		pg_memory_barrier();
+		opaque->btpo_flags |= BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -1866,7 +1894,7 @@ _bt_killitems(IndexScanDesc scan)
 			{
 				/* found the item/all posting list items */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -1883,6 +1911,9 @@ _bt_killitems(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 1779b6ba47..2b3601dbb1 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -1084,6 +1084,21 @@ btree_xlog_cleanup(void)
 	opCtx = NULL;
 }
 
+/*
+ * Mask a btree page that LP_DEAD bits are not safe for the standby.
+ */
+void
+btree_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	BTPageOpaque maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	if (P_ISLEAF(maskopaq))
+	{
+		maskopaq->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
  * Mask a btree page before performing consistency checks on it.
  */
@@ -1093,6 +1108,7 @@ btree_mask(char *pagedata, BlockNumber blkno)
 	Page		page = (Page) pagedata;
 	BTPageOpaque maskopaq;
 
+	btree_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 5ea5bdd810..e921960a88 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -219,7 +219,7 @@ bool
 table_index_fetch_tuple_check(Relation rel,
 							  ItemPointer tid,
 							  Snapshot snapshot,
-							  bool *all_dead)
+							  TupleDeadnessData *deadness)
 {
 	IndexFetchTableData *scan;
 	TupleTableSlot *slot;
@@ -229,7 +229,7 @@ table_index_fetch_tuple_check(Relation rel,
 	slot = table_slot_create(rel, NULL);
 	scan = table_index_fetch_begin(rel);
 	found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
-									all_dead);
+									deadness);
 	table_index_fetch_end(scan);
 	ExecDropSingleTupleTableSlot(slot);
 
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 58091f6b52..f9e7733da4 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -30,8 +30,8 @@
 #include "utils/relmapper.h"
 
 /* must be kept in sync with RmgrData definition in xlog_internal.h */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
-	{ name, redo, desc, identify, startup, cleanup, mask },
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
+	{ name, redo, desc, identify, startup, cleanup, mask, fpi_mask },
 
 const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 #include "access/rmgrlist.h"
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index d17d660f46..4092e7d8a6 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -332,6 +332,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 							  Buffer *buf)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
+	RmgrId		rmid = XLogRecGetRmid(record);
 	RelFileNode rnode;
 	ForkNumber	forknum;
 	BlockNumber blkno;
@@ -373,6 +374,11 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 		if (!PageIsNew(page))
 		{
 			PageSetLSN(page, lsn);
+			/* If FPI apply mask function is defined - apply it to the buffer. */
+			if (RmgrTable[rmid].rm_fpi_mask)
+			{
+				RmgrTable[rmid].rm_fpi_mask(page, blkno);
+			}
 		}
 
 		MarkBufferDirty(*buf);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0c5b87864b..d8b00bf16a 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3982,6 +3982,59 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 	}
 }
 
+/*
+ * IsIndexLpDeadAllowed
+ *
+ * Checks is it allowed to set LP_DEAD hint bit for the tuple in index.
+ */
+IndexLpDeadAllowedResult
+IsIndexLpDeadAllowed(TupleDeadnessData *deadness,
+					 XLogRecPtr *minLsn)
+{
+	*minLsn = InvalidXLogRecPtr;
+	if (!deadness->all_dead)
+		return INDEX_LP_DEAD_NOT_OK;
+	/* It is always allowed on primary if *all_dead. */
+	if (!RecoveryInProgress())
+		return INDEX_LP_DEAD_OK;
+
+	if (TransactionIdIsValid(deadness->latest_removed_xid)) {
+		/*
+		 * If latest_removed_xid is known - make sure its commit record
+		 * less than minRecoveryPoint to avoid MVCC failure after crash recovery.
+		 */
+		XLogRecPtr commitLSN
+				= TransactionIdGetCommitLSN(deadness->latest_removed_xid);
+
+		if (XLogNeedsFlush(commitLSN))
+		{
+			/* LSN not flushed - allow iff index LSN is greater. */
+			*minLsn = commitLSN;
+			return INDEX_LP_DEAD_OK_MIN_LSN;
+		}
+		else return INDEX_LP_DEAD_OK;
+	} else {
+		/*
+		 * Looks like it is tuple cleared by heap_page_prune_execute,
+		 * we must be sure if LSN of XLOG_HEAP2_CLEAN (or any subsequent
+		 * updates) less than minRecoveryPoint to avoid MVCC failure
+		 * after crash recovery.
+		 *
+		 * Another possible case is transaction rollback or tuple updated
+		 * by inserting transaction. Such tuple never will be seen, so it
+		 * is safe to set LP_DEAD. It is related to the logic of
+		 * HeapTupleHeaderAdvanceLatestRemovedXid.
+		 */
+		if (XLogNeedsFlush(deadness->page_lsn))
+		{
+			/* LSN not flushed - allow iff index LSN is greater. */
+			*minLsn = deadness->page_lsn;
+			return INDEX_LP_DEAD_OK_MIN_LSN;
+		}
+		else return INDEX_LP_DEAD_OK;
+	}
+}
+
 /*
  * Release buffer content locks for shared buffers.
  *
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 59ebac7d6a..2efcb887d0 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -28,7 +28,7 @@
  * RmgrNames is an array of resource manager names, to make error messages
  * a bit nicer.
  */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
   name,
 
 static const char *RmgrNames[RM_MAX_ID + 1] = {
diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c
index 852d8ca4b1..fd3bdec530 100644
--- a/src/bin/pg_waldump/rmgrdesc.c
+++ b/src/bin/pg_waldump/rmgrdesc.c
@@ -32,7 +32,7 @@
 #include "storage/standbydefs.h"
 #include "utils/relmapper.h"
 
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
 	{ name, desc, identify},
 
 const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 4b06575d98..d3f7960f7f 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -50,6 +50,7 @@
 #define F_FOLLOW_RIGHT		(1 << 3)	/* page to the right has no downlink */
 #define F_HAS_GARBAGE		(1 << 4)	/* some tuples on the page are dead,
 										 * but not deleted yet */
+#define F_LP_SAFE_ON_STANDBY	(1 << 5) /* LP bits are safe to use on standby */
 
 /*
  * NSN (node sequence number) is a special-purpose LSN which is stored on each
@@ -179,6 +180,10 @@ typedef struct GISTENTRY
 #define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE)
 #define GistClearPageHasGarbage(page)	( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE)
 
+#define GistPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags & F_LP_SAFE_ON_STANDBY)
+#define GistMarkPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags |= F_LP_SAFE_ON_STANDBY)
+#define GistClearPageHasLpSafeOnStandby(page)	( GistPageGetOpaque(page)->flags &= ~F_LP_SAFE_ON_STANDBY)
+
 #define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT)
 #define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT)
 #define GistClearFollowRight(page)	( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT)
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index fd5144f258..fbac95999b 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -110,5 +110,6 @@ extern const char *gist_identify(uint8 info);
 extern void gist_xlog_startup(void);
 extern void gist_xlog_cleanup(void);
 extern void gist_mask(char *pagedata, BlockNumber blkno);
+extern void gist_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 1cce865be2..f57401d484 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -59,6 +59,7 @@ typedef uint32 Bucket;
 #define LH_BUCKET_BEING_SPLIT	(1 << 5)
 #define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
 #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
+#define LH_LP_SAFE_ON_STANDBY	(1 << 8)
 
 #define LH_PAGE_TYPE \
 	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
@@ -89,6 +90,7 @@ typedef HashPageOpaqueData *HashPageOpaque;
 #define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
 #define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
 #define H_HAS_DEAD_TUPLES(opaque)		(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
+#define H_LP_SAFE_ON_STANDBY(opaque)	(((opaque)->hasho_flag & LH_LP_SAFE_ON_STANDBY) != 0)
 
 /*
  * The page ID is for the convenience of pg_filedump and similar utilities,
diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h
index 4353a32dbb..37bc96d391 100644
--- a/src/include/access/hash_xlog.h
+++ b/src/include/access/hash_xlog.h
@@ -263,5 +263,6 @@ extern void hash_redo(XLogReaderState *record);
 extern void hash_desc(StringInfo buf, XLogReaderState *record);
 extern const char *hash_identify(uint8 info);
 extern void hash_mask(char *pagedata, BlockNumber blkno);
+extern void hash_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif							/* HASH_XLOG_H */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index e63b49fc38..7e70b42ef4 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -136,7 +136,7 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot,
 					   HeapTuple tuple, Buffer *userbuf);
 extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation,
 								   Buffer buffer, Snapshot snapshot, HeapTuple heapTuple,
-								   bool *all_dead, bool first_call);
+								   TupleDeadnessData *deadness, bool first_call);
 
 extern void heap_get_latest_tid(TableScanDesc scan, ItemPointer tid);
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index a645c42e68..dc274ef2f2 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -80,6 +80,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
 #define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples (deprecated) */
 #define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */
 #define BTP_HAS_FULLXID	(1 << 8)	/* contains BTDeletedPageData */
+#define BTP_LP_SAFE_ON_STANDBY (1 << 9) /* LP bits are safe to use on standby */
 
 /*
  * The max allowed value of a cycle ID is a bit less than 64K.  This is
@@ -225,6 +226,7 @@ typedef struct BTMetaPageData
 #define P_HAS_GARBAGE(opaque)	(((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 #define P_INCOMPLETE_SPLIT(opaque)	(((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 #define P_HAS_FULLXID(opaque)	(((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+#define P_LP_SAFE_ON_STANDBY(opaque) (((opaque)->btpo_flags & BTP_LP_SAFE_ON_STANDBY) != 0)
 
 /*
  * BTDeletedPageData is the page contents of a deleted page
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 0f7731856b..366acc05b7 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -347,5 +347,6 @@ extern const char *btree_identify(uint8 info);
 extern void btree_xlog_startup(void);
 extern void btree_xlog_cleanup(void);
 extern void btree_mask(char *pagedata, BlockNumber blkno);
+extern void btree_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif							/* NBTXLOG_H */
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 17a161c69a..823ee31ad0 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -126,9 +126,10 @@ typedef struct IndexScanDescData
 
 	/* signaling to index AM about killing index tuples */
 	bool		kill_prior_tuple;	/* last-returned tuple is dead */
-	bool		ignore_killed_tuples;	/* do not return killed entries */
-	bool		xactStartedInRecovery;	/* prevents killing/seeing killed
-										 * tuples */
+	XLogRecPtr	kill_prior_tuple_min_lsn; /* kill_prior_tuple additionally
+										   * requires index page lsn */
+	bool		xactStartedInRecovery;	/* prevents ignoring tuples
+										 * killed by primary */
 
 	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
@@ -188,4 +189,12 @@ typedef struct SysScanDescData
 	struct TupleTableSlot *slot;
 }			SysScanDescData;
 
+/* Struct for data about visibility of tuple */
+typedef struct TupleDeadnessData
+{
+	bool			all_dead;			/* guaranteed not visible for all backends */
+	TransactionId	latest_removed_xid;	/* latest removed xid if known */
+	XLogRecPtr		page_lsn;			/* lsn of page where dead tuple located */
+}			TupleDeadnessData;
+
 #endif							/* RELSCAN_H */
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index c9b5c56a4c..8e322b0b7f 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -19,7 +19,7 @@ typedef uint8 RmgrId;
  * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG
  * file format.
  */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
 	symname,
 
 typedef enum RmgrIds
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index f582cf535f..13440a2883 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -24,26 +24,26 @@
  * Changes to this list possibly need an XLOG_PAGE_MAGIC bump.
  */
 
-/* symbol name, textual name, redo, desc, identify, startup, cleanup */
-PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
-PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
-PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
-PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
-PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
-PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
-PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
-PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
-PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
-PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
-PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
-PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
-PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
-PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
-PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
-PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
-PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
-PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
-PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
-PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
+/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, fpi_mask */
+PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, NULL)
+PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, NULL)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, btree_fpi_mask)
+PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, hash_fpi_mask)
+PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL)
+PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, gist_fpi_mask)
+PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL)
+PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL)
+PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL)
+PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL)
+PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, NULL)
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 9f1e4a1ac9..fa856da715 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -423,7 +423,7 @@ typedef struct TableAmRoutine
 	 * needs to be set to true by index_fetch_tuple, signaling to the caller
 	 * that index_fetch_tuple should be called again for the same tid.
 	 *
-	 * *all_dead, if all_dead is not NULL, should be set to true by
+	 * *deadness, if value is not NULL, should be filled by
 	 * index_fetch_tuple iff it is guaranteed that no backend needs to see
 	 * that tuple. Index AMs can use that to avoid returning that tid in
 	 * future searches.
@@ -432,7 +432,8 @@ typedef struct TableAmRoutine
 									  ItemPointer tid,
 									  Snapshot snapshot,
 									  TupleTableSlot *slot,
-									  bool *call_again, bool *all_dead);
+									  bool *call_again,
+									  TupleDeadnessData *deadness);
 
 
 	/* ------------------------------------------------------------------------
@@ -1194,7 +1195,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan)
  * will be set to true, signaling that table_index_fetch_tuple() should be called
  * again for the same tid.
  *
- * *all_dead, if all_dead is not NULL, will be set to true by
+ * *deadness, if value is not NULL, will be filled by
  * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see
  * that tuple. Index AMs can use that to avoid returning that tid in future
  * searches.
@@ -1211,7 +1212,8 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 						ItemPointer tid,
 						Snapshot snapshot,
 						TupleTableSlot *slot,
-						bool *call_again, bool *all_dead)
+						bool *call_again,
+						TupleDeadnessData *deadness)
 {
 	/*
 	 * We don't expect direct calls to table_index_fetch_tuple with valid
@@ -1223,7 +1225,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 
 	return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
 													slot, call_again,
-													all_dead);
+													deadness);
 }
 
 /*
@@ -1235,7 +1237,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 extern bool table_index_fetch_tuple_check(Relation rel,
 										  ItemPointer tid,
 										  Snapshot snapshot,
-										  bool *all_dead);
+										  TupleDeadnessData *deadness);
 
 
 /* ------------------------------------------------------------------------
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 26a743b6b6..96cbc23a50 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -292,6 +292,9 @@ typedef enum
  * rm_mask takes as input a page modified by the resource manager and masks
  * out bits that shouldn't be flagged by wal_consistency_checking.
  *
+ * rm_fpi_mask takes FPI buffer and applies access specific non-logged changes,
+ * for example - marks LP_DEAD bits on index page as non-safe for standby.
+ *
  * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h).
  */
 typedef struct RmgrData
@@ -303,6 +306,7 @@ typedef struct RmgrData
 	void		(*rm_startup) (void);
 	void		(*rm_cleanup) (void);
 	void		(*rm_mask) (char *pagedata, BlockNumber blkno);
+	void		(*rm_fpi_mask) (char *pagedata, BlockNumber blkno);
 } RmgrData;
 
 extern const RmgrData RmgrTable[];
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index aa64fb42ec..d1118d118e 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -46,6 +46,13 @@ typedef enum
 								 * replay; otherwise same as RBM_NORMAL */
 } ReadBufferMode;
 
+typedef enum
+{
+	INDEX_LP_DEAD_OK,			/* Index tuple could be marked as LP_DEAD */
+	INDEX_LP_DEAD_NOT_OK,		/* Not allowed to mark index tuple as dead */
+	INDEX_LP_DEAD_OK_MIN_LSN	/* Allowed if index page LSN is greater */
+} IndexLpDeadAllowedResult;
+
 /*
  * Type returned by PrefetchBuffer().
  */
@@ -61,6 +68,8 @@ struct WritebackContext;
 /* forward declared, to avoid including smgr.h here */
 struct SMgrRelationData;
 
+struct TupleDeadnessData;
+
 /* in globals.c ... this duplicates miscadmin.h */
 extern PGDLLIMPORT int NBuffers;
 
@@ -224,6 +233,8 @@ extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
 						 ForkNumber *forknum, BlockNumber *blknum);
 
 extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
+extern IndexLpDeadAllowedResult IsIndexLpDeadAllowed(struct TupleDeadnessData *deadness,
+													 XLogRecPtr *minLsn);
 
 extern void UnlockBuffers(void);
 extern void LockBuffer(Buffer buffer, int mode);
-- 
2.25.1

