From ee66907ed8a3af73ec974347dadab433d1a0a80f Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 31 Aug 2023 18:15:35 -0400
Subject: [PATCH v2 2/2] Reuse heap_page_prune() tuple visibility statuses

heap_page_prune() obtains the HTSV_Result (tuple visibility status)
returned from HeapTupleSatisfiesVacuum() for every tuple on the page and
stores them in an array. By making this array available to
heap_page_prune()'s caller lazy_scan_prune(), we can avoid an additional
call to HeapTupleSatisfiesVacuum() when freezing the tuples and
recording LP_DEAD items for vacuum. This saves resources and eliminates
the possibility that vacuuming corrupted data results in a hang due to
endless retry looping.

This replaces the retry mechanism introduced in 8523492d4 to handle cases in
which a tuple's inserting transaction aborted between the visibility check in
heap_page_prune() and lazy_scan_prune()'s call to HeapTupleSatisfiesVacuum() --
rendering it dead but without a dead line pointer. We can instead reuse the
tuple's original visibility status, circumventing any disagreements.

We pass the HTSV array from heap_page_prune() back to lazy_scan_prune() in
PruneResult. It is convenient to move heap_page_prune()'s other output
parameters into the PruneResult.

Discussion: https://postgr.es/m/CAAKRu_br124qsGJieuYA0nGjywEukhK1dKBfRdby_4yY3E9SXA%40mail.gmail.com
---
 src/backend/access/heap/pruneheap.c  | 66 +++++++++++++++-------------
 src/backend/access/heap/vacuumlazy.c | 54 +++++++----------------
 src/include/access/heapam.h          | 14 +++++-
 3 files changed, 64 insertions(+), 70 deletions(-)

diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 47b9e20915..512f62e27d 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -65,16 +65,6 @@ typedef struct
 	 * 1. Otherwise every access would need to subtract 1.
 	 */
 	bool		marked[MaxHeapTuplesPerPage + 1];
-
-	/*
-	 * Tuple visibility is only computed once for each tuple, for correctness
-	 * and efficiency reasons; see comment in heap_page_prune() for details.
-	 * This is of type int8[], instead of HTSV_Result[], so we can use -1 to
-	 * indicate no visibility has been computed, e.g. for LP_DEAD items.
-	 *
-	 * Same indexing as ->marked.
-	 */
-	int8		htsv[MaxHeapTuplesPerPage + 1];
 } PruneState;
 
 /* Local functions */
@@ -83,7 +73,7 @@ static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
 											   Buffer buffer);
 static int	heap_prune_chain(Buffer buffer,
 							 OffsetNumber rootoffnum,
-							 PruneState *prstate);
+							 PruneState *prstate, PruneResult *presult);
 static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
 static void heap_prune_record_redirect(PruneState *prstate,
 									   OffsetNumber offnum, OffsetNumber rdoffnum);
@@ -191,6 +181,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
 
 	if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
 	{
+		PruneResult presult;
+
 		/* OK, try to get exclusive buffer lock */
 		if (!ConditionalLockBufferForCleanup(buffer))
 			return;
@@ -202,11 +194,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
 		 */
 		if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
 		{
-			int			ndeleted,
-						nnewlpdead;
+			int			ndeleted;
 
 			ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin,
-									   limited_ts, &nnewlpdead, NULL);
+									   limited_ts, &presult, NULL);
 
 			/*
 			 * Report the number of tuples reclaimed to pgstats.  This is
@@ -222,9 +213,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
 			 * tracks ndeleted, since it will set the same LP_DEAD items to
 			 * LP_UNUSED separately.
 			 */
-			if (ndeleted > nnewlpdead)
+			if (ndeleted > presult.nnewlpdead)
 				pgstat_update_heap_dead_tuples(relation,
-											   ndeleted - nnewlpdead);
+											   ndeleted - presult.nnewlpdead);
 		}
 
 		/* And release buffer lock */
@@ -253,12 +244,13 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
  * either have been set by TransactionIdLimitedForOldSnapshots, or
  * InvalidTransactionId/0 respectively.
  *
- * Sets *nnewlpdead for caller, indicating the number of items that were
- * newly set LP_DEAD during prune operation.
- *
  * off_loc is the offset location required by the caller to use in error
  * callback.
  *
+ * presult contains output parameters relevant to the caller.
+ * For example, sets presult->nnewlpdead for caller, indicating the number of
+ * items that were newly set LP_DEAD during prune operation.
+ *
  * Returns the number of tuples deleted from the page during this call.
  */
 int
@@ -266,7 +258,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 				GlobalVisState *vistest,
 				TransactionId old_snap_xmin,
 				TimestampTz old_snap_ts,
-				int *nnewlpdead,
+				PruneResult *presult,
 				OffsetNumber *off_loc)
 {
 	int			ndeleted = 0;
@@ -301,6 +293,19 @@ heap_page_prune(Relation relation, Buffer buffer,
 	maxoff = PageGetMaxOffsetNumber(page);
 	tup.t_tableOid = RelationGetRelid(prstate.rel);
 
+	/* Initialize PruneResult */
+	presult->hastup = false;
+	presult->has_lpdead_items = false;
+	presult->all_visible = true;
+	presult->all_frozen = true;
+	presult->visibility_cutoff_xid = InvalidTransactionId;
+
+	/*
+	 * nnewlpdead only includes those items which were newly set to LP_DEAD
+	 * during pruning.
+	 */
+	presult->nnewlpdead = 0;
+
 	/*
 	 * Determine HTSV for all tuples.
 	 *
@@ -331,7 +336,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 		/* Nothing to do if slot doesn't contain a tuple */
 		if (!ItemIdIsNormal(itemid))
 		{
-			prstate.htsv[offnum] = -1;
+			presult->htsv[offnum] = -1;
 			continue;
 		}
 
@@ -347,8 +352,8 @@ heap_page_prune(Relation relation, Buffer buffer,
 		if (off_loc)
 			*off_loc = offnum;
 
-		prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
-														   buffer);
+		presult->htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
+															buffer);
 	}
 
 	/* Scan the page */
@@ -372,7 +377,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 			continue;
 
 		/* Process this item or chain of items */
-		ndeleted += heap_prune_chain(buffer, offnum, &prstate);
+		ndeleted += heap_prune_chain(buffer, offnum, &prstate, presult);
 	}
 
 	/* Clear the offset information once we have processed the given page. */
@@ -473,7 +478,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 	END_CRIT_SECTION();
 
 	/* Record number of newly-set-LP_DEAD items for caller */
-	*nnewlpdead = prstate.ndead;
+	presult->nnewlpdead = prstate.ndead;
 
 	return ndeleted;
 }
@@ -588,7 +593,8 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
  * Returns the number of tuples (to be) deleted from the page.
  */
 static int
-heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
+heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
+				 PruneState *prstate, PruneResult *presult)
 {
 	int			ndeleted = 0;
 	Page		dp = (Page) BufferGetPage(buffer);
@@ -609,7 +615,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 	 */
 	if (ItemIdIsNormal(rootlp))
 	{
-		Assert(prstate->htsv[rootoffnum] != -1);
+		Assert(presult->htsv[rootoffnum] != -1);
 		htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
 
 		if (HeapTupleHeaderIsHeapOnly(htup))
@@ -632,7 +638,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 			 * either here or while following a chain below.  Whichever path
 			 * gets there first will mark the tuple unused.
 			 */
-			if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
+			if (presult->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
 				!HeapTupleHeaderIsHotUpdated(htup))
 			{
 				heap_prune_record_unused(prstate, rootoffnum);
@@ -700,7 +706,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 			break;
 
 		Assert(ItemIdIsNormal(lp));
-		Assert(prstate->htsv[offnum] != -1);
+		Assert(presult->htsv[offnum] != -1);
 		htup = (HeapTupleHeader) PageGetItem(dp, lp);
 
 		/*
@@ -720,7 +726,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
 		 */
 		tupdead = recent_dead = false;
 
-		switch ((HTSV_Result) prstate->htsv[offnum])
+		switch ((HTSV_Result) presult->htsv[offnum])
 		{
 			case HEAPTUPLE_DEAD:
 				tupdead = true;
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 5e0a7422bb..25ed500b45 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1504,7 +1504,7 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
  * tuples, update statistics, and move the relfrozenxid forward.
  *
  * presult is an output parameter initialized and updated exclusively by
- * lazy_scan_prune().
+ * lazy_scan_prune() and heap_page_prune().
  *
  * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
  * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
@@ -1514,12 +1514,12 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
  * of complexity just so we could deal with tuples that were DEAD to VACUUM,
  * but nevertheless were left with storage after pruning.
  *
- * The approach we take now is to restart pruning when the race condition is
- * detected.  This allows heap_page_prune() to prune the tuples inserted by
- * the now-aborted transaction.  This is a little crude, but it guarantees
- * that any items that make it into the dead_items array are simple LP_DEAD
- * line pointers, and that every remaining item with tuple storage is
- * considered as a candidate for freezing.
+ * As of Postgres 17, we circumvent this problem altogether by resusing the
+ * result of heap_page_prune()'s visibility check. Without the second call to
+ * HeapTupleSatisfiesVacuum(), there is no new HTSV_Result and there can be no
+ * disagreement. The tuple's TID won't be added to the array of dead tuple TIDs
+ * for vacuum, thus vacuum will still never be tasked with reaping a tuple with
+ * storage.
  */
 static void
 lazy_scan_prune(LVRelState *vacrel,
@@ -1532,14 +1532,11 @@ lazy_scan_prune(LVRelState *vacrel,
 	OffsetNumber offnum,
 				maxoff;
 	ItemId		itemid;
-	HeapTupleData tuple;
-	HTSV_Result res;
 	int			tuples_deleted,
 				tuples_frozen,
 				lpdead_items,
 				live_tuples,
 				recently_dead_tuples;
-	int			nnewlpdead;
 	HeapPageFreeze pagefrz;
 	int64		fpi_before = pgWalUsage.wal_fpi;
 	OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
@@ -1554,8 +1551,6 @@ lazy_scan_prune(LVRelState *vacrel,
 	 */
 	maxoff = PageGetMaxOffsetNumber(page);
 
-retry:
-
 	/* Initialize (or reset) page-level state */
 	pagefrz.freeze_required = false;
 	pagefrz.FreezePageRelfrozenXid = vacrel->NewRelfrozenXid;
@@ -1578,23 +1573,19 @@ retry:
 	 * that were deleted from indexes.
 	 */
 	tuples_deleted = heap_page_prune(rel, buf, vacrel->vistest,
-									 InvalidTransactionId, 0, &nnewlpdead,
+									 InvalidTransactionId, 0,
+									 presult,
 									 &vacrel->offnum);
 
 	/*
 	 * Now scan the page to collect LP_DEAD items and check for tuples
 	 * requiring freezing among remaining tuples with storage
 	 */
-	presult->hastup = false;
-	presult->has_lpdead_items = false;
-	presult->all_visible = true;
-	presult->all_frozen = true;
-	presult->visibility_cutoff_xid = InvalidTransactionId;
-
 	for (offnum = FirstOffsetNumber;
 		 offnum <= maxoff;
 		 offnum = OffsetNumberNext(offnum))
 	{
+		HeapTupleHeader htup;
 		bool		totally_frozen;
 
 		/*
@@ -1637,22 +1628,7 @@ retry:
 
 		Assert(ItemIdIsNormal(itemid));
 
-		ItemPointerSet(&(tuple.t_self), blkno, offnum);
-		tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
-		tuple.t_len = ItemIdGetLength(itemid);
-		tuple.t_tableOid = RelationGetRelid(rel);
-
-		/*
-		 * DEAD tuples are almost always pruned into LP_DEAD line pointers by
-		 * heap_page_prune(), but it's possible that the tuple state changed
-		 * since heap_page_prune() looked.  Handle that here by restarting.
-		 * (See comments at the top of function for a full explanation.)
-		 */
-		res = HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin,
-									   buf);
-
-		if (unlikely(res == HEAPTUPLE_DEAD))
-			goto retry;
+		htup = (HeapTupleHeader) PageGetItem(page, itemid);
 
 		/*
 		 * The criteria for counting a tuple as live in this block need to
@@ -1673,7 +1649,7 @@ retry:
 		 * (Cases where we bypass index vacuuming will violate this optimistic
 		 * assumption, but the overall impact of that should be negligible.)
 		 */
-		switch (res)
+		switch (presult->htsv[offnum])
 		{
 			case HEAPTUPLE_LIVE:
 
@@ -1695,7 +1671,7 @@ retry:
 				{
 					TransactionId xmin;
 
-					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
+					if (!HeapTupleHeaderXminCommitted(htup))
 					{
 						presult->all_visible = false;
 						break;
@@ -1705,7 +1681,7 @@ retry:
 					 * The inserter definitely committed. But is it old enough
 					 * that everyone sees it as committed?
 					 */
-					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
+					xmin = HeapTupleHeaderGetXmin(htup);
 					if (!TransactionIdPrecedes(xmin,
 											   vacrel->cutoffs.OldestXmin))
 					{
@@ -1759,7 +1735,7 @@ retry:
 		presult->hastup = true; /* page makes rel truncation unsafe */
 
 		/* Tuple with storage -- consider need to freeze */
-		if (heap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs, &pagefrz,
+		if (heap_prepare_freeze_tuple(htup, &vacrel->cutoffs, &pagefrz,
 									  &frozen[tuples_frozen], &totally_frozen))
 		{
 			/* Save prepared freeze plan for later */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 9a07828e5a..bf94bda634 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -199,6 +199,8 @@ typedef struct PruneResult
 	bool		hastup;			/* Page prevents rel truncation? */
 	bool		has_lpdead_items;	/* includes existing LP_DEAD items */
 
+	int			nnewlpdead;		/* Number of newly LP_DEAD items */
+
 	/*
 	 * State describes the proper VM bit states to set for the page following
 	 * pruning and freezing.  all_visible implies !has_lpdead_items, but don't
@@ -207,6 +209,16 @@ typedef struct PruneResult
 	bool		all_visible;	/* Every item visible to all? */
 	bool		all_frozen;		/* provided all_visible is also true */
 	TransactionId visibility_cutoff_xid;	/* For recovery conflicts */
+
+	/*
+	 * Tuple visibility is only computed once for each tuple, for correctness
+	 * and efficiency reasons; see comment in heap_page_prune() for details.
+	 * This is of type int8[], instead of HTSV_Result[], so we can use -1 to
+	 * indicate no visibility has been computed, e.g. for LP_DEAD items.
+	 *
+	 * Same indexing as ->marked.
+	 */
+	int8		htsv[MaxHeapTuplesPerPage + 1];
 } PruneResult;
 
 
@@ -307,7 +319,7 @@ extern int	heap_page_prune(Relation relation, Buffer buffer,
 							struct GlobalVisState *vistest,
 							TransactionId old_snap_xmin,
 							TimestampTz old_snap_ts,
-							int *nnewlpdead,
+							PruneResult *presult,
 							OffsetNumber *off_loc);
 extern void heap_page_prune_execute(Buffer buffer,
 									OffsetNumber *redirected, int nredirected,
-- 
2.37.2

