From f0b9f507fcd8fb9e36856b359794eafdf91c9cd2 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 28 Mar 2024 18:26:06 -0400
Subject: [PATCH v10 01/10] Refactor heap_prune_chain()

Keep track of the number of deleted tuples in PruneState and record this
information when recording a tuple dead, unused, or redirected. This
removes a special case from the traversal and chain processing logic and
sets a precedent of recording the impact of prune actions in the record
functions themselves. This paradigm will be used in future commits which
move tracking of additional pruning statistics from lazy_scan_prune() to
heap_prune_chain().
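
In outline, a condensed sketch of the new bookkeeping (simplified from
the hunks below; names match the patch):

    /* in each heap_prune_record_*() function */
    if (was_normal)
        prstate->ndeleted++;

    /* in heap_page_prune(), after all chains have been processed */
    presult->nnewlpdead = prstate.ndead;
    presult->ndeleted = prstate.ndeleted;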

Simplify heap_prune_chain()'s chain traversal logic by handling each
case explicitly. That is, do not attempt to share code when processing
different types of chains. For each category of chain, process it
specifically and procedurally: first the root, then any intervening
tuples, and finally the end of the chain.
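
Condensed from the new process_chains logic below, each category of
chain is now handled in its own branch:

    if (ndeadchain == 0)
    {
        /* no DEAD tuples in the chain: leave it alone */
    }
    else if (ndeadchain == nchain)
    {
        /* whole chain is dead: root becomes LP_DEAD (or LP_UNUSED) */
        heap_prune_record_dead_or_unused(prstate, rootoffnum,
                                         ItemIdIsNormal(rootlp));
        for (int i = 1; i < nchain; i++)
            heap_prune_record_unused(prstate, chainitems[i], true);
    }
    else
    {
        /* redirect the root past the dead prefix of the chain */
        heap_prune_record_redirect(prstate, rootoffnum,
                                   chainitems[ndeadchain],
                                   ItemIdIsNormal(rootlp));
        for (int i = 1; i < ndeadchain; i++)
            heap_prune_record_unused(prstate, chainitems[i], true);
    }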

While we are at it, add a few new comments to heap_prune_chain()
clarifying some special cases involving RECENTLY_DEAD tuples.

ci-os-only:
---
 src/backend/access/heap/pruneheap.c | 219 +++++++++++++++++-----------
 1 file changed, 130 insertions(+), 89 deletions(-)

diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index ef816c2fa9..b3047536f5 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -51,22 +51,24 @@ typedef struct
 	 * 1. Otherwise every access would need to subtract 1.
 	 */
 	bool		marked[MaxHeapTuplesPerPage + 1];
+
+	int			ndeleted;		/* Number of tuples deleted from the page */
 } PruneState;
 
 /* Local functions */
 static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
 											   HeapTuple tup,
 											   Buffer buffer);
-static int	heap_prune_chain(Buffer buffer,
+static void heap_prune_chain(Buffer buffer,
 							 OffsetNumber rootoffnum,
 							 int8 *htsv,
 							 PruneState *prstate);
 static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
 static void heap_prune_record_redirect(PruneState *prstate,
-									   OffsetNumber offnum, OffsetNumber rdoffnum);
-static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
-static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum);
-static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
+									   OffsetNumber offnum, OffsetNumber rdoffnum, bool was_normal);
+static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, bool was_normal);
+static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal);
+static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal);
 static void page_verify_redirects(Page page);
 
 
@@ -242,6 +244,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 	prstate.snapshotConflictHorizon = InvalidTransactionId;
 	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
 	memset(prstate.marked, 0, sizeof(prstate.marked));
+	prstate.ndeleted = 0;
 
 	/*
 	 * presult->htsv is not initialized here because all ntuple spots in the
@@ -324,8 +327,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 			continue;
 
 		/* Process this item or chain of items */
-		presult->ndeleted += heap_prune_chain(buffer, offnum,
-											  presult->htsv, &prstate);
+		heap_prune_chain(buffer, offnum, presult->htsv, &prstate);
 	}
 
 	/* Clear the offset information once we have processed the given page. */
@@ -398,8 +400,9 @@ heap_page_prune(Relation relation, Buffer buffer,
 
 	END_CRIT_SECTION();
 
-	/* Record number of newly-set-LP_DEAD items for caller */
+	/* Copy data back to 'presult' */
 	presult->nnewlpdead = prstate.ndead;
+	presult->ndeleted = prstate.ndeleted;
 }
 
 
@@ -448,24 +451,25 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
  * to the redirected[] array (two entries per redirection); items to be set to
  * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
  * state are added to nowunused[].
- *
- * Returns the number of tuples (to be) deleted from the page.
  */
-static int
+static void
 heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 				 int8 *htsv, PruneState *prstate)
 {
-	int			ndeleted = 0;
 	Page		dp = (Page) BufferGetPage(buffer);
 	TransactionId priorXmax = InvalidTransactionId;
 	ItemId		rootlp;
 	HeapTupleHeader htup;
-	OffsetNumber latestdead = InvalidOffsetNumber,
-				maxoff = PageGetMaxOffsetNumber(dp),
+	OffsetNumber maxoff = PageGetMaxOffsetNumber(dp),
 				offnum;
 	OffsetNumber chainitems[MaxHeapTuplesPerPage];
-	int			nchain = 0,
-				i;
+
+	/*
+	 * After traversing the HOT chain, ndeadchain is the index in chainitems
+	 * of the first live successor after the last dead item.
+	 */
+	int			ndeadchain = 0,
+				nchain = 0;
 
 	rootlp = PageGetItemId(dp, rootoffnum);
 
@@ -496,18 +500,29 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 			 * Note that we might first arrive at a dead heap-only tuple
 			 * either here or while following a chain below.  Whichever path
 			 * gets there first will mark the tuple unused.
+			 *
+			 * Whether we arrive at the dead HOT tuple first here or while
+			 * following a chain below affects whether preceding RECENTLY_DEAD
+			 * tuples in the chain can be removed or not.  Imagine that you
+			 * have a chain with two tuples: RECENTLY_DEAD -> DEAD.  If we
+			 * reach the RECENTLY_DEAD tuple first, the chain-following logic
+			 * will find the DEAD tuple and conclude that both tuples are in
+			 * fact dead and can be removed.  But if we reach the DEAD tuple
+			 * at the end of the chain first, when we reach the RECENTLY_DEAD
+			 * tuple later, we will not follow the chain because the DEAD
+			 * tuple is already 'marked', and will not remove the
+			 * RECENTLY_DEAD tuple.  This is not a correctness issue, and the
+			 * RECENTLY_DEAD tuple will be removed by a later VACUUM.
 			 */
 			if (htsv[rootoffnum] == HEAPTUPLE_DEAD &&
 				!HeapTupleHeaderIsHotUpdated(htup))
 			{
-				heap_prune_record_unused(prstate, rootoffnum);
+				heap_prune_record_unused(prstate, rootoffnum, true);
 				HeapTupleHeaderAdvanceConflictHorizon(htup,
 													  &prstate->snapshotConflictHorizon);
-				ndeleted++;
 			}
 
-			/* Nothing more to do */
-			return ndeleted;
+			return;
 		}
 	}
 
@@ -518,8 +533,6 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 	for (;;)
 	{
 		ItemId		lp;
-		bool		tupdead,
-					recent_dead;
 
 		/* Sanity check (pure paranoia) */
 		if (offnum < FirstOffsetNumber)
@@ -569,7 +582,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 			 * the LP was already marked dead.
 			 */
 			if (unlikely(prstate->mark_unused_now))
-				heap_prune_record_unused(prstate, offnum);
+				heap_prune_record_unused(prstate, offnum, false);
 
 			break;
 		}
@@ -592,20 +605,37 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 		/*
 		 * Check tuple's visibility status.
 		 */
-		tupdead = recent_dead = false;
-
 		switch (htsv_get_valid_status(htsv[offnum]))
 		{
 			case HEAPTUPLE_DEAD:
-				tupdead = true;
+
+				/*
+				 * Remember the last DEAD tuple seen.  We will advance past
+				 * RECENTLY_DEAD tuples just in case there's a DEAD one after
+				 * them; but we can't advance past anything else.  We want to
+				 * ensure that any line pointers for DEAD tuples are set
+				 * LP_DEAD or LP_UNUSED. It is important that line pointers
+				 * whose offsets are added to deadoffsets are in fact set
+				 * LP_DEAD.
+				 */
+				ndeadchain = nchain;
+				HeapTupleHeaderAdvanceConflictHorizon(htup,
+													  &prstate->snapshotConflictHorizon);
 				break;
 
 			case HEAPTUPLE_RECENTLY_DEAD:
-				recent_dead = true;
 
 				/*
 				 * This tuple may soon become DEAD.  Update the hint field so
 				 * that the page is reconsidered for pruning in future.
+				 *
+				 * We don't need to advance the conflict horizon for
+				 * RECENTLY_DEAD tuples, even if we are removing them. This is
+				 * because we only remove RECENTLY_DEAD tuples if they precede
+				 * a DEAD tuple, and the DEAD tuple must have been inserted by
+				 * a newer transaction than the RECENTLY_DEAD tuple by virtue
+				 * of being later in the chain. We will have advanced the
+				 * conflict horizon for the DEAD tuple.
 				 */
 				heap_prune_record_prunable(prstate,
 										   HeapTupleHeaderGetUpdateXid(htup));
@@ -619,7 +649,6 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 				 */
 				heap_prune_record_prunable(prstate,
 										   HeapTupleHeaderGetUpdateXid(htup));
-				break;
 
 			case HEAPTUPLE_LIVE:
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
@@ -630,28 +659,12 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 				 * But we don't.  See related decisions about when to mark the
 				 * page prunable in heapam.c.
 				 */
-				break;
+				goto process_chains;
 
 			default:
 				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
-				break;
-		}
-
-		/*
-		 * Remember the last DEAD tuple seen.  We will advance past
-		 * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
-		 * but we can't advance past anything else.  We have to make sure that
-		 * we don't miss any DEAD tuples, since DEAD tuples that still have
-		 * tuple storage after pruning will confuse VACUUM.
-		 */
-		if (tupdead)
-		{
-			latestdead = offnum;
-			HeapTupleHeaderAdvanceConflictHorizon(htup,
-												  &prstate->snapshotConflictHorizon);
+				goto process_chains;
 		}
-		else if (!recent_dead)
-			break;
 
 		/*
 		 * If the tuple is not HOT-updated, then we are at the end of this
@@ -672,57 +685,58 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum,
 		priorXmax = HeapTupleHeaderGetUpdateXid(htup);
 	}
 
-	/*
-	 * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
-	 * the DEAD tuples at the start of the chain are removed and the root line
-	 * pointer is appropriately redirected.
-	 */
-	if (OffsetNumberIsValid(latestdead))
+	if (ItemIdIsRedirected(rootlp) && nchain < 2)
 	{
 		/*
-		 * Mark as unused each intermediate item that we are able to remove
-		 * from the chain.
-		 *
-		 * When the previous item is the last dead tuple seen, we are at the
-		 * right candidate for redirection.
+		 * We found a redirect item that doesn't point to a valid follow-on
+		 * item.  This can happen if the loop in heap_page_prune caused us to
+		 * visit the dead successor of a redirect item before visiting the
+		 * redirect item.  We can clean up by setting the redirect item to
+		 * LP_DEAD state, or LP_UNUSED if the caller indicated.
 		 */
-		for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
-		{
-			heap_prune_record_unused(prstate, chainitems[i]);
-			ndeleted++;
-		}
+		heap_prune_record_dead_or_unused(prstate, rootoffnum, false);
+		return;
+	}
 
+process_chains:
+
+	if (ndeadchain == 0)
+	{
 		/*
-		 * If the root entry had been a normal tuple, we are deleting it, so
-		 * count it in the result.  But changing a redirect (even to DEAD
-		 * state) doesn't count.
+		 * If no DEAD tuple was found, the chain is entirely composed of
+		 * normal, unchanged tuples; leave it alone.
 		 */
-		if (ItemIdIsNormal(rootlp))
-			ndeleted++;
-
+	}
+	else if (ndeadchain == nchain)
+	{
 		/*
 		 * If the DEAD tuple is at the end of the chain, the entire chain is
-		 * dead and the root line pointer can be marked dead.  Otherwise just
-		 * redirect the root to the correct chain member.
+		 * dead and the root line pointer can be marked dead.
 		 */
-		if (i >= nchain)
-			heap_prune_record_dead_or_unused(prstate, rootoffnum);
-		else
-			heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
+		heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp));
+
+		for (int i = 1; i < nchain; i++)
+			heap_prune_record_unused(prstate, chainitems[i], true);
 	}
-	else if (nchain < 2 && ItemIdIsRedirected(rootlp))
+	else
 	{
 		/*
-		 * We found a redirect item that doesn't point to a valid follow-on
-		 * item.  This can happen if the loop in heap_page_prune caused us to
-		 * visit the dead successor of a redirect item before visiting the
-		 * redirect item.  We can clean up by setting the redirect item to
-		 * DEAD state or LP_UNUSED if the caller indicated.
+		 * If we found a DEAD tuple in the chain, adjust the HOT chain so that
+		 * all the DEAD tuples at the start of the chain are removed and the
+		 * root line pointer is appropriately redirected.
 		 */
-		heap_prune_record_dead_or_unused(prstate, rootoffnum);
-	}
+		heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain],
+								   ItemIdIsNormal(rootlp));
+
+		/*
+		 * Mark as unused each intermediate item that we are able to remove
+		 * from the chain.
+		 */
+		for (int i = 1; i < ndeadchain; i++)
+			heap_prune_record_unused(prstate, chainitems[i], true);
 
-	return ndeleted;
+		/* the remaining tuples in the chain are normal, unchanged */
+	}
 }
 
 /* Record lowest soon-prunable XID */
@@ -742,7 +756,8 @@ heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
 /* Record line pointer to be redirected */
 static void
 heap_prune_record_redirect(PruneState *prstate,
-						   OffsetNumber offnum, OffsetNumber rdoffnum)
+						   OffsetNumber offnum, OffsetNumber rdoffnum,
+						   bool was_normal)
 {
 	Assert(prstate->nredirected < MaxHeapTuplesPerPage);
 	prstate->redirected[prstate->nredirected * 2] = offnum;
@@ -752,17 +767,34 @@ heap_prune_record_redirect(PruneState *prstate,
 	prstate->marked[offnum] = true;
 	Assert(!prstate->marked[rdoffnum]);
 	prstate->marked[rdoffnum] = true;
+
+	/*
+	 * If the root entry had been a normal tuple, we are deleting it, so count
+	 * it in the result.  But changing a redirect (even to DEAD state) doesn't
+	 * count.
+	 */
+	if (was_normal)
+		prstate->ndeleted++;
 }
 
 /* Record line pointer to be marked dead */
 static void
-heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
+heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
+					   bool was_normal)
 {
 	Assert(prstate->ndead < MaxHeapTuplesPerPage);
 	prstate->nowdead[prstate->ndead] = offnum;
 	prstate->ndead++;
 	Assert(!prstate->marked[offnum]);
 	prstate->marked[offnum] = true;
+
+	/*
+	 * If the root entry had been a normal tuple, we are deleting it, so count
+	 * it in the result.  But changing a redirect (even to DEAD state) doesn't
+	 * count.
+	 */
+	if (was_normal)
+		prstate->ndeleted++;
 }
 
 /*
@@ -772,7 +804,8 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
  * pointers LP_DEAD if mark_unused_now is true.
  */
 static void
-heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum)
+heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
+								 bool was_normal)
 {
 	/*
 	 * If the caller set mark_unused_now to true, we can remove dead tuples
@@ -781,20 +814,28 @@ heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum)
 	 * likely.
 	 */
 	if (unlikely(prstate->mark_unused_now))
-		heap_prune_record_unused(prstate, offnum);
+		heap_prune_record_unused(prstate, offnum, was_normal);
 	else
-		heap_prune_record_dead(prstate, offnum);
+		heap_prune_record_dead(prstate, offnum, was_normal);
 }
 
 /* Record line pointer to be marked unused */
 static void
-heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
+heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal)
 {
 	Assert(prstate->nunused < MaxHeapTuplesPerPage);
 	prstate->nowunused[prstate->nunused] = offnum;
 	prstate->nunused++;
 	Assert(!prstate->marked[offnum]);
 	prstate->marked[offnum] = true;
+
+	/*
+	 * If the root entry had been a normal tuple, we are deleting it, so count
+	 * it in the result.  But changing a redirect (even to DEAD state) doesn't
+	 * count.
+	 */
+	if (was_normal)
+		prstate->ndeleted++;
 }
 
 
-- 
2.40.1

