From 1271da941a9b093abb4565ddb2893f23eafbb33b Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 17 Oct 2024 14:05:15 -0400
Subject: [PATCH v1 09/14] heapam: Acquire right to set hint bits

This commit starts to use BufferPrepareToSetHintBits(), introduced in a
previous commit, for the reasons explained in said commit.

To amortize the cost of BufferPrepareToSetHintBits() for cases where hint bits
are set at a high frequency, HeapTupleSatisfiesMVCCBatch() uses the new
SetHintBitsExt() which defers BufferFinishSetHintBits() until all hint bits on
a page have been set.

It's likely worth introducing additional batch visibility routines, e.g. for
vacuuming, but I did not find a regression with the state as of this
commit. So that's left for later.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/access/heap/heapam_visibility.c | 116 +++++++++++++++++---
 src/tools/pgindent/typedefs.list            |   1 +
 2 files changed, 99 insertions(+), 18 deletions(-)

diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c
index 4baafc01a2c..61fbac6e7f5 100644
--- a/src/backend/access/heap/heapam_visibility.c
+++ b/src/backend/access/heap/heapam_visibility.c
@@ -80,10 +80,35 @@
 
 
 /*
- * SetHintBits()
+ * To be allowed to set hint bits SetHintBits() needs to call
+ * BufferPrepareToSetHintBits(). However, that's not free, and some callsites
+ * call SetHintBits() on numerous tuples in a row. For those it makes sense to
+ * amortize the cost of BufferPrepareToSetHintBits(). Additionally it's
+ * desirable to defer the cost of BufferPrepareToSetHintBits() until a hint
+ * bit needs to actually be set. This enum serves as the necessary state space
+ * passed to SetHintbitsExt().
+ */
+typedef enum SetHintBitsState
+{
+	/* not yet checked if hint bits may be set */
+	SHB_INITIAL,
+	/* failed to get permission to set hint bits, don't check again */
+	SHB_DISABLED,
+	/* allowed to set hint bits */
+	SHB_ENABLED,
+} SetHintBitsState;
+
+/*
+ * SetHintBitsExt()
  *
  * Set commit/abort hint bits on a tuple, if appropriate at this time.
  *
+ * To be allowed to set a hint bit on a tuple, the page must not be undergoing
+ * IO at this time (otherwise we e.g. could corrupt PG's page checksum or even
+ * the filesystem's, as is known to happen with btrfs). The right to set a
+ * hint bit is acquired on a page level with BufferPrepareToSetHintBits().
+ * Only a single backend gets the right to set hint bits at a time.
+ *
  * It is only safe to set a transaction-committed hint bit if we know the
  * transaction's commit record is guaranteed to be flushed to disk before the
  * buffer, or if the table is temporary or unlogged and will be obliterated by
@@ -111,9 +136,45 @@
  * InvalidTransactionId if no check is needed.
  */
 static inline void
-SetHintBits(HeapTupleHeader tuple, Buffer buffer,
-			uint16 infomask, TransactionId xid)
+SetHintBitsExt(HeapTupleHeader tuple, Buffer buffer,
+			   uint16 infomask, TransactionId xid, SetHintBitsState *state)
 {
+	/*
+	 * In batched mode and we previously did not get permission to set hint
+	 * bits. Don't try again, in all likelihood IO is still going on.
+	 */
+	if (state && *state == SHB_DISABLED)
+		return;
+
+	/*
+	 * If not batching or this is the first hint that we'd like to set on the
+	 * page, check if we are allowed to do so.
+	 *
+	 * XXX: Should we defer this until after the TransactionIdIsValid() check
+	 * below?
+	 */
+	if ((!state || *state == SHB_INITIAL))
+	{
+		if (BufferPrepareToSetHintBits(buffer))
+		{
+			if (state)
+				*state = SHB_ENABLED;
+		}
+		else
+		{
+			/*
+			 * If we hold an exclusive lock nobody else should be able to
+			 * prevent us from setting hint bits. This is important as there
+			 * are a few hint bit sets that are important for correctness -
+			 * all those happen with the buffer exclusively locked.
+			 */
+			Assert(!BufferLockHeldByMe(buffer, BUFFER_LOCK_EXCLUSIVE));
+			if (state)
+				*state = SHB_DISABLED;
+			return;
+		}
+	}
+
 	if (TransactionIdIsValid(xid))
 	{
 		/* NB: xid must be known committed here! */
@@ -123,12 +184,27 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer,
 			BufferGetLSNAtomic(buffer) < commitLSN)
 		{
 			/* not flushed and no LSN interlock, so don't set hint */
-			return;
+			goto out;
 		}
 	}
 
 	tuple->t_infomask |= infomask;
 	MarkBufferDirtyHint(buffer, true);
+
+out:
+	if (!state)
+		BufferFinishSetHintBits(buffer);
+}
+
+/*
+ * Simple wrapper around SetHintBitExt(), use when operating on a single
+ * tuple.
+ */
+static inline void
+SetHintBits(HeapTupleHeader tuple, Buffer buffer,
+			uint16 infomask, TransactionId xid)
+{
+	SetHintBitsExt(tuple, buffer, infomask, xid, NULL);
 }
 
 /*
@@ -864,9 +940,9 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot,
  * inserting/deleting transaction was still running --- which was more cycles
  * and more contention on ProcArrayLock.
  */
-static bool
+static inline bool
 HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
-					   Buffer buffer)
+					   Buffer buffer, SetHintBitsState *state)
 {
 	HeapTupleHeader tuple = htup->t_data;
 
@@ -912,8 +988,8 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
 			if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
 			{
 				/* deleting subtransaction must have aborted */
-				SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
-							InvalidTransactionId);
+				SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID,
+							   InvalidTransactionId, state);
 				return true;
 			}
 
@@ -925,13 +1001,13 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
 		else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot))
 			return false;
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
-			SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-						HeapTupleHeaderGetRawXmin(tuple));
+			SetHintBitsExt(tuple, buffer, HEAP_XMIN_COMMITTED,
+						   HeapTupleHeaderGetRawXmin(tuple), state);
 		else
 		{
 			/* it must have aborted or crashed */
-			SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-						InvalidTransactionId);
+			SetHintBitsExt(tuple, buffer, HEAP_XMIN_INVALID,
+						   InvalidTransactionId, state);
 			return false;
 		}
 	}
@@ -994,14 +1070,14 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
 		if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
 		{
 			/* it must have aborted or crashed */
-			SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
-						InvalidTransactionId);
+			SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID,
+						   InvalidTransactionId, state);
 			return true;
 		}
 
 		/* xmax transaction committed */
-		SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-					HeapTupleHeaderGetRawXmax(tuple));
+		SetHintBitsExt(tuple, buffer, HEAP_XMAX_COMMITTED,
+					   HeapTupleHeaderGetRawXmax(tuple), state);
 	}
 	else
 	{
@@ -1597,6 +1673,7 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer,
 							OffsetNumber *vistuples_dense)
 {
 	int			nvis = 0;
+	SetHintBitsState state = SHB_INITIAL;
 
 	Assert(IsMVCCSnapshot(snapshot));
 
@@ -1609,7 +1686,7 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer,
 		HeapTuple	tup = &tuples[i];
 #endif
 
-		valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer);
+		valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer, &state);
 #ifdef BATCHMVCC_FEWER_ARGS
 		batchmvcc->visible[i] = valid;
 #else
@@ -1622,6 +1699,9 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer,
 		}
 	}
 
+	if (state == SHB_ENABLED)
+		BufferFinishSetHintBits(buffer);
+
 	return nvis;
 }
 
@@ -1641,7 +1721,7 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
 	switch (snapshot->snapshot_type)
 	{
 		case SNAPSHOT_MVCC:
-			return HeapTupleSatisfiesMVCC(htup, snapshot, buffer);
+			return HeapTupleSatisfiesMVCC(htup, snapshot, buffer, NULL);
 		case SNAPSHOT_SELF:
 			return HeapTupleSatisfiesSelf(htup, snapshot, buffer);
 		case SNAPSHOT_ANY:
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 15c7673465b..04b4f5cb240 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2604,6 +2604,7 @@ SetConstraintStateData
 SetConstraintTriggerData
 SetExprState
 SetFunctionReturnMode
+SetHintBitsState
 SetOp
 SetOpCmd
 SetOpPath
-- 
2.46.0.519.g2e7b89e038

