From b91d996c691077a7fa48718c933159136d650e77 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Thu, 29 Jun 2023 10:52:56 +1200
Subject: [PATCH 1/2] Improve ReadRecentBuffer() scalability.

While testing a new potential use for ReadRecentBuffer(), Andres
reported that it scales badly when called concurrently for the same
buffer by many backends.  Instead of a naive (but wrong) coding with
PinBuffer(), it used the spinlock, so that it could be careful to pin
only if the buffer was valid and holding the expected block, to avoid
breaking invariants in eg GetVictimBuffer().  Unfortunately that made it
less scalable than PinBuffer(), which uses compare-exchange instead.

We can fix that by giving PinBuffer() a new skip_if_not_valid mode that
doesn't pin invalid buffers.  It might occasionally skip when it
shouldn't due to the unlocked read of the header flags, but that's
unlikely and perfectly acceptable for an opportunistic optimisation
routine, and it can only succeed when it really should due to the
compare-exchange loop.

XXX This also fixes ReadRecentBuffer()'s failure to bump the usage
count.  Fix separately or back-patch this?

Reported-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/20230627020546.t6z4tntmj7wmjrfh%40awork3.anarazel.de
---
 src/backend/storage/buffer/bufmgr.c | 60 ++++++++++++-----------------
 1 file changed, 24 insertions(+), 36 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3c59bbd04e..0df9a1ec30 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -467,7 +467,8 @@ static BlockNumber ExtendBufferedRelShared(ExtendBufferedWhat eb,
 										   BlockNumber extend_upto,
 										   Buffer *buffers,
 										   uint32 *extended_by);
-static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
+static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
+					  bool skip_if_not_valid);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf);
 static void BufferSync(int flags);
@@ -635,7 +636,6 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
 	BufferDesc *bufHdr;
 	BufferTag	tag;
 	uint32		buf_state;
-	bool		have_private_ref;
 
 	Assert(BufferIsValid(recent_buffer));
 
@@ -663,38 +663,17 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
 	else
 	{
 		bufHdr = GetBufferDescriptor(recent_buffer - 1);
-		have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
 
-		/*
-		 * Do we already have this buffer pinned with a private reference?  If
-		 * so, it must be valid and it is safe to check the tag without
-		 * locking.  If not, we have to lock the header first and then check.
-		 */
-		if (have_private_ref)
-			buf_state = pg_atomic_read_u32(&bufHdr->state);
-		else
-			buf_state = LockBufHdr(bufHdr);
-
-		if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
+		/* Is it still valid and holding the right tag? */
+		if (PinBuffer(bufHdr, NULL, true))
 		{
-			/*
-			 * It's now safe to pin the buffer.  We can't pin first and ask
-			 * questions later, because it might confuse code paths like
-			 * InvalidateBuffer() if we pinned a random non-matching buffer.
-			 */
-			if (have_private_ref)
-				PinBuffer(bufHdr, NULL);	/* bump pin count */
-			else
-				PinBuffer_Locked(bufHdr);	/* pin for first time */
-
-			pgBufferUsage.shared_blks_hit++;
-
-			return true;
+			if (BufferTagsEqual(&tag, &bufHdr->tag))
+			{
+				pgBufferUsage.shared_blks_hit++;
+				return true;
+			}
+			UnpinBuffer(bufHdr);
 		}
-
-		/* If we locked the header above, now unlock. */
-		if (!have_private_ref)
-			UnlockBufHdr(bufHdr, buf_state);
 	}
 
 	return false;
@@ -1252,7 +1231,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		buf = GetBufferDescriptor(existing_buf_id);
 
-		valid = PinBuffer(buf, strategy);
+		valid = PinBuffer(buf, strategy, false);
 
 		/* Can release the mapping lock as soon as we've pinned it */
 		LWLockRelease(newPartitionLock);
@@ -1330,7 +1309,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 		existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
 
-		valid = PinBuffer(existing_buf_hdr, strategy);
+		valid = PinBuffer(existing_buf_hdr, strategy, false);
 
 		/* Can release the mapping lock as soon as we've pinned it */
 		LWLockRelease(newPartitionLock);
@@ -1979,7 +1958,7 @@ ExtendBufferedRelShared(ExtendBufferedWhat eb,
 			 * Pin the existing buffer before releasing the partition lock,
 			 * preventing it from being evicted.
 			 */
-			valid = PinBuffer(existing_hdr, strategy);
+			valid = PinBuffer(existing_hdr, strategy, false);
 
 			LWLockRelease(partition_lock);
 
@@ -2225,10 +2204,13 @@ ReleaseAndReadBuffer(Buffer buffer,
  * Note that ResourceOwnerEnlargeBuffers must have been done already.
  *
  * Returns true if buffer is BM_VALID, else false.  This provision allows
- * some callers to avoid an extra spinlock cycle.
+ * some callers to avoid an extra spinlock cycle.  If skip_if_not_valid is
+ * true, then a false return value also indicates that the buffer was
+ * (recently) invalid and has not been pinned.
  */
 static bool
-PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
+PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
+		  bool skip_if_not_valid)
 {
 	Buffer		b = BufferDescriptorGetBuffer(buf);
 	bool		result;
@@ -2252,6 +2234,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 			if (old_buf_state & BM_LOCKED)
 				old_buf_state = WaitBufHdrUnlocked(buf);
 
+			if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
+			{
+				ForgetPrivateRefCountEntry(ref);
+				return false;
+			}
+
 			buf_state = old_buf_state;
 
 			/* increase refcount */
-- 
2.40.1

