From 04b07d0627ec65ba3327dc8338d59dbd15c405d8 Mon Sep 17 00:00:00 2001
From: Yura Sokolov <y.sokolov@postgrespro.ru>
Date: Mon, 21 Feb 2022 08:49:03 +0300
Subject: [PATCH v3] [PGPRO-5616] bufmgr: do not acquire two partition locks.

Acquiring two partition locks creates a complex dependency chain that hurts
performance at high concurrency levels.

There is no need to hold both locks simultaneously. The buffer is pinned,
so other processes cannot select it for eviction. Once its tag is cleared
and the buffer is removed from the old partition, other processes will not
find it. Therefore it is safe to release the old partition lock before
acquiring the new partition lock.
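
For illustration, the new sequence in BufferAlloc() boils down to the
following sketch (simplified: it assumes the old tag is valid and maps to
a different partition than the new tag; pinning, collision handling and
error paths are shown in the actual diff below):

    LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
    /* victim buffer is already pinned and its header is locked here */
    CLEAR_BUFFERTAG(buf->tag);
    buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
    UnlockBufHdr(buf, buf_state);
    BufTableDelete(&oldTag, oldHash);   /* buffer is now unreachable */
    LWLockRelease(oldPartitionLock);    /* drop the old lock first... */

    LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);  /* ...then take the new one */
    buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
    buf->tag = newTag;                  /* no collision: claim the new tag */
    LWLockRelease(newPartitionLock);

At no point are two mapping partition locks held at once, so the
lock-ordering dance between the old and new partitions is gone.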

Tags: lwlock_numa
---
 src/backend/storage/buffer/bufmgr.c | 206 ++++++++++++++--------------
 1 file changed, 105 insertions(+), 101 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b4532948d3f..bb8b1cd2f4b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1114,6 +1114,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	BufferDesc *buf;
 	bool		valid;
 	uint32		buf_state;
+	uint32		new_bits;
 
 	/* create a tag so we can lookup the buffer */
 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
@@ -1288,93 +1289,16 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			oldHash = BufTableHashCode(&oldTag);
 			oldPartitionLock = BufMappingPartitionLock(oldHash);
 
-			/*
-			 * Must lock the lower-numbered partition first to avoid
-			 * deadlocks.
-			 */
-			if (oldPartitionLock < newPartitionLock)
-			{
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
-			else if (oldPartitionLock > newPartitionLock)
-			{
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-			}
-			else
-			{
-				/* only one partition, only one lock */
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
+			LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
 		}
 		else
 		{
-			/* if it wasn't valid, we need only the new partition */
-			LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
 			/* remember we have no old-partition lock or tag */
 			oldPartitionLock = NULL;
 			/* keep the compiler quiet about uninitialized variables */
 			oldHash = 0;
 		}
 
-		/*
-		 * Try to make a hashtable entry for the buffer under its new tag.
-		 * This could fail because while we were writing someone else
-		 * allocated another buffer for the same block we want to read in.
-		 * Note that we have not yet removed the hashtable entry for the old
-		 * tag.
-		 */
-		buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
-
-		if (buf_id >= 0)
-		{
-			/*
-			 * Got a collision. Someone has already done what we were about to
-			 * do. We'll just handle this as if it were found in the buffer
-			 * pool in the first place.  First, give up the buffer we were
-			 * planning to use.
-			 */
-			UnpinBuffer(buf, true);
-
-			/* Can give up that buffer's mapping partition lock now */
-			if (oldPartitionLock != NULL &&
-				oldPartitionLock != newPartitionLock)
-				LWLockRelease(oldPartitionLock);
-
-			/* remaining code should match code at top of routine */
-
-			buf = GetBufferDescriptor(buf_id);
-
-			valid = PinBuffer(buf, strategy);
-
-			/* Can release the mapping lock as soon as we've pinned it */
-			LWLockRelease(newPartitionLock);
-
-			*foundPtr = true;
-
-			if (!valid)
-			{
-				/*
-				 * We can only get here if (a) someone else is still reading
-				 * in the page, or (b) a previous read attempt failed.  We
-				 * have to wait for any active read attempt to finish, and
-				 * then set up our own read attempt if the page is still not
-				 * BM_VALID.  StartBufferIO does it all.
-				 */
-				if (StartBufferIO(buf, true))
-				{
-					/*
-					 * If we get here, previous attempts to read the buffer
-					 * must have failed ... but we shall bravely try again.
-					 */
-					*foundPtr = false;
-				}
-			}
-
-			return buf;
-		}
-
 		/*
 		 * Need to lock the buffer header too in order to change its tag.
 		 */
@@ -1382,52 +1306,132 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 		/*
 		 * Somebody could have pinned or re-dirtied the buffer while we were
-		 * doing the I/O and making the new hashtable entry.  If so, we can't
-		 * recycle this buffer; we must undo everything we've done and start
-		 * over with a new victim buffer.
+		 * doing the I/O.  If so, we can't recycle this buffer; we must undo
+		 * everything we've done and start over with a new victim buffer.
 		 */
 		oldFlags = buf_state & BUF_FLAG_MASK;
 		if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
 			break;
 
 		UnlockBufHdr(buf, buf_state);
-		BufTableDelete(&newTag, newHash);
-		if (oldPartitionLock != NULL &&
-			oldPartitionLock != newPartitionLock)
+		if (oldPartitionLock != NULL)
 			LWLockRelease(oldPartitionLock);
-		LWLockRelease(newPartitionLock);
 		UnpinBuffer(buf, true);
 	}
 
 	/*
-	 * Okay, it's finally safe to rename the buffer.
+	 * Clear out the buffer's tag and flags.  We must do this to ensure that
+	 * linear scans of the buffer array don't think the buffer is valid. We
+	 * also reset the usage_count since any recency of use of the old content
+	 * is no longer relevant.
 	 *
-	 * Clearing BM_VALID here is necessary, clearing the dirtybits is just
-	 * paranoia.  We also reset the usage_count since any recency of use of
-	 * the old content is no longer relevant.  (The usage_count starts out at
-	 * 1 so that the buffer can survive one clock-sweep pass.)
+	 * We have the buffer pinned and we are its only pinner at the moment.
+	 * We hold the buffer header lock and, if the tag is valid, the
+	 * exclusive old partition lock.  Given that, it is safe to clear the
+	 * tag since no other process can inspect it at the moment.
+	 */
+	CLEAR_BUFFERTAG(buf->tag);
+	buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+	UnlockBufHdr(buf, buf_state);
+
+	/* Delete the old tag from the hash table if it was valid. */
+	if (oldFlags & BM_TAG_VALID)
+		BufTableDelete(&oldTag, oldHash);
+
+	if (oldPartitionLock != newPartitionLock)
+	{
+		if (oldPartitionLock != NULL)
+			LWLockRelease(oldPartitionLock);
+		LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+	}
+
+	/*
+	 * Try to make a hashtable entry for the buffer under its new tag. This
+	 * could fail because while we were writing someone else allocated another
+	 * buffer for the same block we want to read in. In that case we will have
+	 * to return our buffer to the free list.
+	 */
+	buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
+
+	if (buf_id >= 0)
+	{
+		/*
+		 * Got a collision. Someone has already done what we were about to do.
+		 * We'll just handle this as if it were found in the buffer pool in
+		 * the first place.
+		 */
+
+		/*
+		 * First, give up the buffer we were planning to use and put it back
+		 * on the free list.
+		 */
+		UnpinBuffer(buf, true);
+		StrategyFreeBuffer(buf);
+
+		/* remaining code should match code at top of routine */
+
+		buf = GetBufferDescriptor(buf_id);
+
+		valid = PinBuffer(buf, strategy);
+
+		/* Can release the mapping lock as soon as we've pinned it */
+		LWLockRelease(newPartitionLock);
+
+		*foundPtr = true;
+
+		if (!valid)
+		{
+			/*
+			 * We can only get here if (a) someone else is still reading in
+			 * the page, or (b) a previous read attempt failed.  We have to
+			 * wait for any active read attempt to finish, and then set up our
+			 * own read attempt if the page is still not BM_VALID.
+			 * StartBufferIO does it all.
+			 */
+			if (StartBufferIO(buf, true))
+			{
+				/*
+				 * If we get here, previous attempts to read the buffer must
+				 * have failed ... but we shall bravely try again.
+				 */
+				*foundPtr = false;
+			}
+		}
+
+		return buf;
+	}
+
+	/*
+	 * Now it is safe to use the victim buffer for the new tag.
 	 *
 	 * Make sure BM_PERMANENT is set for buffers that must be written at every
 	 * checkpoint.  Unlogged buffers only need to be written at shutdown
 	 * checkpoints, except for their "init" forks, which need to be treated
 	 * just like permanent relations.
+	 *
+	 * The usage_count starts out at 1 so that the buffer can survive one
+	 * clock-sweep pass.
+	 *
+	 * We use a direct atomic OR instead of Lock+Unlock since no other
+	 * backend can be interested in the buffer.  But StrategyGetBuffer,
+	 * Flush*Buffers and Drop*Buffers scan all buffers and lock their
+	 * headers to compare tags, and UnlockBufHdr does a raw write to the
+	 * state.  So we have to spin if we find the buffer header locked.
+	 *
+	 * Note that we write the tag unlocked.  That is also safe because
+	 * BM_VALID is always checked whenever the tag is compared.
 	 */
 	buf->tag = newTag;
-	buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
-				   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
-				   BUF_USAGECOUNT_MASK);
 	if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
-		buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
+		new_bits = BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
 	else
-		buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
-
-	UnlockBufHdr(buf, buf_state);
+		new_bits = BM_TAG_VALID | BUF_USAGECOUNT_ONE;
 
-	if (oldPartitionLock != NULL)
+	buf_state = pg_atomic_fetch_or_u32(&buf->state, new_bits);
+	while (unlikely(buf_state & BM_LOCKED))
 	{
-		BufTableDelete(&oldTag, oldHash);
-		if (oldPartitionLock != newPartitionLock)
-			LWLockRelease(oldPartitionLock);
+		WaitBufHdrUnlocked(buf);
+		buf_state = pg_atomic_fetch_or_u32(&buf->state, new_bits);
 	}
 
 	LWLockRelease(newPartitionLock);
-- 
2.35.1

