[PATCH] introduce XLogLockBlockRangeForCleanup()

Started by Abhijit Menon-Sen over 11 years ago · 11 messages
#1 Abhijit Menon-Sen
ams@2ndQuadrant.com
1 attachment(s)

nbtxlog.c:btree_xlog_vacuum() contains the following comment:

* XXX we don't actually need to read the block, we just need to
* confirm it is unpinned. If we had a special call into the
* buffer manager we could optimise this so that if the block is
* not in shared_buffers we confirm it as unpinned.

The attached patch introduces an XLogLockBlockRangeForCleanup() function
that addresses this, as well as a "special call into the buffer manager"
that makes it possible to lock the buffers without reading them.
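
In outline (this just summarizes the attached patch), the new interfaces
and the resulting replay code look like this:

/* xlogutils.h: lock a range of blocks for cleanup without reading them */
extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
                                         BlockNumber startBlkno,
                                         BlockNumber uptoBlkno);

/* bufmgr.h: return a pinned buffer iff the block is already in shared_buffers */
extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
                                       BlockNumber blockNum,
                                       BufferAccessStrategy strategy);

/* nbtxlog.c: the per-block loop in btree_xlog_vacuum() then reduces to */
if (HotStandbyActiveInReplay())
    XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
                                 xlrec->lastBlockVacuumed + 1,
                                 xlrec->block);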

The patch is by Simon Riggs, with some minor edits and testing by me.

I'm adding the patch to the CommitFest, and will follow up with some
performance numbers soon.

-- Abhijit

Attachments:

XLogLockBlockRangeForCleanup.v3.patch (text/x-diff; charset=us-ascii)
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5f9fc49..dc90c02 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 	 * here during crash recovery.
 	 */
 	if (HotStandbyActiveInReplay())
-	{
-		BlockNumber blkno;
-
-		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
-		{
-			/*
-			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
-			 * condition to see all-zero pages.  The original btvacuumpage
-			 * scan would have skipped over all-zero pages, noting them in FSM
-			 * but not bothering to initialize them just yet; so we mustn't
-			 * throw an error here.  (We could skip acquiring the cleanup lock
-			 * if PageIsNew, but it's probably not worth the cycles to test.)
-			 *
-			 * XXX we don't actually need to read the block, we just need to
-			 * confirm it is unpinned. If we had a special call into the
-			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
-			 */
-			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
-											RBM_NORMAL_NO_LOG);
-			if (BufferIsValid(buffer))
-			{
-				LockBufferForCleanup(buffer);
-				UnlockReleaseBuffer(buffer);
-			}
-		}
-	}
+		XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
+									 xlrec->lastBlockVacuumed + 1,
+									 xlrec->block);
 
 	/*
 	 * If we have a full-page image, restore it (using a cleanup lock) and
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b7829ff..302551f 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  *
  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
  * relation is extended with all-zeroes pages up to the given block number.
- *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes.  Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
  */
 Buffer
 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
-		if (mode == RBM_NORMAL_NO_LOG)
-			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
@@ -375,6 +369,73 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	return buffer;
 }
 
+/*
+ * XLogBlockRangeForCleanup is used in Hot Standby mode to emulate the
+ * locking effects imposed by VACUUM for nbtrees.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber startBlkno,
+							 BlockNumber uptoBlkno)
+{
+	BlockNumber blkno;
+	BlockNumber lastblock;
+	BlockNumber	endingBlkno;
+	Buffer		buffer;
+	BufferAccessStrategy bstrategy;
+	SMgrRelation smgr;
+
+	Assert(startBlkno != P_NEW);
+	Assert(uptoBlkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist.  This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted.  (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash.  Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forkNum, true);
+
+	lastblock = smgrnblocks(smgr, forkNum);
+
+	endingBlkno = uptoBlkno;
+	if (lastblock < endingBlkno)
+		endingBlkno = lastblock;
+
+	/*
+	 * We need to use a BufferAccessStrategy because we do not want to
+	 * increment the buffer's usage_count as we read them. However,
+	 * there is no ring buffer associated with this strategy, since we
+	 * never actually read or write the contents, just lock them.
+	 */
+	bstrategy = GetAccessStrategy(BAS_DISCARD);
+
+	for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+	{
+		/*
+		 * All we need to do here is prove that we can lock each block
+		 * with a cleanup lock. It's not an error to see all-zero pages
+		 * here because the original btvacuumpage would not have thrown
+		 * an error either.
+		 *
+		 * We don't actually need to read the block, we just need to
+		 * confirm it is unpinned.
+		 */
+		buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno, bstrategy);
+		if (BufferIsValid(buffer))
+		{
+			LockBufferForCleanup(buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+
+	FreeAccessStrategy(bstrategy);
+}
 
 /*
  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index c070278..2c8d374 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -221,8 +221,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
@@ -283,6 +281,54 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers,
+ * otherwise returns InvalidBuffer. Buffer is pinned, if available.
+ *
+ * Special purpose routine only executed during recovery, so uses a cut-down
+ * execution path rather than complicate ReadBuffer/AllocBuffer.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+						 BlockNumber blockNum, BufferAccessStrategy strategy)
+{
+	BufferTag	bufTag;			/* identity of requested block */
+	uint32		bufHash;		/* hash value for newTag */
+	LWLock	   *bufPartitionLock;		/* buffer partition lock for it */
+	int			buf_id;
+	volatile BufferDesc *buf;
+	SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+	bool		valid = false;
+
+	Assert(InRecovery);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* create a tag so we can lookup the buffer */
+	INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+	/* determine its hash code and partition lock ID */
+	bufHash = BufTableHashCode(&bufTag);
+	bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(bufPartitionLock, LW_SHARED);
+	buf_id = BufTableLookup(&bufTag, bufHash);
+	if (buf_id >= 0)
+	{
+		/* Found it.  Now, try to pin the buffer. */
+		buf = &BufferDescriptors[buf_id];
+
+		valid = PinBuffer(buf, strategy);
+	}
+	LWLockRelease(bufPartitionLock);
+
+	if (valid)
+		return BufferDescriptorGetBuffer(&BufferDescriptors[buf_id]);
+
+	return InvalidBuffer;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
@@ -1076,12 +1122,7 @@ ReleaseAndReadBuffer(Buffer buffer,
  * PinBuffer -- make buffer unavailable for replacement.
  *
  * For the default access strategy, the buffer's usage_count is incremented
- * when we first pin it; for other strategies we just make sure the usage_count
- * isn't zero.  (The idea of the latter is that we don't want synchronized
- * heap scans to inflate the count, but we need it to not be zero to discourage
- * other backends from stealing buffers from our ring.  As long as we cycle
- * through the ring faster than the global clock-sweep cycles, buffers in
- * our ring won't be chosen as victims for replacement by other backends.)
+ * when we first pin it; otherwise we let the Strategy decide what to do.
  *
  * This should be applied only to shared buffers, never local ones.
  *
@@ -1106,10 +1147,7 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 				buf->usage_count++;
 		}
 		else
-		{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
-		}
+			StrategySetBufferUsage(strategy, buf);
 		result = (buf->flags & BM_VALID) != 0;
 		UnlockBufHdr(buf);
 	}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 4befab0..c7daf47 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -76,6 +76,11 @@ typedef struct BufferAccessStrategyData
 	bool		current_was_in_ring;
 
 	/*
+	 * How we handle usage_count for this strategy.
+	 */
+	bool		increment_on_zero;
+
+	/*
 	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
 	 * have not yet selected a buffer for this ring slot.  For allocation
 	 * simplicity this is palloc'd together with the fixed fields of the
@@ -408,6 +413,7 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 {
 	BufferAccessStrategy strategy;
 	int			ring_size;
+	bool		increment_on_zero = true;
 
 	/*
 	 * Select ring size to use.  See buffer/README for rationales.
@@ -430,6 +436,10 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 		case BAS_VACUUM:
 			ring_size = 256 * 1024 / BLCKSZ;
 			break;
+		case BAS_DISCARD:
+			ring_size = 0;	/* We musn't ever call GetBufferFromRing() */
+			increment_on_zero = false;
+			break;
 
 		default:
 			elog(ERROR, "unrecognized buffer access strategy: %d",
@@ -437,8 +447,9 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 			return NULL;		/* keep compiler quiet */
 	}
 
-	/* Make sure ring isn't an undue fraction of shared buffers */
-	ring_size = Min(NBuffers / 8, ring_size);
+	if (ring_size > 0)
+		/* Make sure ring isn't an undue fraction of shared buffers */
+		ring_size = Min(NBuffers / 8, ring_size);
 
 	/* Allocate the object and initialize all elements to zeroes */
 	strategy = (BufferAccessStrategy)
@@ -448,6 +459,7 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 	/* Set fields that don't start out zero */
 	strategy->btype = btype;
 	strategy->ring_size = ring_size;
+	strategy->increment_on_zero = increment_on_zero;
 
 	return strategy;
 }
@@ -478,6 +490,8 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	volatile BufferDesc *buf;
 	Buffer		bufnum;
 
+	Assert(strategy->ring_size > 0);
+
 	/* Advance to next ring slot */
 	if (++strategy->current >= strategy->ring_size)
 		strategy->current = 0;
@@ -563,3 +577,22 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf)
 
 	return true;
 }
+
+/*
+ * Set the usage_count according to the strategy
+ *
+ * We just make sure the usage_count isn't zero. (We don't want
+ * synchronized heap scans to inflate the count, but we need it to not
+ * be zero to discourage other backends from stealing buffers from our
+ * ring. As long as we cycle through the ring faster than the global
+ * clock-sweep cycles, buffers in our ring won't be chosen as victims
+ * for replacement by other backends.)
+ *
+ * Called while holding buffer header spinlock, so make it quick.
+ */
+void
+StrategySetBufferUsage(BufferAccessStrategy strategy, volatile BufferDesc *buf)
+{
+	if (buf->usage_count == 0 && strategy->increment_on_zero)
+		buf->usage_count = 1;
+}
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 58f11d9..1ecbeb3 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 					   BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber startBlkno,
+										 BlockNumber uptoBlkno);
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c019013..720db12 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -190,6 +190,8 @@ extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 extern void StrategyFreeBuffer(volatile BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 volatile BufferDesc *buf);
+extern void StrategySetBufferUsage(BufferAccessStrategy strategy,
+					 volatile BufferDesc *buf);
 
 extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
 extern void StrategyNotifyBgWriter(Latch *bgwriterLatch);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..bdba547 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -29,7 +29,8 @@ typedef enum BufferAccessStrategyType
 	BAS_BULKREAD,				/* Large read-only scan (hint bit updates are
 								 * ok) */
 	BAS_BULKWRITE,				/* Large multi-block write (e.g. COPY IN) */
-	BAS_VACUUM					/* VACUUM */
+	BAS_VACUUM,					/* VACUUM */
+	BAS_DISCARD					/* DISCARD can only be used with GetBuffersWithoutRelcache() */
 } BufferAccessStrategyType;
 
 /* Possible modes for ReadBufferExtended() */
@@ -38,9 +39,7 @@ typedef enum
 	RBM_NORMAL,					/* Normal read */
 	RBM_ZERO,					/* Don't read from disk, caller will
 								 * initialize */
-	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
-	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
-								 * replay; otherwise same as RBM_NORMAL */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */
@@ -170,6 +169,9 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+									   BlockNumber blockNum,
+									   BufferAccessStrategy strategy);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
#2 Amit Khandekar
amit.khandekar@enterprisedb.com
In reply to: Abhijit Menon-Sen (#1)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

On 13 June 2014 14:10, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:

nbtxlog.c:btree_xlog_vacuum() contains the following comment:

* XXX we don't actually need to read the block, we just need to
* confirm it is unpinned. If we had a special call into the
* buffer manager we could optimise this so that if the block is
* not in shared_buffers we confirm it as unpinned.

The attached patch introduces an XLogLockBlockRangeForCleanup() function
that addresses this, as well as a "special call into the buffer manager"
that makes it possible to lock the buffers without reading them.

In GetBufferWithoutRelcache(), I was wondering whether, rather than calling
PinBuffer(), we could do this:
LockBufHdr(buf);
PinBuffer_Locked(buf);
valid = ((buf->flags & BM_VALID) != 0);
then we can avoid having the new buffer access strategy BAS_DISCARD that is
introduced in this patch. And so the code changes in freelist.c would not
be necessary.

Will give further thought to the overall logic in
XLogLockBlockRangeForCleanup().

will follow up with some performance numbers soon.

Yes, that would be nice.

-Amit

#3 Abhijit Menon-Sen
ams@2ndQuadrant.com
In reply to: Amit Khandekar (#2)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

At 2014-07-03 11:15:53 +0530, amit.khandekar@enterprisedb.com wrote:

In GetBufferWithoutRelcache(), I was wondering whether, rather than calling
PinBuffer(), we could do this:
LockBufHdr(buf);
PinBuffer_Locked(buf);
valid = ((buf->flags & BM_VALID) != 0);
then we can avoid having the new buffer access strategy BAS_DISCARD
that is introduced in this patch. And so the code changes in
freelist.c would not be necessary.

Thanks for the suggestion. It sounds sensible at first glance. I'll
think about it a little more, then try it and see.

will follow up with some performance numbers soon.

Yes, that would be nice.

I'm afraid I haven't had a chance to do this yet.

-- Abhijit


#4 Abhijit Menon-Sen
ams@2ndQuadrant.com
In reply to: Abhijit Menon-Sen (#3)
1 attachment(s)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

FYI, I've attached a patch that does what you suggested. I haven't done
anything else (i.e. testing) with it yet.

-- Abhijit

Attachments:

xlog-lock-blockrange-2.diff (text/x-diff; charset=us-ascii)
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5f9fc49..dc90c02 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 	 * here during crash recovery.
 	 */
 	if (HotStandbyActiveInReplay())
-	{
-		BlockNumber blkno;
-
-		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
-		{
-			/*
-			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
-			 * condition to see all-zero pages.  The original btvacuumpage
-			 * scan would have skipped over all-zero pages, noting them in FSM
-			 * but not bothering to initialize them just yet; so we mustn't
-			 * throw an error here.  (We could skip acquiring the cleanup lock
-			 * if PageIsNew, but it's probably not worth the cycles to test.)
-			 *
-			 * XXX we don't actually need to read the block, we just need to
-			 * confirm it is unpinned. If we had a special call into the
-			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
-			 */
-			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
-											RBM_NORMAL_NO_LOG);
-			if (BufferIsValid(buffer))
-			{
-				LockBufferForCleanup(buffer);
-				UnlockReleaseBuffer(buffer);
-			}
-		}
-	}
+		XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
+									 xlrec->lastBlockVacuumed + 1,
+									 xlrec->block);
 
 	/*
 	 * If we have a full-page image, restore it (using a cleanup lock) and
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b7829ff..d84f83a 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  *
  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
  * relation is extended with all-zeroes pages up to the given block number.
- *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes.  Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
  */
 Buffer
 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
-		if (mode == RBM_NORMAL_NO_LOG)
-			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
@@ -375,6 +369,62 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	return buffer;
 }
 
+/*
+ * XLogBlockRangeForCleanup is used in Hot Standby mode to emulate the
+ * locking effects imposed by VACUUM for nbtrees.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber startBlkno,
+							 BlockNumber uptoBlkno)
+{
+	BlockNumber blkno;
+	BlockNumber lastblock;
+	BlockNumber	endingBlkno;
+	Buffer		buffer;
+	SMgrRelation smgr;
+
+	Assert(startBlkno != P_NEW);
+	Assert(uptoBlkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist.  This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted.  (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash.  Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forkNum, true);
+
+	lastblock = smgrnblocks(smgr, forkNum);
+
+	endingBlkno = uptoBlkno;
+	if (lastblock < endingBlkno)
+		endingBlkno = lastblock;
+
+	for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+	{
+		/*
+		 * All we need to do here is prove that we can lock each block
+		 * with a cleanup lock. It's not an error to see all-zero pages
+		 * here because the original btvacuumpage would not have thrown
+		 * an error either.
+		 *
+		 * We don't actually need to read the block, we just need to
+		 * confirm it is unpinned.
+		 */
+		buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno);
+		if (BufferIsValid(buffer))
+		{
+			LockBufferForCleanup(buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+}
 
 /*
  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 07ea665..d3c7eb7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -222,8 +222,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
@@ -284,6 +282,59 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers,
+ * otherwise returns InvalidBuffer. Buffer is pinned, if available.
+ *
+ * Special purpose routine only executed during recovery, so uses a cut-down
+ * execution path rather than complicate ReadBuffer/AllocBuffer.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+						 BlockNumber blockNum)
+{
+	BufferTag	bufTag;			/* identity of requested block */
+	uint32		bufHash;		/* hash value for newTag */
+	LWLock	   *bufPartitionLock;		/* buffer partition lock for it */
+	int			buf_id;
+	volatile BufferDesc *buf;
+	SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+	bool		valid = false;
+
+	Assert(InRecovery);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* create a tag so we can lookup the buffer */
+	INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+	/* determine its hash code and partition lock ID */
+	bufHash = BufTableHashCode(&bufTag);
+	bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(bufPartitionLock, LW_SHARED);
+	buf_id = BufTableLookup(&bufTag, bufHash);
+	if (buf_id >= 0)
+	{
+		/*
+		 * Found it. Now, try to pin the buffer, but leave its usage
+		 * count alone.
+		 */
+		buf = &BufferDescriptors[buf_id];
+
+		LockBufHdr(buf);
+		PinBuffer_Locked(buf);
+		valid = (buf->flags & BM_VALID) != 0;
+	}
+	LWLockRelease(bufPartitionLock);
+
+	if (valid)
+		return BufferDescriptorGetBuffer(&BufferDescriptors[buf_id]);
+
+	return InvalidBuffer;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 58f11d9..1ecbeb3 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 					   BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber startBlkno,
+										 BlockNumber uptoBlkno);
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..b827be0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -38,9 +38,7 @@ typedef enum
 	RBM_NORMAL,					/* Normal read */
 	RBM_ZERO,					/* Don't read from disk, caller will
 								 * initialize */
-	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
-	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
-								 * replay; otherwise same as RBM_NORMAL */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */
@@ -170,6 +168,8 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+									   BlockNumber blockNum);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
#5 Simon Riggs
simon@2ndQuadrant.com
In reply to: Amit Khandekar (#2)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

On 3 July 2014 06:45, Amit Khandekar <amit.khandekar@enterprisedb.com> wrote:

In GetBufferWithoutRelcache(), I was wondering whether, rather than calling
PinBuffer(), we could do this:
LockBufHdr(buf);
PinBuffer_Locked(buf);
valid = ((buf->flags & BM_VALID) != 0);
then we can avoid having the new buffer access strategy BAS_DISCARD that is
introduced in this patch. And so the code changes in freelist.c would not be
necessary.

That looks like a good idea, thanks.

I think we should say this instead, though:

LockBufHdr(buf);
valid = ((buf->flags & BM_VALID) != 0);
if (valid)
    PinBuffer_Locked(buf);
else
    UnlockBufHdr(buf);

since otherwise we would access the buffer flags without holding the
spinlock, and we would leak a pin if the buffer was not valid.
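
For context, the lookup would then look roughly like this (and this is the
shape the next revision of the patch adopts):

/* see if the block is in the buffer pool already */
LWLockAcquire(bufPartitionLock, LW_SHARED);
buf_id = BufTableLookup(&bufTag, bufHash);
if (buf_id >= 0)
{
    buf = &BufferDescriptors[buf_id];

    /* check validity under the buffer header spinlock; pin only if valid */
    LockBufHdr(buf);
    valid = (buf->flags & BM_VALID) != 0;
    if (valid)
        PinBuffer_Locked(buf);   /* releases the header spinlock */
    else
        UnlockBufHdr(buf);
}
LWLockRelease(bufPartitionLock);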

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#6 Amit Khandekar
amit.khandekar@enterprisedb.com
In reply to: Simon Riggs (#5)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

On 3 July 2014 16:59, Simon Riggs <simon@2ndquadrant.com> wrote:

I think we should say this though

LockBufHdr(buf);
valid = ((buf->flags & BM_VALID) != 0);
if (valid)
    PinBuffer_Locked(buf);
else
    UnlockBufHdr(buf);

since otherwise we would access the buffer flags without the spinlock
and we would leak a pin if the buffer was not valid

Ah right. That is essential.

#7 Abhijit Menon-Sen
ams@2ndQuadrant.com
In reply to: Amit Khandekar (#6)
1 attachment(s)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

Updated patch attached, thanks.

Amit: what's your conclusion from the review?

-- Abhijit

Attachments:

xlog-lock-blockrange-3.diff (text/x-diff; charset=us-ascii)
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5f9fc49..dc90c02 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 	 * here during crash recovery.
 	 */
 	if (HotStandbyActiveInReplay())
-	{
-		BlockNumber blkno;
-
-		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
-		{
-			/*
-			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
-			 * condition to see all-zero pages.  The original btvacuumpage
-			 * scan would have skipped over all-zero pages, noting them in FSM
-			 * but not bothering to initialize them just yet; so we mustn't
-			 * throw an error here.  (We could skip acquiring the cleanup lock
-			 * if PageIsNew, but it's probably not worth the cycles to test.)
-			 *
-			 * XXX we don't actually need to read the block, we just need to
-			 * confirm it is unpinned. If we had a special call into the
-			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
-			 */
-			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
-											RBM_NORMAL_NO_LOG);
-			if (BufferIsValid(buffer))
-			{
-				LockBufferForCleanup(buffer);
-				UnlockReleaseBuffer(buffer);
-			}
-		}
-	}
+		XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
+									 xlrec->lastBlockVacuumed + 1,
+									 xlrec->block);
 
 	/*
 	 * If we have a full-page image, restore it (using a cleanup lock) and
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b7829ff..d84f83a 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  *
  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
  * relation is extended with all-zeroes pages up to the given block number.
- *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes.  Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
  */
 Buffer
 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
-		if (mode == RBM_NORMAL_NO_LOG)
-			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
@@ -375,6 +369,62 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	return buffer;
 }
 
+/*
+ * XLogBlockRangeForCleanup is used in Hot Standby mode to emulate the
+ * locking effects imposed by VACUUM for nbtrees.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber startBlkno,
+							 BlockNumber uptoBlkno)
+{
+	BlockNumber blkno;
+	BlockNumber lastblock;
+	BlockNumber	endingBlkno;
+	Buffer		buffer;
+	SMgrRelation smgr;
+
+	Assert(startBlkno != P_NEW);
+	Assert(uptoBlkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist.  This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted.  (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash.  Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forkNum, true);
+
+	lastblock = smgrnblocks(smgr, forkNum);
+
+	endingBlkno = uptoBlkno;
+	if (lastblock < endingBlkno)
+		endingBlkno = lastblock;
+
+	for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+	{
+		/*
+		 * All we need to do here is prove that we can lock each block
+		 * with a cleanup lock. It's not an error to see all-zero pages
+		 * here because the original btvacuumpage would not have thrown
+		 * an error either.
+		 *
+		 * We don't actually need to read the block, we just need to
+		 * confirm it is unpinned.
+		 */
+		buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno);
+		if (BufferIsValid(buffer))
+		{
+			LockBufferForCleanup(buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+}
 
 /*
  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 07ea665..ce5ff70 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -222,8 +222,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
@@ -284,6 +282,62 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers,
+ * otherwise returns InvalidBuffer. Buffer is pinned, if available.
+ *
+ * Special purpose routine only executed during recovery, so uses a cut-down
+ * execution path rather than complicate ReadBuffer/AllocBuffer.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+						 BlockNumber blockNum)
+{
+	BufferTag	bufTag;			/* identity of requested block */
+	uint32		bufHash;		/* hash value for newTag */
+	LWLock	   *bufPartitionLock;		/* buffer partition lock for it */
+	int			buf_id;
+	volatile BufferDesc *buf;
+	SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+	bool		valid = false;
+
+	Assert(InRecovery);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* create a tag so we can lookup the buffer */
+	INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+	/* determine its hash code and partition lock ID */
+	bufHash = BufTableHashCode(&bufTag);
+	bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(bufPartitionLock, LW_SHARED);
+	buf_id = BufTableLookup(&bufTag, bufHash);
+	if (buf_id >= 0)
+	{
+		/*
+		 * Found it. Now, try to pin the buffer if it's valid, but leave
+		 * its usage count alone.
+		 */
+		buf = &BufferDescriptors[buf_id];
+
+		LockBufHdr(buf);
+		valid = (buf->flags & BM_VALID) != 0;
+		if (valid)
+			PinBuffer_Locked(buf);
+		else
+			UnlockBufHdr(buf);
+	}
+	LWLockRelease(bufPartitionLock);
+
+	if (valid)
+		return BufferDescriptorGetBuffer(&BufferDescriptors[buf_id]);
+
+	return InvalidBuffer;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 58f11d9..1ecbeb3 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 					   BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber startBlkno,
+										 BlockNumber uptoBlkno);
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..b827be0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -38,9 +38,7 @@ typedef enum
 	RBM_NORMAL,					/* Normal read */
 	RBM_ZERO,					/* Don't read from disk, caller will
 								 * initialize */
-	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
-	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
-								 * replay; otherwise same as RBM_NORMAL */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */
@@ -170,6 +168,8 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+									   BlockNumber blockNum);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
#8 Amit Khandekar
amit.khandekar@enterprisedb.com
In reply to: Abhijit Menon-Sen (#7)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

On 4 July 2014 19:11, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:

Updated patch attached, thanks.

Amit: what's your conclusion from the review?

Other than some minor comments mentioned below, I don't have any more
issues; it all looks good.

The XLogLockBlockRangeForCleanup() function's header comment has the
function name spelled XLogBlockRangeForCleanup.

In GetBufferWithoutRelcache(), we can call BufferDescriptorGetBuffer(buf)
rather than BufferDescriptorGetBuffer(&BufferDescriptors[buf_id]).


-- Abhijit

#9 Abhijit Menon-Sen
ams@2ndQuadrant.com
In reply to: Amit Khandekar (#8)
1 attachment(s)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

At 2014-07-07 14:02:15 +0530, amit.khandekar@enterprisedb.com wrote:

Other than some minor comments mentioned below, I don't have any more
issues; it all looks good.

Thank you. (Updated patch attached.)

-- Abhijit

Attachments:

xlog-lock-blockrange-4.diff (text/x-diff; charset=us-ascii)
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5f9fc49..dc90c02 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 	 * here during crash recovery.
 	 */
 	if (HotStandbyActiveInReplay())
-	{
-		BlockNumber blkno;
-
-		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
-		{
-			/*
-			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
-			 * condition to see all-zero pages.  The original btvacuumpage
-			 * scan would have skipped over all-zero pages, noting them in FSM
-			 * but not bothering to initialize them just yet; so we mustn't
-			 * throw an error here.  (We could skip acquiring the cleanup lock
-			 * if PageIsNew, but it's probably not worth the cycles to test.)
-			 *
-			 * XXX we don't actually need to read the block, we just need to
-			 * confirm it is unpinned. If we had a special call into the
-			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
-			 */
-			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
-											RBM_NORMAL_NO_LOG);
-			if (BufferIsValid(buffer))
-			{
-				LockBufferForCleanup(buffer);
-				UnlockReleaseBuffer(buffer);
-			}
-		}
-	}
+		XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
+									 xlrec->lastBlockVacuumed + 1,
+									 xlrec->block);
 
 	/*
 	 * If we have a full-page image, restore it (using a cleanup lock) and
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b7829ff..a9aea31 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  *
  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
  * relation is extended with all-zeroes pages up to the given block number.
- *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes.  Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
  */
 Buffer
 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
-		if (mode == RBM_NORMAL_NO_LOG)
-			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
@@ -375,6 +369,62 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 	return buffer;
 }
 
+/*
+ * XLogLockBlockRangeForCleanup is used in Hot Standby mode to emulate
+ * the locking effects imposed by VACUUM for nbtrees.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber startBlkno,
+							 BlockNumber uptoBlkno)
+{
+	BlockNumber blkno;
+	BlockNumber lastblock;
+	BlockNumber	endingBlkno;
+	Buffer		buffer;
+	SMgrRelation smgr;
+
+	Assert(startBlkno != P_NEW);
+	Assert(uptoBlkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist.  This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted.  (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash.  Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forkNum, true);
+
+	lastblock = smgrnblocks(smgr, forkNum);
+
+	endingBlkno = uptoBlkno;
+	if (lastblock < endingBlkno)
+		endingBlkno = lastblock;
+
+	for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+	{
+		/*
+		 * All we need to do here is prove that we can lock each block
+		 * with a cleanup lock. It's not an error to see all-zero pages
+		 * here because the original btvacuumpage would not have thrown
+		 * an error either.
+		 *
+		 * We don't actually need to read the block, we just need to
+		 * confirm it is unpinned.
+		 */
+		buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno);
+		if (BufferIsValid(buffer))
+		{
+			LockBufferForCleanup(buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+}
 
 /*
  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 07ea665..f7976a0 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -222,8 +222,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
@@ -284,6 +282,62 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers,
+ * otherwise returns InvalidBuffer. Buffer is pinned, if available.
+ *
+ * Special purpose routine only executed during recovery, so uses a cut-down
+ * execution path rather than complicate ReadBuffer/AllocBuffer.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+						 BlockNumber blockNum)
+{
+	BufferTag	bufTag;			/* identity of requested block */
+	uint32		bufHash;		/* hash value for newTag */
+	LWLock	   *bufPartitionLock;		/* buffer partition lock for it */
+	int			buf_id;
+	volatile BufferDesc *buf;
+	SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+	bool		valid = false;
+
+	Assert(InRecovery);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* create a tag so we can lookup the buffer */
+	INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+	/* determine its hash code and partition lock ID */
+	bufHash = BufTableHashCode(&bufTag);
+	bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(bufPartitionLock, LW_SHARED);
+	buf_id = BufTableLookup(&bufTag, bufHash);
+	if (buf_id >= 0)
+	{
+		/*
+		 * Found it. Now, try to pin the buffer if it's valid, but leave
+		 * its usage count alone.
+		 */
+		buf = &BufferDescriptors[buf_id];
+
+		LockBufHdr(buf);
+		valid = (buf->flags & BM_VALID) != 0;
+		if (valid)
+			PinBuffer_Locked(buf);
+		else
+			UnlockBufHdr(buf);
+	}
+	LWLockRelease(bufPartitionLock);
+
+	if (valid)
+		return BufferDescriptorGetBuffer(buf);
+
+	return InvalidBuffer;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 58f11d9..1ecbeb3 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
 extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 					   BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber startBlkno,
+										 BlockNumber uptoBlkno);
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..b827be0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -38,9 +38,7 @@ typedef enum
 	RBM_NORMAL,					/* Normal read */
 	RBM_ZERO,					/* Don't read from disk, caller will
 								 * initialize */
-	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
-	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
-								 * replay; otherwise same as RBM_NORMAL */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */
@@ -170,6 +168,8 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+									   BlockNumber blockNum);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
#10 Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Abhijit Menon-Sen (#9)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

On 07/07/2014 11:46 AM, Abhijit Menon-Sen wrote:

At 2014-07-07 14:02:15 +0530, amit.khandekar@enterprisedb.com wrote:

Other than some minor comments mentioned below, I don't have any more
issues; it all looks good.

Thank you. (Updated patch attached.)

I don't think the new GetBufferWithoutRelcache function is in line with
the existing ReadBuffer API. I think it would be better to add a new
ReadBufferMode - RBM_CACHE_ONLY? - that only returns the buffer if it's
already in cache, and InvalidBuffer otherwise.
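
For concreteness, a minimal sketch of that ReadBufferMode addition
(hypothetical name and comment, nothing settled):

typedef enum
{
    RBM_NORMAL,                 /* Normal read */
    RBM_ZERO,                   /* Don't read from disk, caller will
                                 * initialize */
    RBM_ZERO_ON_ERROR,          /* Read, but return an all-zeros page on error */
    RBM_CACHE_ONLY              /* Don't read from disk; return the buffer
                                 * only if it is already cached, otherwise
                                 * InvalidBuffer */
} ReadBufferMode;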

- Heikki


#11 Abhijit Menon-Sen
ams@2ndQuadrant.com
In reply to: Heikki Linnakangas (#10)
1 attachment(s)
Re: [PATCH] introduce XLogLockBlockRangeForCleanup()

At 2014-08-20 11:07:44 +0300, hlinnakangas@vmware.com wrote:

I don't think the new GetBufferWithoutRelcache function is in line
with the existing ReadBuffer API. I think it would be better to add a
new ReadBufferMode - RBM_CACHE_ONLY? - that only returns the buffer if
it's already in cache, and InvalidBuffer otherwise.

Hi Heikki.

I liked the idea of having a new ReadBufferMode of RBM_CACHE_ONLY, but I
wasn't happy with the result when I tried to modify the code to suit. It
didn't make the code easier for me to follow.

(I'll say straight up that it can be done the way you said, and if the
consensus is that it would be an improvement, I'll do it that way. I'm
just not convinced about it, hence this mail.)

For example:

static Buffer
ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                  BlockNumber blockNum, ReadBufferMode mode,
                  BufferAccessStrategy strategy, bool *hit)
{
    volatile BufferDesc *bufHdr;
    Block       bufBlock;
    bool        found;
    bool        isExtend;
    bool        isLocalBuf = SmgrIsTemp(smgr);

    *hit = false;

    /* Make sure we will have room to remember the buffer pin */
    ResourceOwnerEnlargeBuffers(CurrentResourceOwner);

    isExtend = (blockNum == P_NEW);

isLocalBuf and isExtend will both be false for the RBM_CACHE_ONLY case,
which is good for us. But that probably needs to be mentioned in a
comment here.

    TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
                                       smgr->smgr_rnode.node.spcNode,
                                       smgr->smgr_rnode.node.dbNode,
                                       smgr->smgr_rnode.node.relNode,
                                       smgr->smgr_rnode.backend,
                                       isExtend);

We know we're not going to be doing IO in the RBM_CACHE_ONLY case, but
that's probably OK? I don't understand this TRACE_* stuff. But we need
to either skip this, or also do the corresponding TRACE_*_DONE later.

    if (isLocalBuf)
    {
        bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
        if (found)
            pgBufferUsage.local_blks_hit++;
        else
            pgBufferUsage.local_blks_read++;
    }
    else
    {
        /*
         * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
         * not currently in memory.
         */
        bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
                             strategy, &found);
        if (found)
            pgBufferUsage.shared_blks_hit++;
        else
            pgBufferUsage.shared_blks_read++;
    }

The nicest option I could think of here was to copy the shared_buffers
lookup from BufferAlloc into a new BufferAlloc_shared function, and add
a new branch for RBM_CACHE_ONLY. That would look like:

    if (isLocalBuf)
        …
    else if (mode == RBM_CACHE_ONLY)
    {
        /*
         * We check to see if the buffer is already cached, and if it's
         * not, we return InvalidBuffer because we know it's not pinned.
         */
        bufHdr = BufferAlloc_shared(…, &found);
        if (!found)
            return InvalidBuffer;
        LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
        return BufferDescriptorGetBuffer(bufHdr);
    }
    else
    {
        /*
         * lookup the buffer …
    }

…or if we went with the rest of the code and didn't do the lock/return
immediately, we would fall through to this code:

    /* At this point we do NOT hold any locks. */

    /* if it was already in the buffer pool, we're done */
    if (found)
    {
        if (!isExtend)
        {
            /* Just need to update stats before we exit */
            *hit = true;
            VacuumPageHit++;

            if (VacuumCostActive)
                VacuumCostBalance += VacuumCostPageHit;

            TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
                                              smgr->smgr_rnode.node.spcNode,
                                              smgr->smgr_rnode.node.dbNode,
                                              smgr->smgr_rnode.node.relNode,
                                              smgr->smgr_rnode.backend,
                                              isExtend,
                                              found);

            /*
             * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
             * locked on return.
             */
            if (!isLocalBuf)
            {
                if (mode == RBM_ZERO_AND_LOCK)
                    LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
                else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
                    LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
            }

            return BufferDescriptorGetBuffer(bufHdr);
        }

Some of this isn't applicable to RBM_CACHE_ONLY, so we could either skip
it until we reach the RBM_ZERO_AND_CLEANUP_LOCK part, or add a new
branch before the «if (!isExtend)» one.

In summary, we're either adding one new branch that handles everything
related to RBM_CACHE_ONLY and side-stepping some things on the way that
happen to not apply (isLocalBuf, isExtend, TRACE_*), or we're adding a
bunch of tests to ReadBuffer_common.

But splitting BufferAlloc into BufferAlloc_shared and the rest of
BufferAlloc doesn't look especially nice either. If you look at the
shared_buffers lookup from BufferAlloc:

    /* see if the block is in the buffer pool already */
    LWLockAcquire(newPartitionLock, LW_SHARED);
    buf_id = BufTableLookup(&newTag, newHash);
    if (buf_id >= 0)
    {
        /*
         * Found it. Now, pin the buffer so no one can steal it from the
         * buffer pool, and check to see if the correct data has been loaded
         * into the buffer.
         */
        buf = GetBufferDescriptor(buf_id);

        valid = PinBuffer(buf, strategy);

        /* Can release the mapping lock as soon as we've pinned it */
        LWLockRelease(newPartitionLock);

        *foundPtr = TRUE;

        if (!valid)
        {
            /*
             * We can only get here if (a) someone else is still reading in
             * the page, or (b) a previous read attempt failed. We have to
             * wait for any active read attempt to finish, and then set up our
             * own read attempt if the page is still not BM_VALID.
             * StartBufferIO does it all.
             */
            if (StartBufferIO(buf, true))
            {
                /*
                 * If we get here, previous attempts to read the buffer must
                 * have failed ... but we shall bravely try again.
                 */
                *foundPtr = FALSE;
            }
        }

        return buf;
    }

    /*
     * Didn't find it in the buffer pool. We'll have to initialize a new
     * buffer. Remember to unlock the mapping lock while doing the work.
     */
    LWLockRelease(newPartitionLock);

    /* Loop here in case we have to try another victim buffer */
    for (;;)
    {

There are two options here. I split out BufferAlloc_shared and either
make BufferAlloc call it, or not.

If I make BufferAlloc call it, the LWLockAcquire/Release could move to
the _shared function without confusion, but I'll either have to return
the PinBuffer BM_VALID test result via a validPtr, or repeat the test
in the caller before calling StartBufferIO. Both the INIT_BUFFERTAG and
BufTableHashCode() would also have to be in both functions, since they
are used for BufTableInsert() later in BufferAlloc.
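
For illustration, the first option might look roughly like this (the name,
signature, and the validPtr convention are only a sketch; nothing here is
settled):

/*
 * Hypothetical helper: look the block up in shared buffers only.  On a hit,
 * pin it and report via *validPtr whether its contents are BM_VALID; on a
 * miss, return NULL and let the caller decide whether to allocate a victim
 * buffer.
 */
static volatile BufferDesc *
BufferAlloc_shared(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
                   BufferAccessStrategy strategy, bool *foundPtr, bool *validPtr)
{
    BufferTag   newTag;
    uint32      newHash;
    LWLock     *newPartitionLock;
    int         buf_id;
    volatile BufferDesc *buf;

    INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
    newHash = BufTableHashCode(&newTag);
    newPartitionLock = BufMappingPartitionLock(newHash);

    /* see if the block is in the buffer pool already */
    LWLockAcquire(newPartitionLock, LW_SHARED);
    buf_id = BufTableLookup(&newTag, newHash);
    if (buf_id < 0)
    {
        /* not cached */
        LWLockRelease(newPartitionLock);
        *foundPtr = FALSE;
        return NULL;
    }

    /* found it: pin it and report whether the contents are valid */
    buf = GetBufferDescriptor(buf_id);
    *validPtr = PinBuffer(buf, strategy);
    LWLockRelease(newPartitionLock);
    *foundPtr = TRUE;

    return buf;
}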

On the other hand, if I don't make BufferAlloc call the _shared
function, then we end up with a new few-line special-case function plus
either a self-contained new branch in ReadBuffer_common, or a bunch of
RBM_CACHE_ONLY tests. That doesn't really seem an improvement over what
the original patch did, i.e. introduce a single special-case function to
handle a single special-case access.

I'm open to suggestions.

-- Abhijit

P.S. For reference, I've attached the patch that I posted earlier,
rebased to apply to master.

Attachments:

20150623-xlog-lock-blockrange-5.diff (text/x-diff; charset=us-ascii)
commit 4cbbbd242b7ee6333e1d1a09794aa1cd9d39035f
Author: Abhijit Menon-Sen <ams@2ndQuadrant.com>
Date:   Tue Jun 23 13:38:01 2015 +0530

    Introduce XLogLockBlockRangeForCleanup()
    
    When replaying B-Tree vacuum records, we need to confirm that a range of
    blocks is unpinned, but there was no way to ask the buffer manager if a
    block was pinned without having it read in uncached pages.
    
    This patch exposes a new interface that returns pages if they're in the
    cache, or InvalidBuffer, and uses that in a function that locks a range
    of blocks without reading them in from disk.

diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 2f85867..e49a9ba 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -416,33 +416,10 @@ btree_xlog_vacuum(XLogReaderState *record)
 	{
 		RelFileNode thisrnode;
 		BlockNumber thisblkno;
-		BlockNumber blkno;
 
 		XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno);
-
-		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++)
-		{
-			/*
-			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
-			 * condition to see all-zero pages.  The original btvacuumpage
-			 * scan would have skipped over all-zero pages, noting them in FSM
-			 * but not bothering to initialize them just yet; so we mustn't
-			 * throw an error here.  (We could skip acquiring the cleanup lock
-			 * if PageIsNew, but it's probably not worth the cycles to test.)
-			 *
-			 * XXX we don't actually need to read the block, we just need to
-			 * confirm it is unpinned. If we had a special call into the
-			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
-			 */
-			buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
-											RBM_NORMAL_NO_LOG);
-			if (BufferIsValid(buffer))
-			{
-				LockBufferForCleanup(buffer);
-				UnlockReleaseBuffer(buffer);
-			}
-		}
+		XLogLockBlockRangeForCleanup(thisrnode, MAIN_FORKNUM,
+									 xlrec->lastBlockVacuumed + 1, thisblkno);
 	}
 
 	/*
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index fa98b82..4023c7e 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -401,10 +401,6 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
  * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
  * with all-zeroes pages up to the given block number.
  *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes.  Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
- *
  * NB: A redo function should normally not call this directly. To get a page
  * to modify, use XLogReplayBuffer instead. It is important that all pages
  * modified by a WAL record are registered in the WAL records, or they will be
@@ -449,8 +445,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
-		if (mode == RBM_NORMAL_NO_LOG)
-			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
@@ -500,6 +494,62 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 }
 
 /*
+ * XLogLockBlockRangeForCleanup is used in Hot Standby mode to emulate
+ * the locking effects imposed by VACUUM for nbtrees.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+							 BlockNumber startBlkno, BlockNumber uptoBlkno)
+{
+	BlockNumber blkno;
+	BlockNumber lastblock;
+	BlockNumber endingBlkno;
+	Buffer      buffer;
+	SMgrRelation smgr;
+
+	Assert(startBlkno != P_NEW);
+	Assert(uptoBlkno != P_NEW);
+
+	/* Open the relation at smgr level */
+	smgr = smgropen(rnode, InvalidBackendId);
+
+	/*
+	 * Create the target file if it doesn't already exist.  This lets us cope
+	 * if the replay sequence contains writes to a relation that is later
+	 * deleted.  (The original coding of this routine would instead suppress
+	 * the writes, but that seems like it risks losing valuable data if the
+	 * filesystem loses an inode during a crash.  Better to write the data
+	 * until we are actually told to delete the file.)
+	 */
+	smgrcreate(smgr, forkNum, true);
+
+	lastblock = smgrnblocks(smgr, forkNum);
+
+	endingBlkno = uptoBlkno;
+	if (lastblock < endingBlkno)
+		endingBlkno = lastblock;
+
+	for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+	{
+		/*
+		 * All we need to do here is prove that we can lock each block
+		 * with a cleanup lock. It's not an error to see all-zero pages
+		 * here because the original btvacuumpage would not have thrown
+		 * an error either.
+		 *
+		 * We don't actually need to read the block, we just need to
+		 * confirm it is unpinned.
+		 */
+		buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno);
+		if (BufferIsValid(buffer))
+		{
+			LockBufferForCleanup(buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+}
+
+/*
  * Struct actually returned by XLogFakeRelcacheEntry, though the declared
  * return type is Relation.
  */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cc973b5..24c409d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -529,8 +529,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
  * a cleanup-strength lock on the page.
  *
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
@@ -591,6 +589,62 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers,
+ * otherwise returns InvalidBuffer. Buffer is pinned, if available.
+ *
+ * Special purpose routine only executed during recovery, so uses a cut-down
+ * execution path rather than complicate ReadBuffer/AllocBuffer.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+						 BlockNumber blockNum)
+{
+	BufferTag	bufTag;			/* identity of requested block */
+	uint32		bufHash;		/* hash value for bufTag */
+	LWLock	   *bufPartitionLock;		/* buffer partition lock for it */
+	int			buf_id;
+	volatile BufferDesc *buf;
+	SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+	bool		valid = false;
+
+	Assert(InRecovery);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	/* create a tag so we can lookup the buffer */
+	INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+	/* determine its hash code and partition lock ID */
+	bufHash = BufTableHashCode(&bufTag);
+	bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(bufPartitionLock, LW_SHARED);
+	buf_id = BufTableLookup(&bufTag, bufHash);
+	if (buf_id >= 0)
+	{
+		/*
+		 * Found it. Now, try to pin the buffer if it's valid, but leave
+		 * its usage count alone.
+		 */
+		buf = &BufferDescriptors[buf_id];
+
+		LockBufHdr(buf);
+		valid = (buf->flags & BM_VALID) != 0;
+		if (valid)
+			PinBuffer_Locked(buf);
+		else
+			UnlockBufHdr(buf);
+	}
+	LWLockRelease(bufPartitionLock);
+
+	if (valid)
+		return BufferDescriptorGetBuffer(buf);
+
+	return InvalidBuffer;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 8cf51c7..b4cd762 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -43,7 +43,9 @@ extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record,
 
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 					   BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+										 BlockNumber startBlkno,
+										 BlockNumber uptoBlkno);
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ec0a254..b0bba41 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -40,9 +40,7 @@ typedef enum
 								 * initialize. Also locks the page. */
 	RBM_ZERO_AND_CLEANUP_LOCK,	/* Like RBM_ZERO_AND_LOCK, but locks the page
 								 * in "cleanup" mode */
-	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
-	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
-								 * replay; otherwise same as RBM_NORMAL */
+	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */
@@ -153,6 +151,8 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+									   BlockNumber blockNum);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);