Eagerly evict bulkwrite strategy ring

Started by Melanie Plageman · 6 months ago · 13 messages
#1 Melanie Plageman
melanieplageman@gmail.com
1 attachment(s)

Hi,

While discussing the next steps for AIO writes in Postgres, Andres
suggested that a good starting point would be to begin evicting more
than one buffer at a time in some of the buffer access strategies that
perform writes. This would make it easier to later combine these
writes and, eventually, issue them asynchronously.

The attached patch implements this behavior for the BAS_BULKWRITE
strategy. With the patch applied, I observe average performance
improvements of about 15-20% for parallel COPY FROM operations on the
same table.

After some analysis, this improvement appears to be primarily due to
reduced time spent by each backend waiting on the lock to flush WAL.

Since backends now issue more data file writes before each WAL flush
(using a heuristic that avoids eviction when it would require flushing
WAL), there is less interleaving between WAL flushes and data file
writes. With the patch applied, I observe client backends waiting
significantly less on the WALWriteLock. I also see lower f_await times
in iostat, suggesting reduced flush-related waiting at the kernel
level as well.
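
For reference, the kind of monitoring I mean can be done with something
like the following. These are illustrative commands rather than the
exact scripts I used, and f_await requires a reasonably recent sysstat:

# sample wait events while the COPYs are running
psql -c "SELECT wait_event_type, wait_event, count(*)
FROM pg_stat_activity
WHERE backend_type = 'client backend'
GROUP BY 1, 2 ORDER BY 3 DESC;"

# extended per-device statistics, including flush counts and f_await
iostat -dx 5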

It's worth noting that for the serial COPY case (a single COPY FROM),
performance remains essentially unchanged with the patch. The benefit
seems to emerge only when multiple backends are concurrently writing
data and flushing WAL. In fact, the benefit shrinks as fewer parallel
COPY FROM operations are run at a time.

The benchmark I did was simple:

-- make 16 source data files that are >= 1GB each

initdb
pg_ctl start
createdb

sudo fstrim -v /mnt/data

psql -c "drop table foo; create table foo(a int, b int) with
(autovacuum_enabled = off);"

time pgbench \
--no-vacuum \
-c 16 \
-j 16 \
-t 4 \
-f- <<EOF
COPY foo FROM '/mnt/data/foo:client_id.data';
EOF

master -> patch
6.2 minutes -> 5 minutes : ~20% reduction

A 15% improvement can be seen with the same benchmark but with 4 workers.
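
The "make 16 source data files" step isn't spelled out above. One way
to do it is something like the following -- the row count is just a
ballpark to get roughly 1GB of text per file, and server-side COPY TO
needs superuser or pg_write_server_files (pgbench's :client_id starts
at 0):

for i in $(seq 0 15); do
    psql -c "COPY (SELECT g, g FROM generate_series(1, 60000000) g) TO '/mnt/data/foo${i}.data';"
done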

- Melanie

Attachments:

v1-0001-Eager-evict-bulkwrite-strategy-ring.patch (text/x-patch; charset=US-ASCII)
From 9f18bac7869810914e0dfde2fc14060293bcd5b4 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 30 Jun 2025 18:04:33 -0400
Subject: [PATCH v1 1/2] Eager evict bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring buffer in order to reuse
them. By eagerly evicting the buffers in a larger batch, we incur less
interleaving of WAL flushes and data file writes. The effect is mainly
noticeable with multiple parallel COPY FROMs. In this case, client
backends achieve higher write throughput and end up spending less time
waiting on acquiring the lock to flush WAL. Larger flush operations also
mean less time waiting for flush operations at the kernel level as well.

The heuristic for eager eviction is to only evict buffers in the
strategy ring which flushing does not require flushing WAL.
---
 src/backend/storage/buffer/bufmgr.c   | 72 +++++++++++++++++++++++++++
 src/backend/storage/buffer/freelist.c | 53 ++++++++++++++++++++
 src/include/storage/buf_internals.h   |  1 +
 src/include/storage/bufmgr.h          |  2 +
 4 files changed, 128 insertions(+)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 6afdd28dba6..ca7d900e7ec 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2346,6 +2346,75 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr)
 	return true;
 }

+/*
+ * Pin and lock a shared buffer and then evict it. Don't evict the buffer if
+ * doing so would mean we have to flush WAL. We only evict the buffer if doing
+ * so is "cheap", i.e. we're able to lock the buffer and we don't have to
+ * flush WAL. This is appropriate for occasions in which we don't need to
+ * guarantee that the buffer is flushed.
+ *
+ * Returns true if the buffer was flushed, false otherwise.
+ */
+bool
+QuickCleanBuffer(BufferDesc *bufdesc, IOContext io_context)
+{
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	LWLock	   *content_lock;
+	Buffer		buffer;
+
+	buffer = BufferDescriptorGetBuffer(bufdesc);
+	buf_state = LockBufHdr(bufdesc);
+
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * No need to evict the buffer if it isn't dirty. We won't flush buffers
+	 * in use by other backends.
+	 */
+	if (!(buf_state & BM_DIRTY) ||
+		BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+		BUF_STATE_GET_USAGECOUNT(buf_state) > 1)
+	{
+		UnlockBufHdr(bufdesc, buf_state);
+		return false;
+	}
+
+	ReservePrivateRefCountEntry();
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+
+	/* Releases buffer header lock before acquiring content lock */
+	PinBuffer_Locked(bufdesc);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+	{
+		UnpinBuffer(bufdesc);
+		return false;
+	}
+
+	CheckBufferIsPinnedOnce(buffer);
+
+	/* Need buffer header lock to get the LSN */
+	buf_state = LockBufHdr(bufdesc);
+	lsn = BufferGetLSN(bufdesc);
+	UnlockBufHdr(bufdesc, buf_state);
+
+	if (XLogNeedsFlush(lsn))
+	{
+		UnlockReleaseBuffer(buffer);
+		return false;
+	}
+
+	FlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context);
+
+	LWLockRelease(content_lock);
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+
+	UnpinBuffer(bufdesc);
+	return true;
+}
+
 static Buffer
 GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 {
@@ -2451,6 +2520,9 @@ again:

 		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
 									  &buf_hdr->tag);
+
+		if (strategy)
+			EvictStrategyRing(strategy);
 	}


diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 01909be0272..ab38c96e2de 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,31 @@ have_free_buffer(void)
 		return false;
 }

+/*
+ * Some BufferAccessStrategies support eager eviction -- which is evicting
+ * buffers in the ring before they are needed. This can lean to better I/O
+ * patterns than lazily evicting buffers directly before reusing them.
+ */
+bool
+strategy_supports_eager_eviction(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -780,6 +805,34 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 	return NULL;
 }

+/*
+ * Evict all the buffers we can in the strategy ring. This encourages write
+ * batching at the kernel level and leaves a ring full of clean buffers. We'll
+ * skip evicting buffers that would require us to flush WAL.
+ */
+void
+EvictStrategyRing(BufferAccessStrategy strategy)
+{
+	IOContext	io_context;
+
+	if (!strategy_supports_eager_eviction(strategy))
+		return;
+
+	io_context = IOContextForStrategy(strategy);
+
+	for (int i = 0; i < strategy->nbuffers; i++)
+	{
+		BufferDesc *bufdesc;
+		Buffer		bufnum = strategy->buffers[i];
+
+		if (bufnum == InvalidBuffer)
+			continue;
+		bufdesc = GetBufferDescriptor(bufnum - 1);
+		QuickCleanBuffer(bufdesc, io_context);
+	}
+}
+
+
 /*
  * AddBufferToRing -- add a buffer to the buffer ring
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 52a71b138f7..4d3f9552027 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -433,6 +433,7 @@ extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
 										  IOContext io_context, BufferTag *tag);
+extern bool QuickCleanBuffer(BufferDesc *bufdesc, IOContext io_context);

 /* solely to make it easier to write tests */
 extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 41fdc1e7693..a4d122fa3c5 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -331,8 +331,10 @@ extern BufferAccessStrategy GetAccessStrategyWithSize(BufferAccessStrategyType b
 extern int	GetAccessStrategyBufferCount(BufferAccessStrategy strategy);
 extern int	GetAccessStrategyPinLimit(BufferAccessStrategy strategy);

+extern void EvictStrategyRing(BufferAccessStrategy strategy);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);

+extern bool strategy_supports_eager_eviction(BufferAccessStrategy strategy);

 /* inline functions */

--
2.43.0

#2 Kirill Reshke
reshkekirill@gmail.com
In reply to: Melanie Plageman (#1)
Re: Eagerly evict bulkwrite strategy ring

On Wed, 16 Jul 2025 at 04:51, Melanie Plageman
<melanieplageman@gmail.com> wrote:

Hi,

While discussing the next steps for AIO writes in Postgres, Andres
suggested that a good starting point would be to begin evicting more
than one buffer at a time in some of the buffer access strategies that
perform writes. This would make it easier to later combine these
writes and, eventually, issue them asynchronously.

The attached patch implements this behavior for the BAS_BULKWRITE
strategy. With the patch applied, I observe average performance
improvements of about 15-20% for parallel COPY FROM operations on the
same table.

After some analysis, this improvement appears to be primarily due to
reduced time spent by each backend waiting on the lock to flush WAL.

Since backends now issue more data file writes before each WAL flush
(using a heuristic that avoids eviction when it would require flushing
WAL), there is less interleaving between WAL flushes and data file
writes. With the patch applied, I observe client backends waiting
significantly less on the WALWriteLock. I also see lower f_await times
in iostat, suggesting reduced flush-related waiting at the kernel
level as well.

It's worth noting that for the serial COPY case (a single COPY FROM),
performance remains essentially unchanged with the patch. The benefit
seems to emerge only when multiple backends are concurrently writing
data and flushing WAL. In fact, the benefits go down the fewer
parallel COPY FROM operations are performed at a time.

Hi!

1) In EvictStrategyRing we find io context for strategy:

+ io_context = IOContextForStrategy(strategy);

but the caller of this function (GetVictimBuffer) already has one.
Should we reuse its context by passing it as a function parameter to
EvictStrategyRing?

2) The QuickCleanBuffer function has a return value which is never
checked. Should we change the signature to `void QuickCleanBuffer(...)`?

3) In QuickCleanBuffer, we have `buffer =
BufferDescriptorGetBuffer(bufdesc);`, while in EvictStrategyRing we do
the opposite right before the QuickCleanBuffer call. Should we pass
`bufnum` as a parameter?

The benchmark I did was simple:

-- make 16 source data files that are >= 1GB each

initdb
pg_ctl start
createdb

sudo fstrim -v /mnt/data

psql -c "drop table foo; create table foo(a int, b int) with
(autovacuum_enabled = off);"

time pgbench \
--no-vacuum \
-c 16 \
-j 16 \
-t 4 \
-f- <<EOF
COPY foo FROM '/mnt/data/foo:client_id.data';
EOF

master -> patch
6.2 minutes -> 5 minutes : ~20% reduction

A 15% improvement can be noticed with the same benchmark but 4 workers.

- Melanie

I only get 5-10% improvements.

I did this benchmark also. For 16 source data files that are 150MB
each, I get a 5-10% speedup (5% with small shared_buffers and 10% with
big shared_buffers).

--
Best regards,
Kirill Reshke

#3 Melanie Plageman
melanieplageman@gmail.com
In reply to: Kirill Reshke (#2)
1 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

On Mon, Aug 18, 2025 at 2:54 AM Kirill Reshke <reshkekirill@gmail.com> wrote:

Hi!

Thanks for the review!

1) In EvictStrategyRing we find io context for strategy:

+ io_context = IOContextForStrategy(strategy);

but the caller of this function (GetVictimBuffer) already has one.
Should we reuse its context, pass it as function param to
EvictStrategyRing?

Done in attached v2.

2) QuickCleanBuffer function has a return value which is never
checked. Should we change the signature to `void QuickCleanBuffer
(...)` ?

Done in attached v2.

3) In QuickCleanBuffer, we have `buffer =
BufferDescriptorGetBuffer(bufdesc);`, while in EvictStrategyRing we do
the opposite right before the QuickCleanBuffer call. Should we pass
`bufnum` as a parameter?

Done in attached v2.

v2 also changes instances of the word "evict" to simply "flush"
because we don't actually invalidate the contents of the buffer -- we
just flush it so that it can be cheaply evicted when it is needed.

Also, I noticed that v1 had an issue -- it goes through the buffers
from 0 to nbuffers and flushes them instead of starting at the buffer
just after the one we flushed and doing a single sweep. I've fixed
that in the attached. It makes it more likely we'll flush a buffer
we're about to use.

A 15% improvement can be noticed with the same benchmark but 4 workers.

- Melanie

In only get 5-10% improvements

I did this benchmark also. For 16 source data files that are 150MB
each I get 5-10 % speedup (5% with small shared_buffers and 10 % with
big shared_buffers).

Yes, one possible reason is that with smaller source files (150 MB vs
1 GB), the trips around the ring for each COPY are fewer relative to
the setup and one-time costs. And there are fewer sustained periods of
time where the different COPYs are concurrently doing the same
operations.

However, the more likely reason you see a smaller improvement is just
variance from one SSD and CPU to the next. I tried three models of SSD
and two models of CPU (all local [not cloud]) and saw variation in the
improvement -- on some models the improvement was smaller. In all
cases, I had to turn off CPU turbo boosting and idle states to see
consistent results.
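
For reference, by turning off turbo boosting and idle states I mean
something along these lines. The exact knobs depend on the machine;
this assumes Linux with the intel_pstate driver and the cpupower tool:

# disable turbo boost (intel_pstate-specific; other drivers differ)
echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo

# pin the frequency governor and disable deep C-states
sudo cpupower frequency-set -g performance
sudo cpupower idle-set -D 0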

It's possible that the performance improvement from this patch by
itself isn't large enough to merit committing it.

I just finished a draft of a patch set that does write combining for
COPY FROM, using the same heuristic as this patch to decide when to
eagerly flush but then combining multiple buffers into a single IO.
That has larger performance gains, so one could argue to wait and do
that instead.

- Melanie

Attachments:

v2-0001-Eagerly-flush-bulkwrite-strategy-ring.patch (text/x-patch; charset=US-ASCII)
From 3164ce4ebea1f7ad05f252f335f449c382e6fd8f Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 25 Aug 2025 14:13:04 -0400
Subject: [PATCH v2 1/2] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse
them. By eagerly flushing the buffers in a larger batch, we encourage
larger writes at the kernel level and less interleaving of WAL flushes
and data file writes. The effect is mainly noticeable with multiple
parallel COPY FROMs. In this case, client backends achieve higher write
throughput and end up spending less time waiting on acquiring the lock
to flush WAL. Larger flush operations also mean less time waiting for
flush operations at the kernel level as well.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which flushing does not require flushing WAL.

This patch also is a stepping stone toward using AIO for COPY FROM.

Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 71 +++++++++++++++++++++++++++
 src/backend/storage/buffer/freelist.c | 67 +++++++++++++++++++++++++
 src/include/storage/buf_internals.h   |  1 +
 src/include/storage/bufmgr.h          |  2 +
 4 files changed, 141 insertions(+)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index fd7e21d96d3..46fe206ddfd 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2341,6 +2341,74 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr)
 	return true;
 }
 
+/*
+ * Pin and lock a shared buffer and then flush it. Don't flush the buffer if
+ * doing so would mean we have to flush WAL. We only evict the buffer if doing
+ * so is "cheap", i.e. we're able to lock the buffer and we don't have to
+ * flush WAL. This is appropriate for occasions in which we don't need to
+ * guarantee that the buffer is flushed.
+ */
+void
+QuickCleanBuffer(Buffer buffer, IOContext io_context)
+{
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	LWLock	   *content_lock;
+	BufferDesc *bufdesc;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	bufdesc = GetBufferDescriptor(buffer - 1);
+
+	buf_state = LockBufHdr(bufdesc);
+
+	/*
+	 * No need to flush the buffer if it isn't dirty. We won't flush buffers
+	 * in use by other backends.
+	 */
+	if (!(buf_state & BM_DIRTY) ||
+		BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+		BUF_STATE_GET_USAGECOUNT(buf_state) > 1)
+	{
+		UnlockBufHdr(bufdesc, buf_state);
+		return;
+	}
+
+	ReservePrivateRefCountEntry();
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+
+	/* Releases buffer header lock before acquiring content lock */
+	PinBuffer_Locked(bufdesc);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+	{
+		UnpinBuffer(bufdesc);
+		return;
+	}
+
+	CheckBufferIsPinnedOnce(buffer);
+
+	/* Need buffer header lock to get the LSN */
+	buf_state = LockBufHdr(bufdesc);
+	lsn = BufferGetLSN(bufdesc);
+	UnlockBufHdr(bufdesc, buf_state);
+
+	if (XLogNeedsFlush(lsn))
+	{
+		UnlockReleaseBuffer(buffer);
+		return;
+	}
+
+	FlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context);
+
+	LWLockRelease(content_lock);
+	UnpinBuffer(bufdesc);
+
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
+
 static Buffer
 GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 {
@@ -2446,6 +2514,9 @@ again:
 
 		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
 									  &buf_hdr->tag);
+
+		if (strategy && from_ring)
+			EagerFlushStrategyRing(strategy, io_context);
 	}
 
 
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 01909be0272..a4fefb599c7 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -98,6 +98,10 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
 static void AddBufferToRing(BufferAccessStrategy strategy,
 							BufferDesc *buf);
 
+static bool StrategySupportsEagerFlush(BufferAccessStrategy strategy);
+
+static int	StrategySweepNext(BufferAccessStrategy strategy, int current);
+
 /*
  * ClockSweepTick - Helper routine for StrategyGetBuffer()
  *
@@ -180,6 +184,31 @@ have_free_buffer(void)
 		return false;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * data in buffers in the ring before they are needed. This can lead to better
+ * I/O patterns than lazily flushing buffers directly before reusing them.
+ */
+bool
+StrategySupportsEagerFlush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -780,6 +809,44 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 	return NULL;
 }
 
+static int
+StrategySweepNext(BufferAccessStrategy strategy, int current)
+{
+	int			next = current + 1;
+
+	if (next >= strategy->nbuffers)
+		next = 0;
+	return next;
+}
+
+/*
+ * Flush all the buffers we can in the strategy ring. This encourages write
+ * batching at the kernel level and leaves a ring full of clean buffers. We'll
+ * skip flushing buffers that would require us to flush WAL first.
+ */
+void
+EagerFlushStrategyRing(BufferAccessStrategy strategy, IOContext io_context)
+{
+	int			sweep_start,
+				sweep_current;
+
+	if (!StrategySupportsEagerFlush(strategy))
+		return;
+
+	sweep_start = StrategySweepNext(strategy, strategy->current);
+	sweep_current = sweep_start;
+
+	do
+	{
+		Buffer		bufnum = strategy->buffers[sweep_current];
+
+		if (bufnum != InvalidBuffer)
+			QuickCleanBuffer(bufnum, io_context);
+		sweep_current = StrategySweepNext(strategy, sweep_current);
+	} while (sweep_start != sweep_current);
+}
+
+
 /*
  * AddBufferToRing -- add a buffer to the buffer ring
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 52a71b138f7..974f494557a 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -433,6 +433,7 @@ extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
 										  IOContext io_context, BufferTag *tag);
+extern void QuickCleanBuffer(Buffer buffer, IOContext io_context);
 
 /* solely to make it easier to write tests */
 extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 41fdc1e7693..8214ef982aa 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -14,6 +14,7 @@
 #ifndef BUFMGR_H
 #define BUFMGR_H
 
+#include "pgstat.h"
 #include "port/pg_iovec.h"
 #include "storage/aio_types.h"
 #include "storage/block.h"
@@ -331,6 +332,7 @@ extern BufferAccessStrategy GetAccessStrategyWithSize(BufferAccessStrategyType b
 extern int	GetAccessStrategyBufferCount(BufferAccessStrategy strategy);
 extern int	GetAccessStrategyPinLimit(BufferAccessStrategy strategy);
 
+extern void EagerFlushStrategyRing(BufferAccessStrategy strategy, IOContext io_context);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);
 
 
-- 
2.43.0

#4 Melanie Plageman
melanieplageman@gmail.com
In reply to: Melanie Plageman (#3)
3 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

On Mon, Aug 25, 2025 at 3:03 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:

I just finished a draft of a patch set to do write combining for COPY
FROM using this same heuristic as this patch for deciding to eagerly
flush but then combining multiple buffers into a single IO. That has
larger performance gains, so one could argue to wait to do that.

Attached v3 uses the same code structure as in the checkpointer write
combining thread [1]. I need the refactoring of FlushBuffer(), now
included in this set, to do the write combining in checkpointer. Upon
testing, I did notice that write combining seemed to have little
effect on COPY FROM beyond what eagerly flushing the buffers in the
ring already provides. The bottleneck is WAL IO and CPU. While we will
need the COPY FROM write combining to use direct IO, perhaps it is not
worth
committing it just yet. For now, this thread remains limited to
eagerly flushing buffers in the BAS_BULKWRITE strategy ring.

- Melanie

[1]: /messages/by-id/CAAKRu_bcWRvRwZUop_d9vzF9nHAiT+-uPzkJ=S3ShZ1GqeAYOw@mail.gmail.com

Attachments:

v3-0002-Split-FlushBuffer-into-two-parts.patch (text/x-patch; charset=US-ASCII)
From 2c8aafe30fb58516654e7d0cfdbfbb15a6a00498 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:32:24 -0400
Subject: [PATCH v3 2/4] Split FlushBuffer() into two parts

Before adding write combining to write a batch of blocks when flushing
dirty buffers, refactor FlushBuffer() into the preparatory step and
actual buffer flushing step. This provides better symmetry with the
batch flushing code.
---
 src/backend/storage/buffer/bufmgr.c | 103 ++++++++++++++++++++--------
 1 file changed, 76 insertions(+), 27 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f3668051574..27cc418ef61 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -529,8 +529,13 @@ static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
 static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
 static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc, uint32 *buf_state, XLogRecPtr *lsn);
+static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+						  IOContext io_context, XLogRecPtr buffer_lsn);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
+static void CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2414,12 +2419,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			/* OK, do the I/O */
-			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-			LWLockRelease(content_lock);
-
-			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-										  &buf_hdr->tag);
+			CleanVictimBuffer(buf_hdr, &buf_state, from_ring, io_context);
 		}
 
 		if (buf_state & BM_VALID)
@@ -4246,20 +4246,81 @@ static void
 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 			IOContext io_context)
 {
-	XLogRecPtr	recptr;
-	ErrorContextCallback errcallback;
-	instr_time	io_start;
-	Block		bufBlock;
-	char	   *bufToWrite;
 	uint32		buf_state;
+	XLogRecPtr	lsn;
+
+	if (PrepareFlushBuffer(buf, &buf_state, &lsn))
+		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
+}
+
+/*
+ * Prepare to write and write a dirty victim buffer.
+ * bufdesc and buf_state may be modified.
+ */
+static void
+CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+				  bool from_ring, IOContext io_context)
+{
+
+	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
+	LWLock	   *content_lock;
 
+	Assert(*buf_state & BM_DIRTY);
+
+	/* Set up this victim buffer to be flushed */
+	if (!PrepareFlushBuffer(bufdesc, buf_state, &max_lsn))
+		return;
+
+	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	LWLockRelease(content_lock);
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
+
+/*
+ * Prepare the buffer with budesc for writing. buf_state and lsn are output
+ * parameters. Returns true if the buffer acutally needs writing and false
+ * otherwise. All three parameters may be modified.
+ */
+static bool
+PrepareFlushBuffer(BufferDesc *bufdesc, uint32 *buf_state, XLogRecPtr *lsn)
+{
 	/*
 	 * Try to start an I/O operation.  If StartBufferIO returns false, then
 	 * someone else flushed the buffer before we could, so we need not do
 	 * anything.
 	 */
-	if (!StartBufferIO(buf, false, false))
-		return;
+	if (!StartBufferIO(bufdesc, false, false))
+		return false;
+
+	*lsn = InvalidXLogRecPtr;
+	*buf_state = LockBufHdr(bufdesc);
+
+	/*
+	 * Run PageGetLSN while holding header lock, since we don't have the
+	 * buffer locked exclusively in all cases.
+	 */
+	if (*buf_state & BM_PERMANENT)
+		*lsn = BufferGetLSN(bufdesc);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	*buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, *buf_state);
+	return true;
+}
+
+/*
+ * Actually do the write I/O to clean a buffer. buf and reln may be modified.
+ */
+static void
+DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+			  IOContext io_context, XLogRecPtr buffer_lsn)
+{
+	ErrorContextCallback errcallback;
+	instr_time	io_start;
+	Block		bufBlock;
+	char	   *bufToWrite;
 
 	/* Setup error traceback support for ereport() */
 	errcallback.callback = shared_buffer_write_error_callback;
@@ -4277,18 +4338,6 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 										reln->smgr_rlocator.locator.dbOid,
 										reln->smgr_rlocator.locator.relNumber);
 
-	buf_state = LockBufHdr(buf);
-
-	/*
-	 * Run PageGetLSN while holding header lock, since we don't have the
-	 * buffer locked exclusively in all cases.
-	 */
-	recptr = BufferGetLSN(buf);
-
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	buf_state &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf, buf_state);
-
 	/*
 	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
 	 * rule that log updates must hit disk before any of the data-file changes
@@ -4306,8 +4355,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
 	 */
-	if (buf_state & BM_PERMANENT)
-		XLogFlush(recptr);
+	if (!XLogRecPtrIsInvalid(buffer_lsn))
+		XLogFlush(buffer_lsn);
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
-- 
2.43.0

v3-0003-Eagerly-flush-bulkwrite-strategy-ring.patch (text/x-patch; charset=US-ASCII)
From 32f8dbe2c885ce45ef7b217c2693333525fb9b89 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 12:43:24 -0400
Subject: [PATCH v3 3/4] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse
them. By eagerly flushing the buffers in a larger batch, we encourage
larger writes at the kernel level and less interleaving of WAL flushes
and data file writes. The effect is mainly noticeable with multiple
parallel COPY FROMs. In this case, client backends achieve higher write
throughput and end up spending less time waiting on acquiring the lock
to flush WAL. Larger flush operations also mean less time waiting for
flush operations at the kernel level as well.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which flushing does not require flushing WAL.

This patch also is a stepping stone toward AIO writes.

Earlier version
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 174 +++++++++++++++++++++++++-
 src/backend/storage/buffer/freelist.c |  63 ++++++++++
 src/include/storage/buf_internals.h   |   3 +
 3 files changed, 234 insertions(+), 6 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 27cc418ef61..d0f40b6a3ec 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -534,7 +534,13 @@ static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object
 						  IOContext io_context, XLogRecPtr buffer_lsn);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
-static void CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+static BufferDesc *next_strat_buf_to_flush(BufferAccessStrategy strategy, XLogRecPtr *lsn);
+static BufferDesc *PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+												   RelFileLocator *rlocator,
+												   bool skip_pinned,
+												   XLogRecPtr *max_lsn);
+static void CleanVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *bufdesc, uint32 *buf_state,
 							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
@@ -2419,7 +2425,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			CleanVictimBuffer(buf_hdr, &buf_state, from_ring, io_context);
+			CleanVictimBuffer(strategy, buf_hdr, &buf_state, from_ring, io_context);
 		}
 
 		if (buf_state & BM_VALID)
@@ -4253,17 +4259,44 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
 }
 
+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or or NULL when there are no further buffers to
+ * consider writing out.
+ */
+static BufferDesc *
+next_strat_buf_to_flush(BufferAccessStrategy strategy,
+						XLogRecPtr *lsn)
+{
+	Buffer		bufnum;
+	BufferDesc *bufdesc;
+
+	while ((bufnum = StrategySweepNextBuffer(strategy)) != InvalidBuffer)
+	{
+		if ((bufdesc = PrepareOrRejectEagerFlushBuffer(bufnum,
+													   InvalidBlockNumber,
+													   NULL,
+													   true,
+													   lsn)) != NULL)
+			return bufdesc;
+	}
+
+	return NULL;
+}
+
 /*
  * Prepare to write and write a dirty victim buffer.
  * bufdesc and buf_state may be modified.
  */
 static void
-CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+CleanVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *bufdesc, uint32 *buf_state,
 				  bool from_ring, IOContext io_context)
 {
 
 	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
 	LWLock	   *content_lock;
+	bool		first_buffer = true;
 
 	Assert(*buf_state & BM_DIRTY);
 
@@ -4271,11 +4304,140 @@ CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
 	if (!PrepareFlushBuffer(bufdesc, buf_state, &max_lsn))
 		return;
 
-	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	if (from_ring && strategy_supports_eager_flush(strategy))
+	{
+		/* Clean victim buffer and find more to flush opportunistically */
+		StartStrategySweep(strategy);
+		do
+		{
+			DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+			content_lock = BufferDescriptorGetContentLock(bufdesc);
+			LWLockRelease(content_lock);
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &bufdesc->tag);
+			/* We leave the first buffer pinned for the caller */
+			if (!first_buffer)
+				UnpinBuffer(bufdesc);
+			first_buffer = false;
+		} while ((bufdesc = next_strat_buf_to_flush(strategy, &max_lsn)) != NULL);
+	}
+	else
+	{
+		DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+		content_lock = BufferDescriptorGetContentLock(bufdesc);
+		LWLockRelease(content_lock);
+		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+									  &bufdesc->tag);
+	}
+}
+
+/*
+ * Prepare bufdesc for eager flushing.
+ *
+ * Given bufnum, returns the block -- the pointer to the block data in memory
+ * -- which we will opportunistically flush or NULL if this buffer does not
+ *  contain a block that should be flushed.
+ *
+ * require is the BlockNumber required by the caller. Some callers may require
+ * a specific BlockNumber to be in bufnum because they are assembling a
+ * contiguous run of blocks.
+ *
+ * If the caller needs the block to be from a specific relation, rlocator will
+ * be provided.
+ */
+BufferDesc *
+PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+								RelFileLocator *rlocator, bool skip_pinned,
+								XLogRecPtr *max_lsn)
+{
+	BufferDesc *bufdesc;
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	BlockNumber blknum;
+	LWLock	   *content_lock;
+
+	if (!BufferIsValid(bufnum))
+		return NULL;
+
+	Assert(!BufferIsLocal(bufnum));
+
+	bufdesc = GetBufferDescriptor(bufnum - 1);
+
+	/* Block may need to be in a specific relation */
+	if (rlocator &&
+		!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufdesc->tag),
+							  *rlocator))
+		return NULL;
+
+	/* Must do this before taking the buffer header spinlock. */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	buf_state = LockBufHdr(bufdesc);
+
+	if (!(buf_state & BM_DIRTY) || !(buf_state & BM_VALID))
+		goto except_unlock_header;
+
+	/* We don't include used buffers in batches */
+	if (skip_pinned &&
+		(BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+		 BUF_STATE_GET_USAGECOUNT(buf_state) > 1))
+		goto except_unlock_header;
+
+	/* Get page LSN while holding header lock */
+	lsn = BufferGetLSN(bufdesc);
+
+	PinBuffer_Locked(bufdesc);
+	CheckBufferIsPinnedOnce(bufnum);
+
+	blknum = BufferGetBlockNumber(bufnum);
+	Assert(BlockNumberIsValid(blknum));
+
+	/* If we'll have to flush WAL to flush the block, we're done */
+	if (buf_state & BM_PERMANENT && XLogNeedsFlush(lsn))
+		goto except_unpin_buffer;
+
+	/* We only include contiguous blocks in the run */
+	if (BlockNumberIsValid(require) && blknum != require)
+		goto except_unpin_buffer;
+
 	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		goto except_unpin_buffer;
+
+	/*
+	 * Now that we have the content lock, we need to recheck if we need to
+	 * flush WAL.
+	 */
+	buf_state = LockBufHdr(bufdesc);
+	lsn = BufferGetLSN(bufdesc);
+	UnlockBufHdr(bufdesc, buf_state);
+
+	if (buf_state & BM_PERMANENT && XLogNeedsFlush(lsn))
+		goto except_unlock_content;
+
+	/* Try to start an I/O operation. */
+	if (!StartBufferIO(bufdesc, false, true))
+		goto except_unlock_content;
+
+	if (lsn > *max_lsn)
+		*max_lsn = lsn;
+	buf_state = LockBufHdr(bufdesc);
+	buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, buf_state);
+
+	return bufdesc;
+
+except_unlock_content:
 	LWLockRelease(content_lock);
-	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-								  &bufdesc->tag);
+
+except_unpin_buffer:
+	UnpinBuffer(bufdesc);
+	return NULL;
+
+except_unlock_header:
+	UnlockBufHdr(bufdesc, buf_state);
+	return NULL;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index ce95afe2e94..025592778f7 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -75,6 +75,15 @@ typedef struct BufferAccessStrategyData
 	 */
 	int			current;
 
+	/*
+	 * If the strategy supports eager flushing, we may initiate a sweep of the
+	 * strategy ring, flushing all the dirty buffers we can cheaply flush.
+	 * sweep_start and sweep_current keep track of a given sweep so we don't
+	 * loop around the ring infinitely.
+	 */
+	int			sweep_start;
+	int			sweep_current;
+
 	/*
 	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
 	 * have not yet selected a buffer for this ring slot.  For allocation
@@ -156,6 +165,31 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * buffers in the ring before they are needed. This can lean to better I/O
+ * patterns than lazily flushing buffers directly before reusing them.
+ */
+bool
+strategy_supports_eager_flush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -270,6 +304,35 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * Return the next buffer in the ring or InvalidBuffer if the current sweep is
+ * over.
+ */
+Buffer
+StrategySweepNextBuffer(BufferAccessStrategy strategy)
+{
+	strategy->sweep_current++;
+	if (strategy->sweep_current >= strategy->nbuffers)
+		strategy->sweep_current = 0;
+
+	if (strategy->sweep_current == strategy->sweep_start)
+		return InvalidBuffer;
+
+	return strategy->buffers[strategy->sweep_current];
+}
+
+/*
+ * Start a sweep of the strategy ring.
+ */
+void
+StartStrategySweep(BufferAccessStrategy strategy)
+{
+	if (!strategy)
+		return;
+	strategy->sweep_start = strategy->sweep_current = strategy->current;
+}
+
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index b1b81f31419..7963d1189a6 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -437,6 +437,9 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag
 
 
 /* freelist.c */
+extern bool strategy_supports_eager_flush(BufferAccessStrategy strategy);
+extern Buffer StrategySweepNextBuffer(BufferAccessStrategy strategy);
+extern void StartStrategySweep(BufferAccessStrategy strategy);
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state, bool *from_ring);
-- 
2.43.0

v3-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patch (text/x-patch; charset=US-ASCII)
From dae7c82146c2d73729fc12a742d84b660e6db2ad Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:00:44 -0400
Subject: [PATCH v3 1/4] Refactor goto into for loop in GetVictimBuffer()

GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with a
regular for loop flow control.

This commit is only a refactor and does not introduce any new
functionality.

Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_bcWRvRwZUop_d9vzF9nHAiT%2B-uPzkJ%3DS3ShZ1GqeAYOw%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 200 ++++++++++++--------------
 src/backend/storage/buffer/freelist.c |  32 ++++-
 src/include/storage/buf_internals.h   |   5 +
 3 files changed, 124 insertions(+), 113 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index fe470de63f2..f3668051574 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
 #include "utils/timestamp.h"
 
 
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
-
 /* Note: this macro only works on local buffers, not shared ones! */
 #define LocalBufHdrGetBlock(bufHdr) \
 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2344,130 +2340,122 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned with its header
-	 * spinlock still held!
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
-	Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
-
-	/* Pin the buffer and then release the buffer spinlock */
-	PinBuffer_Locked(buf_hdr);
-
-	/*
-	 * We shouldn't have any other pins for this buffer.
-	 */
-	CheckBufferIsPinnedOnce(buf);
-
-	/*
-	 * If the buffer was dirty, try to write it out.  There is a race
-	 * condition here, in that someone might dirty it after we released the
-	 * buffer header lock above, or even while we are writing it out (since
-	 * our share-lock won't prevent hint-bit updates).  We will recheck the
-	 * dirty bit after re-locking the buffer header.
-	 */
-	if (buf_state & BM_DIRTY)
+	/* Select a victim buffer using an optimistic locking scheme. */
+	for (;;)
 	{
-		LWLock	   *content_lock;
+		/*
+		 * Attempt to claim a victim buffer.  The buffer is returned with its
+		 * header spinlock still held!
+		 */
+		buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+		buf = BufferDescriptorGetBuffer(buf_hdr);
 
-		Assert(buf_state & BM_TAG_VALID);
-		Assert(buf_state & BM_VALID);
+		Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
+
+		/* Pin the buffer and then release the buffer spinlock */
+		PinBuffer_Locked(buf_hdr);
 
 		/*
-		 * We need a share-lock on the buffer contents to write it out (else
-		 * we might write invalid data, eg because someone else is compacting
-		 * the page contents while we write).  We must use a conditional lock
-		 * acquisition here to avoid deadlock.  Even though the buffer was not
-		 * pinned (and therefore surely not locked) when StrategyGetBuffer
-		 * returned it, someone else could have pinned and exclusive-locked it
-		 * by the time we get here. If we try to get the lock unconditionally,
-		 * we'd block waiting for them; if they later block waiting for us,
-		 * deadlock ensues. (This has been observed to happen when two
-		 * backends are both trying to split btree index pages, and the second
-		 * one just happens to be trying to split the page the first one got
-		 * from StrategyGetBuffer.)
+		 * We shouldn't have any other pins for this buffer.
 		 */
-		content_lock = BufferDescriptorGetContentLock(buf_hdr);
-		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
-		{
-			/*
-			 * Someone else has locked the buffer, so give it up and loop back
-			 * to get another one.
-			 */
-			UnpinBuffer(buf_hdr);
-			goto again;
-		}
+		CheckBufferIsPinnedOnce(buf);
 
 		/*
-		 * If using a nondefault strategy, and writing the buffer would
-		 * require a WAL flush, let the strategy decide whether to go ahead
-		 * and write/reuse the buffer or to choose another victim.  We need a
-		 * lock to inspect the page LSN, so this can't be done inside
-		 * StrategyGetBuffer.
+		 * If the buffer was dirty, try to write it out.  There is a race
+		 * condition here, in that someone might dirty it after we released
+		 * the buffer header lock above, or even while we are writing it out
+		 * (since our share-lock won't prevent hint-bit updates).  We will
+		 * recheck the dirty bit after re-locking the buffer header.
 		 */
-		if (strategy != NULL)
+		if (buf_state & BM_DIRTY)
 		{
-			XLogRecPtr	lsn;
+			LWLock	   *content_lock;
 
-			/* Read the LSN while holding buffer header lock */
-			buf_state = LockBufHdr(buf_hdr);
-			lsn = BufferGetLSN(buf_hdr);
-			UnlockBufHdr(buf_hdr, buf_state);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(buf_state & BM_VALID);
 
-			if (XLogNeedsFlush(lsn)
-				&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+			/*
+			 * We need a share-lock on the buffer contents to write it out
+			 * (else we might write invalid data, eg because someone else is
+			 * compacting the page contents while we write).  We must use a
+			 * conditional lock acquisition here to avoid deadlock.  Even
+			 * though the buffer was not pinned (and therefore surely not
+			 * locked) when StrategyGetBuffer returned it, someone else could
+			 * have pinned and exclusive-locked it by the time we get here. If
+			 * we try to get the lock unconditionally, we'd block waiting for
+			 * them; if they later block waiting for us, deadlock ensues.
+			 * (This has been observed to happen when two backends are both
+			 * trying to split btree index pages, and the second one just
+			 * happens to be trying to split the page the first one got from
+			 * StrategyGetBuffer.)
+			 */
+			content_lock = BufferDescriptorGetContentLock(buf_hdr);
+			if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+			{
+				/*
+				 * Someone else has locked the buffer, so give it up and loop
+				 * back to get another one.
+				 */
+				UnpinBuffer(buf_hdr);
+				continue;
+			}
+
+			/*
+			 * If using a nondefault strategy, and writing the buffer would
+			 * require a WAL flush, let the strategy decide whether to go
+			 * ahead and write/reuse the buffer or to choose another victim.
+			 * We need the content lock to inspect the page LSN, so this can't
+			 * be done inside StrategyGetBuffer.
+			 */
+			if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
 			{
 				LWLockRelease(content_lock);
 				UnpinBuffer(buf_hdr);
-				goto again;
+				continue;
 			}
-		}
 
-		/* OK, do the I/O */
-		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LWLockRelease(content_lock);
+			/* OK, do the I/O */
+			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+			LWLockRelease(content_lock);
 
-		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-									  &buf_hdr->tag);
-	}
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &buf_hdr->tag);
+		}
 
+		if (buf_state & BM_VALID)
+		{
+			/*
+			 * When a BufferAccessStrategy is in use, blocks evicted from
+			 * shared buffers are counted as IOOP_EVICT in the corresponding
+			 * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+			 * by a strategy in two cases: 1) while initially claiming buffers
+			 * for the strategy ring 2) to replace an existing strategy ring
+			 * buffer because it is pinned or in use and cannot be reused.
+			 *
+			 * Blocks evicted from buffers already in the strategy ring are
+			 * counted as IOOP_REUSE in the corresponding strategy context.
+			 *
+			 * At this point, we can accurately count evictions and reuses,
+			 * because we have successfully claimed the valid buffer.
+			 * Previously, we may have been forced to release the buffer due
+			 * to concurrent pinners or erroring out.
+			 */
+			pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+							   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+		}
 
-	if (buf_state & BM_VALID)
-	{
 		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer. Previously,
-		 * we may have been forced to release the buffer due to concurrent
-		 * pinners or erroring out.
+		 * If the buffer has an entry in the buffer mapping table, delete it.
+		 * This can fail because another backend could have pinned or dirtied
+		 * the buffer. Then loop around and try again.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+		if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+		{
+			UnpinBuffer(buf_hdr);
+			continue;
+		}
 
-	/*
-	 * If the buffer has an entry in the buffer mapping table, delete it. This
-	 * can fail because another backend could have pinned or dirtied the
-	 * buffer.
-	 */
-	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
-	{
-		UnpinBuffer(buf_hdr);
-		goto again;
+		break;
 	}
 
 	/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7d59a92bd1a..ce95afe2e94 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
@@ -716,12 +717,21 @@ IOContextForStrategy(BufferAccessStrategy strategy)
  * be written out and doing so would require flushing WAL too.  This gives us
  * a chance to choose a different victim.
  *
+ * The buffer must pinned and content locked and the buffer header spinlock
+ * must not be held. We must have the content lock to examine the LSN.
+ *
  * Returns true if buffer manager should ask for a new victim, and false
- * if this buffer should be written and re-used.
+ * if this buffer should be written and reused.
  */
 bool
 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+
+	if (!strategy)
+		return false;
+
 	/* We only do this in bulkread mode */
 	if (strategy->btype != BAS_BULKREAD)
 		return false;
@@ -731,11 +741,19 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
 		return false;
 
-	/*
-	 * Remove the dirty buffer from the ring; necessary to prevent infinite
-	 * loop if all ring members are dirty.
-	 */
-	strategy->buffers[strategy->current] = InvalidBuffer;
+	buf_state = LockBufHdr(buf);
+	lsn = BufferGetLSN(buf);
+	UnlockBufHdr(buf, buf_state);
+
+	if (XLogNeedsFlush(lsn))
+	{
+		/*
+		 * Remove the dirty buffer from the ring; necessary to prevent an
+		 * infinite loop if all ring members are dirty.
+		 */
+		strategy->buffers[strategy->current] = InvalidBuffer;
+		return true;
+	}
 
-	return true;
+	return false;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index dfd614f7ca4..b1b81f31419 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -419,6 +419,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 /*
  * Internal buffer management routines
  */
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
+
 /* bufmgr.c */
 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
-- 
2.43.0

#5 Nazir Bilal Yavuz
byavuz81@gmail.com
In reply to: Melanie Plageman (#4)
Re: Eagerly evict bulkwrite strategy ring

Hi,

On Tue, 9 Sept 2025 at 20:37, Melanie Plageman
<melanieplageman@gmail.com> wrote:

On Mon, Aug 25, 2025 at 3:03 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:

I just finished a draft of a patch set to do write combining for COPY
FROM using this same heuristic as this patch for deciding to eagerly
flush but then combining multiple buffers into a single IO. That has
larger performance gains, so one could argue to wait to do that.

Attached v3 uses the same code structure as in the checkpointer write
combining thread [1]. I need the refactoring of FlushBuffer() now
included in this set to do the write combining in checkpointer. Upon
testing, I did notice that write combining seemed to have little
effect on COPY FROM beyond what eagerly flushing the buffers in the
ring has. The bottleneck is WAL IO and CPU. While we will need the
COPY FROM write combining to use direct IO, perhaps it is not worth
committing it just yet. For now, this thread remains limited to
eagerly flushing buffers in the BAS_BULKWRITE strategy ring.

Reviewing patches here instead of the checkpointer write combining thread.

From dae7c82146c2d73729fc12a742d84b660e6db2ad Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:00:44 -0400
Subject: [PATCH v3 1/4] Refactor goto into for loop in GetVictimBuffer()

LGTM.

From 2c8aafe30fb58516654e7d0cfdbfbb15a6a00498 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:32:24 -0400
Subject: [PATCH v3 2/4] Split FlushBuffer() into two parts

Codewise LGTM. Two minor points:

1- The commit message only mentions PrepareFlushBuffer() and
DoFlushBuffer(); I did not expect to see CleanVictimBuffer().
Maybe it is worth mentioning it in the commit message.

2-
+/*
+ * Prepare to write and write a dirty victim buffer.

Although this comment is correct, it is a bit complicated for me. How
about 'Prepare to write and then write a dirty victim buffer'?

From 32f8dbe2c885ce45ef7b217c2693333525fb9b89 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 12:43:24 -0400
Subject: [PATCH v3 3/4] Eagerly flush bulkwrite strategy ring

+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or or NULL when there are no further buffers to

s/or or/ or/.

+ * consider writing out.
+ */
+static BufferDesc *
+next_strat_buf_to_flush(BufferAccessStrategy strategy,
+                        XLogRecPtr *lsn)
+{
+    Buffer        bufnum;
+    BufferDesc *bufdesc;
+
+    while ((bufnum = StrategySweepNextBuffer(strategy)) != InvalidBuffer)
+    {

StrategySweepNextBuffer() returns InvalidBuffer when we reach the
start but can strategy->buffers[strategy->sweep_current] be an
InvalidBuffer? I mean, is the following case possible:
strategy->buffers[strategy->sweep_current] is an InvalidBuffer but
strategy->buffers[strategy->sweep_current + 1] is not. So, we exit
early from the next_strat_buf_to_flush() although there are more
buffers to consider writing out.

+/*
+ * Start a sweep of the strategy ring.
+ */
+void
+StartStrategySweep(BufferAccessStrategy strategy)
+{
+    if (!strategy)
+        return;

I think we will always use this function together with
strategy_supports_eager_flush(), right? If yes, then we do not need to
check if the strategy is NULL. If not, then I think this function
should return boolean to make it explicit that we can not do sweep.

+extern bool strategy_supports_eager_flush(BufferAccessStrategy strategy);

All the functions in the buf_internals.h are pascal case, should we
make this too?

+    /* Must do this before taking the buffer header spinlock. */
+    /* Try to start an I/O operation. */

These one line comments end with a dot.

--
Regards,
Nazir Bilal Yavuz
Microsoft

#6Melanie Plageman
melanieplageman@gmail.com
In reply to: Nazir Bilal Yavuz (#5)
3 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

On Wed, Sep 10, 2025 at 6:14 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:

Thanks so much for the review! I've only included inline comments on
things that still might need discussion. Otherwise, I've incorporated
your suggested changes.

From 2c8aafe30fb58516654e7d0cfdbfbb15a6a00498 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:32:24 -0400
Subject: [PATCH v3 2/4] Split FlushBuffer() into two parts
2-
+/*
+ * Prepare to write and write a dirty victim buffer.

Although this comment is correct, it is a bit complicated for me. How
about 'Prepare to write and then write a dirty victim buffer'?

I've gone with * Prepare and write out a dirty victim buffer.

From 32f8dbe2c885ce45ef7b217c2693333525fb9b89 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 12:43:24 -0400
Subject: [PATCH v3 3/4] Eagerly flush bulkwrite strategy ring

+ * consider writing out.
+ */
+static BufferDesc *
+next_strat_buf_to_flush(BufferAccessStrategy strategy,
+                        XLogRecPtr *lsn)
+{
+    Buffer        bufnum;
+    BufferDesc *bufdesc;
+
+    while ((bufnum = StrategySweepNextBuffer(strategy)) != InvalidBuffer)
+    {

StrategySweepNextBuffer() returns InvalidBuffer when we reach the
start but can strategy->buffers[strategy->sweep_current] be an
InvalidBuffer? I mean, is the following case possible:
strategy->buffers[strategy->sweep_current] is an InvalidBuffer but
strategy->buffers[strategy->sweep_current + 1] is not. So, we exit
early from the next_strat_buf_to_flush() although there are more
buffers to consider writing out.

Yes, good thought. For BAS_BULKWRITE this actually cannot happen:
when a buffer in the ring is not reused, we overwrite its slot in the
buffers array with the shared buffer that replaces it. It can happen
for BAS_BULKREAD, but since we are only concerned with writing, I
think we can terminate the sweep once we hit an InvalidBuffer in the
ring.

While looking at this, I decided it didn't make sense to have sweep
variables in the strategy object, so I've actually changed the way
StrategySweepNextBuffer() works. There was also an issue with the
sweep -- it could run into and past the starting buffer. So, I had to
change it. Take a look at the new method and let me know what you
think.
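
Roughly, the calling pattern is now the following. This is just a
condensed sketch of what is in the attached v4 patch (pinning, header
locking, and error handling elided), not the exact code:

    /*
     * The caller owns the sweep cursor; the sweep ends when it wraps
     * back around to the victim buffer that started it.
     */
    int         cursor = StrategySweepStart(strategy);
    Buffer      sweep_end = BufferDescriptorGetBuffer(bufdesc);
    Buffer      bufnum;

    while ((bufnum = StrategySweepNextBuffer(strategy, &cursor)) != sweep_end)
    {
        /*
         * For BAS_BULKWRITE, InvalidBuffer entries only appear in the
         * unused tail of the ring, so the first one we see means there
         * are no more candidates.
         */
        if (!BufferIsValid(bufnum))
            break;

        /* flush bufnum here if doing so would not require a WAL flush */
    }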

+/*
+ * Start a sweep of the strategy ring.
+ */
+void
+StartStrategySweep(BufferAccessStrategy strategy)
+{
+    if (!strategy)
+        return;

I think we will always use this function together with
strategy_supports_eager_flush(), right? If yes, then we do not need to
check if the strategy is NULL. If not, then I think this function
should return boolean to make it explicit that we can not do sweep.

Yes, I just removed this check.

+extern bool strategy_supports_eager_flush(BufferAccessStrategy strategy);

All the functions in the buf_internals.h are pascal case, should we
make this too?

I thought maybe I'd go a different way because it's sort of
informational and not a function that does stuff -- but anyway you're
right. I've given up and made all my helpers pascal case.

- Melanie

Attachments:

v4-0003-Eagerly-flush-bulkwrite-strategy-ring.patchtext/x-patch; charset=US-ASCII; name=v4-0003-Eagerly-flush-bulkwrite-strategy-ring.patchDownload
From dbcc430c4b92c2a69f84fe9ab3faa94f61eb3d99 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 12:43:24 -0400
Subject: [PATCH v4 3/9] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse them. By
eagerly flushing the buffers in a larger run, we encourage larger writes
at the kernel level and less interleaving of WAL flushes and data file
writes. The effect is mainly noticeable with multiple parallel COPY
FROMs. In this case, client backends achieve higher write throughput and
end up spending less time waiting on acquiring the lock to flush WAL.
Larger flush operations also mean less time waiting for flush operations
at the kernel level.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which do not require a WAL flush.

This patch also is a step toward AIO writes.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Earlier version Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 189 +++++++++++++++++++++++++-
 src/backend/storage/buffer/freelist.c |  48 +++++++
 src/include/storage/buf_internals.h   |   4 +
 3 files changed, 235 insertions(+), 6 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f40f57e5582..c64268f31ae 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -534,7 +534,16 @@ static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object
 						  IOContext io_context, XLogRecPtr buffer_lsn);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
-static void CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+static BufferDesc *NextStratBufToFlush(BufferAccessStrategy strategy,
+									   Buffer sweep_end,
+									   XLogRecPtr *lsn,
+									   int *sweep_cursor);
+static BufferDesc *PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+												   RelFileLocator *rlocator,
+												   bool skip_pinned,
+												   XLogRecPtr *max_lsn);
+static void CleanVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *bufdesc, uint32 *buf_state,
 							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
@@ -2420,7 +2429,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 			}
 
 			/* Content lock is released inside CleanVictimBuffer */
-			CleanVictimBuffer(buf_hdr, &buf_state, from_ring, io_context);
+			CleanVictimBuffer(strategy, buf_hdr, &buf_state, from_ring, io_context);
 		}
 
 		if (buf_state & BM_VALID)
@@ -4254,6 +4263,40 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
 }
 
+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or NULL when there are no further buffers to consider
+ * writing out.
+ */
+static BufferDesc *
+NextStratBufToFlush(BufferAccessStrategy strategy,
+					Buffer sweep_end,
+					XLogRecPtr *lsn, int *sweep_cursor)
+{
+	Buffer		bufnum;
+	BufferDesc *bufdesc;
+
+	while ((bufnum =
+			StrategySweepNextBuffer(strategy, sweep_cursor)) != sweep_end)
+	{
+		/*
+		 * For BAS_BULKWRITE, once you hit an InvalidBuffer, the remaining
+		 * buffers in the ring will be invalid.
+		 */
+		if (!BufferIsValid(bufnum))
+			break;
+
+		if ((bufdesc = PrepareOrRejectEagerFlushBuffer(bufnum,
+													   InvalidBlockNumber,
+													   NULL,
+													   true,
+													   lsn)) != NULL)
+			return bufdesc;
+	}
+
+	return NULL;
+}
+
 /*
  * Prepare and write out a dirty victim buffer.
  *
@@ -4264,12 +4307,14 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
  * bufdesc and buf_state may be modified.
  */
 static void
-CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+CleanVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *bufdesc, uint32 *buf_state,
 				  bool from_ring, IOContext io_context)
 {
 
 	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
 	LWLock	   *content_lock;
+	bool		first_buffer = true;
 
 	Assert(*buf_state & BM_DIRTY);
 
@@ -4277,11 +4322,143 @@ CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
 	if (!PrepareFlushBuffer(bufdesc, buf_state, &max_lsn))
 		return;
 
-	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	if (from_ring && StrategySupportsEagerFlush(strategy))
+	{
+		Buffer		sweep_end = BufferDescriptorGetBuffer(bufdesc);
+		int			cursor = StrategySweepStart(strategy);
+
+		/* Clean victim buffer and find more to flush opportunistically */
+		do
+		{
+			DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+			content_lock = BufferDescriptorGetContentLock(bufdesc);
+			LWLockRelease(content_lock);
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &bufdesc->tag);
+			/* We leave the first buffer pinned for the caller */
+			if (!first_buffer)
+				UnpinBuffer(bufdesc);
+			first_buffer = false;
+		} while ((bufdesc = NextStratBufToFlush(strategy, sweep_end,
+												&max_lsn, &cursor)) != NULL);
+	}
+	else
+	{
+		DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+		content_lock = BufferDescriptorGetContentLock(bufdesc);
+		LWLockRelease(content_lock);
+		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+									  &bufdesc->tag);
+	}
+}
+
+/*
+ * Prepare bufdesc for eager flushing.
+ *
+ * Given bufnum, return the buffer descriptor of the buffer we will
+ * opportunistically flush, or NULL if this buffer does not contain a block
+ * that should be flushed.
+ *
+ * require is the BlockNumber required by the caller. Some callers may require
+ * a specific BlockNumber to be in bufnum because they are assembling a
+ * contiguous run of blocks.
+ *
+ * If the caller needs the block to be from a specific relation, rlocator will
+ * be provided.
+ */
+BufferDesc *
+PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+								RelFileLocator *rlocator, bool skip_pinned,
+								XLogRecPtr *max_lsn)
+{
+	BufferDesc *bufdesc;
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	BlockNumber blknum;
+	LWLock	   *content_lock;
+
+	if (!BufferIsValid(bufnum))
+		return NULL;
+
+	Assert(!BufferIsLocal(bufnum));
+
+	bufdesc = GetBufferDescriptor(bufnum - 1);
+
+	/* Block may need to be in a specific relation */
+	if (rlocator &&
+		!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufdesc->tag),
+							  *rlocator))
+		return NULL;
+
+	/* Must do this before taking the buffer header spinlock */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	buf_state = LockBufHdr(bufdesc);
+
+	if (!(buf_state & BM_DIRTY) || !(buf_state & BM_VALID))
+		goto except_unlock_header;
+
+	/* We don't eagerly flush buffers used by others */
+	if (skip_pinned &&
+		(BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+		 BUF_STATE_GET_USAGECOUNT(buf_state) > 1))
+		goto except_unlock_header;
+
+	/* Get page LSN while holding header lock */
+	lsn = BufferGetLSN(bufdesc);
+
+	PinBuffer_Locked(bufdesc);
+	CheckBufferIsPinnedOnce(bufnum);
+
+	blknum = BufferGetBlockNumber(bufnum);
+	Assert(BlockNumberIsValid(blknum));
+
+	/* If we'll have to flush WAL to flush the block, we're done */
+	if (buf_state & BM_PERMANENT && XLogNeedsFlush(lsn))
+		goto except_unpin_buffer;
+
+	/* We only include contiguous blocks in the run */
+	if (BlockNumberIsValid(require) && blknum != require)
+		goto except_unpin_buffer;
+
 	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		goto except_unpin_buffer;
+
+	/*
+	 * Now that we have the content lock, we need to recheck if we need to
+	 * flush WAL.
+	 */
+	buf_state = LockBufHdr(bufdesc);
+	lsn = BufferGetLSN(bufdesc);
+	UnlockBufHdr(bufdesc, buf_state);
+
+	if (buf_state & BM_PERMANENT && XLogNeedsFlush(lsn))
+		goto except_unlock_content;
+
+	/* Try to start an I/O operation */
+	if (!StartBufferIO(bufdesc, false, true))
+		goto except_unlock_content;
+
+	if (lsn > *max_lsn)
+		*max_lsn = lsn;
+	buf_state = LockBufHdr(bufdesc);
+	buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, buf_state);
+
+	return bufdesc;
+
+except_unlock_content:
 	LWLockRelease(content_lock);
-	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-								  &bufdesc->tag);
+
+except_unpin_buffer:
+	UnpinBuffer(bufdesc);
+	return NULL;
+
+except_unlock_header:
+	UnlockBufHdr(bufdesc, buf_state);
+	return NULL;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 12bb7e2312e..8716109221b 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -156,6 +156,31 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * buffers in the ring before they are needed. This can lead to better I/O
+ * patterns than lazily flushing buffers immediately before reusing them.
+ */
+bool
+StrategySupportsEagerFlush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -270,6 +295,29 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * Return the next buffer in the ring or InvalidBuffer if the current sweep is
+ * over.
+ */
+Buffer
+StrategySweepNextBuffer(BufferAccessStrategy strategy, int *sweep_cursor)
+{
+	if (++(*sweep_cursor) >= strategy->nbuffers)
+		*sweep_cursor = 0;
+
+	return strategy->buffers[*sweep_cursor];
+}
+
+/*
+ * Return the starting buffer of a sweep of the strategy ring
+ */
+int
+StrategySweepStart(BufferAccessStrategy strategy)
+{
+	return strategy->current;
+}
+
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index b1b81f31419..03faf80e441 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -437,6 +437,10 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag
 
 
 /* freelist.c */
+extern bool StrategySupportsEagerFlush(BufferAccessStrategy strategy);
+extern Buffer StrategySweepNextBuffer(BufferAccessStrategy strategy,
+									  int *sweep_cursor);
+extern int	StrategySweepStart(BufferAccessStrategy strategy);
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state, bool *from_ring);
-- 
2.43.0

v4-0002-Split-FlushBuffer-into-two-parts.patchtext/x-patch; charset=US-ASCII; name=v4-0002-Split-FlushBuffer-into-two-parts.patchDownload
From c782753a430c1c967125509c6390d4e710fd2a63 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:32:24 -0400
Subject: [PATCH v4 2/9] Split FlushBuffer() into two parts

Before adding write combining to write a batch of blocks when flushing
dirty buffers, refactor FlushBuffer() into the preparatory step and
actual buffer flushing step. This separation provides symmetry with
future code for batch flushing which necessarily separates these steps,
as it must prepare multiple buffers before flushing them together.

These steps are moved into a new FlushBuffer() helper function,
CleanVictimBuffer(), which will contain both the batch flushing and
single flush code in future commits.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 141 +++++++++++++++++++---------
 1 file changed, 98 insertions(+), 43 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f3668051574..f40f57e5582 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -529,8 +529,13 @@ static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
 static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
 static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc, uint32 *buf_state, XLogRecPtr *lsn);
+static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+						  IOContext io_context, XLogRecPtr buffer_lsn);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
+static void CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2414,12 +2419,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			/* OK, do the I/O */
-			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-			LWLockRelease(content_lock);
-
-			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-										  &buf_hdr->tag);
+			/* Content lock is released inside CleanVictimBuffer */
+			CleanVictimBuffer(buf_hdr, &buf_state, from_ring, io_context);
 		}
 
 		if (buf_state & BM_VALID)
@@ -4246,53 +4247,66 @@ static void
 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 			IOContext io_context)
 {
-	XLogRecPtr	recptr;
-	ErrorContextCallback errcallback;
-	instr_time	io_start;
-	Block		bufBlock;
-	char	   *bufToWrite;
 	uint32		buf_state;
+	XLogRecPtr	lsn;
 
-	/*
-	 * Try to start an I/O operation.  If StartBufferIO returns false, then
-	 * someone else flushed the buffer before we could, so we need not do
-	 * anything.
-	 */
-	if (!StartBufferIO(buf, false, false))
-		return;
+	if (PrepareFlushBuffer(buf, &buf_state, &lsn))
+		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
+}
 
-	/* Setup error traceback support for ereport() */
-	errcallback.callback = shared_buffer_write_error_callback;
-	errcallback.arg = buf;
-	errcallback.previous = error_context_stack;
-	error_context_stack = &errcallback;
+/*
+ * Prepare and write out a dirty victim buffer.
+ *
+ * Buffer must be pinned, the content lock must be held, and the buffer header
+ * spinlock must not be held. The content lock is released and the buffer is
+ * returned pinned but not locked.
+ *
+ * bufdesc and buf_state may be modified.
+ */
+static void
+CleanVictimBuffer(BufferDesc *bufdesc, uint32 *buf_state,
+				  bool from_ring, IOContext io_context)
+{
 
-	/* Find smgr relation for buffer */
-	if (reln == NULL)
-		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
+	LWLock	   *content_lock;
 
-	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
-										buf->tag.blockNum,
-										reln->smgr_rlocator.locator.spcOid,
-										reln->smgr_rlocator.locator.dbOid,
-										reln->smgr_rlocator.locator.relNumber);
+	Assert(*buf_state & BM_DIRTY);
 
-	buf_state = LockBufHdr(buf);
+	/* Set up this victim buffer to be flushed */
+	if (!PrepareFlushBuffer(bufdesc, buf_state, &max_lsn))
+		return;
 
+	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	LWLockRelease(content_lock);
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
+
+/*
+ * Prepare the buffer with bufdesc for writing. buf_state and lsn are output
+ * parameters. Returns true if the buffer actually needs writing and false
+ * otherwise. All three parameters may be modified.
+ */
+static bool
+PrepareFlushBuffer(BufferDesc *bufdesc, uint32 *buf_state, XLogRecPtr *lsn)
+{
 	/*
-	 * Run PageGetLSN while holding header lock, since we don't have the
-	 * buffer locked exclusively in all cases.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
-	recptr = BufferGetLSN(buf);
+	if (!StartBufferIO(bufdesc, false, false))
+		return false;
 
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	buf_state &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf, buf_state);
+	*lsn = InvalidXLogRecPtr;
+	*buf_state = LockBufHdr(bufdesc);
 
 	/*
-	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
-	 * rule that log updates must hit disk before any of the data-file changes
-	 * they describe do.
+	 * Record the buffer's LSN. We will force XLOG flush up to buffer's LSN.
+	 * This implements the basic WAL rule that log updates must hit disk
+	 * before any of the data-file changes they describe do.
 	 *
 	 * However, this rule does not apply to unlogged relations, which will be
 	 * lost after a crash anyway.  Most unlogged relation pages do not bear
@@ -4305,9 +4319,50 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * happen, attempting to flush WAL through that location would fail, with
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
+	 *
+	 * We must hold the buffer header lock when examining the page LSN since
+	 * we don't have the buffer exclusively locked in all cases.
 	 */
-	if (buf_state & BM_PERMANENT)
-		XLogFlush(recptr);
+	if (*buf_state & BM_PERMANENT)
+		*lsn = BufferGetLSN(bufdesc);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	*buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, *buf_state);
+	return true;
+}
+
+/*
+ * Actually do the write I/O to clean a buffer. buf and reln may be modified.
+ */
+static void
+DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+			  IOContext io_context, XLogRecPtr buffer_lsn)
+{
+	ErrorContextCallback errcallback;
+	instr_time	io_start;
+	Block		bufBlock;
+	char	   *bufToWrite;
+
+	/* Setup error traceback support for ereport() */
+	errcallback.callback = shared_buffer_write_error_callback;
+	errcallback.arg = buf;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* Find smgr relation for buffer */
+	if (reln == NULL)
+		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+
+	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
+										buf->tag.blockNum,
+										reln->smgr_rlocator.locator.spcOid,
+										reln->smgr_rlocator.locator.dbOid,
+										reln->smgr_rlocator.locator.relNumber);
+
+	/* Force XLOG flush up to buffer's LSN */
+	if (!XLogRecPtrIsInvalid(buffer_lsn))
+		XLogFlush(buffer_lsn);
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
-- 
2.43.0

v4-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchtext/x-patch; charset=US-ASCII; name=v4-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchDownload
From 6c46b33c7a51990f1d2df0fab7dfea2f88e0861e Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:00:44 -0400
Subject: [PATCH v4 1/9] Refactor goto into for loop in GetVictimBuffer()

GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with
standard for loop flow control.

This commit is only a refactor and does not introduce any new
functionality.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 200 ++++++++++++--------------
 src/backend/storage/buffer/freelist.c |  32 ++++-
 src/include/storage/buf_internals.h   |   5 +
 3 files changed, 124 insertions(+), 113 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index fe470de63f2..f3668051574 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
 #include "utils/timestamp.h"
 
 
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
-
 /* Note: this macro only works on local buffers, not shared ones! */
 #define LocalBufHdrGetBlock(bufHdr) \
 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2344,130 +2340,122 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned with its header
-	 * spinlock still held!
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
-	Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
-
-	/* Pin the buffer and then release the buffer spinlock */
-	PinBuffer_Locked(buf_hdr);
-
-	/*
-	 * We shouldn't have any other pins for this buffer.
-	 */
-	CheckBufferIsPinnedOnce(buf);
-
-	/*
-	 * If the buffer was dirty, try to write it out.  There is a race
-	 * condition here, in that someone might dirty it after we released the
-	 * buffer header lock above, or even while we are writing it out (since
-	 * our share-lock won't prevent hint-bit updates).  We will recheck the
-	 * dirty bit after re-locking the buffer header.
-	 */
-	if (buf_state & BM_DIRTY)
+	/* Select a victim buffer using an optimistic locking scheme. */
+	for (;;)
 	{
-		LWLock	   *content_lock;
+		/*
+		 * Attempt to claim a victim buffer.  The buffer is returned with its
+		 * header spinlock still held!
+		 */
+		buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+		buf = BufferDescriptorGetBuffer(buf_hdr);
 
-		Assert(buf_state & BM_TAG_VALID);
-		Assert(buf_state & BM_VALID);
+		Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
+
+		/* Pin the buffer and then release the buffer spinlock */
+		PinBuffer_Locked(buf_hdr);
 
 		/*
-		 * We need a share-lock on the buffer contents to write it out (else
-		 * we might write invalid data, eg because someone else is compacting
-		 * the page contents while we write).  We must use a conditional lock
-		 * acquisition here to avoid deadlock.  Even though the buffer was not
-		 * pinned (and therefore surely not locked) when StrategyGetBuffer
-		 * returned it, someone else could have pinned and exclusive-locked it
-		 * by the time we get here. If we try to get the lock unconditionally,
-		 * we'd block waiting for them; if they later block waiting for us,
-		 * deadlock ensues. (This has been observed to happen when two
-		 * backends are both trying to split btree index pages, and the second
-		 * one just happens to be trying to split the page the first one got
-		 * from StrategyGetBuffer.)
+		 * We shouldn't have any other pins for this buffer.
 		 */
-		content_lock = BufferDescriptorGetContentLock(buf_hdr);
-		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
-		{
-			/*
-			 * Someone else has locked the buffer, so give it up and loop back
-			 * to get another one.
-			 */
-			UnpinBuffer(buf_hdr);
-			goto again;
-		}
+		CheckBufferIsPinnedOnce(buf);
 
 		/*
-		 * If using a nondefault strategy, and writing the buffer would
-		 * require a WAL flush, let the strategy decide whether to go ahead
-		 * and write/reuse the buffer or to choose another victim.  We need a
-		 * lock to inspect the page LSN, so this can't be done inside
-		 * StrategyGetBuffer.
+		 * If the buffer was dirty, try to write it out.  There is a race
+		 * condition here, in that someone might dirty it after we released
+		 * the buffer header lock above, or even while we are writing it out
+		 * (since our share-lock won't prevent hint-bit updates).  We will
+		 * recheck the dirty bit after re-locking the buffer header.
 		 */
-		if (strategy != NULL)
+		if (buf_state & BM_DIRTY)
 		{
-			XLogRecPtr	lsn;
+			LWLock	   *content_lock;
 
-			/* Read the LSN while holding buffer header lock */
-			buf_state = LockBufHdr(buf_hdr);
-			lsn = BufferGetLSN(buf_hdr);
-			UnlockBufHdr(buf_hdr, buf_state);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(buf_state & BM_VALID);
 
-			if (XLogNeedsFlush(lsn)
-				&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+			/*
+			 * We need a share-lock on the buffer contents to write it out
+			 * (else we might write invalid data, eg because someone else is
+			 * compacting the page contents while we write).  We must use a
+			 * conditional lock acquisition here to avoid deadlock.  Even
+			 * though the buffer was not pinned (and therefore surely not
+			 * locked) when StrategyGetBuffer returned it, someone else could
+			 * have pinned and exclusive-locked it by the time we get here. If
+			 * we try to get the lock unconditionally, we'd block waiting for
+			 * them; if they later block waiting for us, deadlock ensues.
+			 * (This has been observed to happen when two backends are both
+			 * trying to split btree index pages, and the second one just
+			 * happens to be trying to split the page the first one got from
+			 * StrategyGetBuffer.)
+			 */
+			content_lock = BufferDescriptorGetContentLock(buf_hdr);
+			if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+			{
+				/*
+				 * Someone else has locked the buffer, so give it up and loop
+				 * back to get another one.
+				 */
+				UnpinBuffer(buf_hdr);
+				continue;
+			}
+
+			/*
+			 * If using a nondefault strategy, and writing the buffer would
+			 * require a WAL flush, let the strategy decide whether to go
+			 * ahead and write/reuse the buffer or to choose another victim.
+			 * We need the content lock to inspect the page LSN, so this can't
+			 * be done inside StrategyGetBuffer.
+			 */
+			if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
 			{
 				LWLockRelease(content_lock);
 				UnpinBuffer(buf_hdr);
-				goto again;
+				continue;
 			}
-		}
 
-		/* OK, do the I/O */
-		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LWLockRelease(content_lock);
+			/* OK, do the I/O */
+			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+			LWLockRelease(content_lock);
 
-		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-									  &buf_hdr->tag);
-	}
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &buf_hdr->tag);
+		}
 
+		if (buf_state & BM_VALID)
+		{
+			/*
+			 * When a BufferAccessStrategy is in use, blocks evicted from
+			 * shared buffers are counted as IOOP_EVICT in the corresponding
+			 * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+			 * by a strategy in two cases: 1) while initially claiming buffers
+			 * for the strategy ring 2) to replace an existing strategy ring
+			 * buffer because it is pinned or in use and cannot be reused.
+			 *
+			 * Blocks evicted from buffers already in the strategy ring are
+			 * counted as IOOP_REUSE in the corresponding strategy context.
+			 *
+			 * At this point, we can accurately count evictions and reuses,
+			 * because we have successfully claimed the valid buffer.
+			 * Previously, we may have been forced to release the buffer due
+			 * to concurrent pinners or erroring out.
+			 */
+			pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+							   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+		}
 
-	if (buf_state & BM_VALID)
-	{
 		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer. Previously,
-		 * we may have been forced to release the buffer due to concurrent
-		 * pinners or erroring out.
+		 * If the buffer has an entry in the buffer mapping table, delete it.
+		 * This can fail because another backend could have pinned or dirtied
+		 * the buffer. Then loop around and try again.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+		if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+		{
+			UnpinBuffer(buf_hdr);
+			continue;
+		}
 
-	/*
-	 * If the buffer has an entry in the buffer mapping table, delete it. This
-	 * can fail because another backend could have pinned or dirtied the
-	 * buffer.
-	 */
-	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
-	{
-		UnpinBuffer(buf_hdr);
-		goto again;
+		break;
 	}
 
 	/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7d59a92bd1a..12bb7e2312e 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
@@ -716,12 +717,21 @@ IOContextForStrategy(BufferAccessStrategy strategy)
  * be written out and doing so would require flushing WAL too.  This gives us
  * a chance to choose a different victim.
  *
+ * The buffer must be pinned and content locked and the buffer header spinlock
+ * must not be held. We must hold the content lock to examine the LSN.
+ *
  * Returns true if buffer manager should ask for a new victim, and false
- * if this buffer should be written and re-used.
+ * if this buffer should be written and reused.
  */
 bool
 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+
+	if (!strategy)
+		return false;
+
 	/* We only do this in bulkread mode */
 	if (strategy->btype != BAS_BULKREAD)
 		return false;
@@ -731,11 +741,19 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
 		return false;
 
-	/*
-	 * Remove the dirty buffer from the ring; necessary to prevent infinite
-	 * loop if all ring members are dirty.
-	 */
-	strategy->buffers[strategy->current] = InvalidBuffer;
+	buf_state = LockBufHdr(buf);
+	lsn = BufferGetLSN(buf);
+	UnlockBufHdr(buf, buf_state);
+
+	if (XLogNeedsFlush(lsn))
+	{
+		/*
+		 * Remove the dirty buffer from the ring; necessary to prevent an
+		 * infinite loop if all ring members are dirty.
+		 */
+		strategy->buffers[strategy->current] = InvalidBuffer;
+		return true;
+	}
 
-	return true;
+	return false;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index dfd614f7ca4..b1b81f31419 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -419,6 +419,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 /*
  * Internal buffer management routines
  */
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
+
 /* bufmgr.c */
 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
-- 
2.43.0

#7Nazir Bilal Yavuz
byavuz81@gmail.com
In reply to: Melanie Plageman (#6)
Re: Eagerly evict bulkwrite strategy ring

Hi,

On Fri, 12 Sept 2025 at 02:17, Melanie Plageman
<melanieplageman@gmail.com> wrote:

While looking at this, I decided it didn't make sense to have sweep
variables in the strategy object, so I've actually changed the way
StrategySweepNextBuffer() works. There was also an issue with the
sweep -- it could run into and past the starting buffer. So, I had to
change it. Take a look at the new method and let me know what you
think.

It looks good to me. StrategySweepNextBuffer()'s comment is no longer
correct, though.

--
Regards,
Nazir Bilal Yavuz
Microsoft

#8Melanie Plageman
melanieplageman@gmail.com
In reply to: Nazir Bilal Yavuz (#7)
3 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

On Fri, Sep 12, 2025 at 3:16 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:

It looks good to me. StrategySweepNextBuffer()'s comment is no longer
correct, though.

Ah, thanks. I think I fixed that. I also rebased the set and attached v5.

- Melanie

Attachments:

v5-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchtext/x-patch; charset=US-ASCII; name=v5-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchDownload
From ade120635e4d20b36829200f9e6806063ff4eb7a Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:53:48 -0400
Subject: [PATCH v5 1/7] Refactor goto into for loop in GetVictimBuffer()

GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with
standard for loop flow control.

This commit is only a refactor and does not introduce any new
functionality.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 189 ++++++++++++--------------
 src/backend/storage/buffer/freelist.c |  20 ++-
 src/include/storage/buf_internals.h   |   6 +
 3 files changed, 112 insertions(+), 103 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index edf17ce3ea1..453fa16de84 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
 #include "utils/timestamp.h"
 
 
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
-
 /* Note: this macro only works on local buffers, not shared ones! */
 #define LocalBufHdrGetBlock(bufHdr) \
 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2331,125 +2327,116 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned pinned and owned by
-	 * this backend.
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
-	/*
-	 * We shouldn't have any other pins for this buffer.
-	 */
-	CheckBufferIsPinnedOnce(buf);
-
-	/*
-	 * If the buffer was dirty, try to write it out.  There is a race
-	 * condition here, in that someone might dirty it after we released the
-	 * buffer header lock above, or even while we are writing it out (since
-	 * our share-lock won't prevent hint-bit updates).  We will recheck the
-	 * dirty bit after re-locking the buffer header.
-	 */
-	if (buf_state & BM_DIRTY)
+	/* Select a victim buffer using an optimistic locking scheme. */
+	for (;;)
 	{
-		LWLock	   *content_lock;
 
-		Assert(buf_state & BM_TAG_VALID);
-		Assert(buf_state & BM_VALID);
+		/* Attempt to claim a victim buffer. Buffer is returned pinned. */
+		buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+		buf = BufferDescriptorGetBuffer(buf_hdr);
 
 		/*
-		 * We need a share-lock on the buffer contents to write it out (else
-		 * we might write invalid data, eg because someone else is compacting
-		 * the page contents while we write).  We must use a conditional lock
-		 * acquisition here to avoid deadlock.  Even though the buffer was not
-		 * pinned (and therefore surely not locked) when StrategyGetBuffer
-		 * returned it, someone else could have pinned and exclusive-locked it
-		 * by the time we get here. If we try to get the lock unconditionally,
-		 * we'd block waiting for them; if they later block waiting for us,
-		 * deadlock ensues. (This has been observed to happen when two
-		 * backends are both trying to split btree index pages, and the second
-		 * one just happens to be trying to split the page the first one got
-		 * from StrategyGetBuffer.)
+		 * We shouldn't have any other pins for this buffer.
 		 */
-		content_lock = BufferDescriptorGetContentLock(buf_hdr);
-		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
-		{
-			/*
-			 * Someone else has locked the buffer, so give it up and loop back
-			 * to get another one.
-			 */
-			UnpinBuffer(buf_hdr);
-			goto again;
-		}
+		CheckBufferIsPinnedOnce(buf);
 
 		/*
-		 * If using a nondefault strategy, and writing the buffer would
-		 * require a WAL flush, let the strategy decide whether to go ahead
-		 * and write/reuse the buffer or to choose another victim.  We need a
-		 * lock to inspect the page LSN, so this can't be done inside
-		 * StrategyGetBuffer.
+		 * If the buffer was dirty, try to write it out.  There is a race
+		 * condition here, in that someone might dirty it after we released
+		 * the buffer header lock above, or even while we are writing it out
+		 * (since our share-lock won't prevent hint-bit updates).  We will
+		 * recheck the dirty bit after re-locking the buffer header.
 		 */
-		if (strategy != NULL)
+		if (buf_state & BM_DIRTY)
 		{
-			XLogRecPtr	lsn;
+			LWLock	   *content_lock;
 
-			/* Read the LSN while holding buffer header lock */
-			buf_state = LockBufHdr(buf_hdr);
-			lsn = BufferGetLSN(buf_hdr);
-			UnlockBufHdr(buf_hdr, buf_state);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(buf_state & BM_VALID);
 
-			if (XLogNeedsFlush(lsn)
-				&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+			/*
+			 * We need a share-lock on the buffer contents to write it out
+			 * (else we might write invalid data, eg because someone else is
+			 * compacting the page contents while we write).  We must use a
+			 * conditional lock acquisition here to avoid deadlock.  Even
+			 * though the buffer was not pinned (and therefore surely not
+			 * locked) when StrategyGetBuffer returned it, someone else could
+			 * have pinned and exclusive-locked it by the time we get here. If
+			 * we try to get the lock unconditionally, we'd block waiting for
+			 * them; if they later block waiting for us, deadlock ensues.
+			 * (This has been observed to happen when two backends are both
+			 * trying to split btree index pages, and the second one just
+			 * happens to be trying to split the page the first one got from
+			 * StrategyGetBuffer.)
+			 */
+			content_lock = BufferDescriptorGetContentLock(buf_hdr);
+			if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+			{
+				/*
+				 * Someone else has locked the buffer, so give it up and loop
+				 * back to get another one.
+				 */
+				UnpinBuffer(buf_hdr);
+				continue;
+			}
+
+			/*
+			 * If using a nondefault strategy, and writing the buffer would
+			 * require a WAL flush, let the strategy decide whether to go
+			 * ahead and write/reuse the buffer or to choose another victim.
+			 * We need the content lock to inspect the page LSN, so this can't
+			 * be done inside StrategyGetBuffer.
+			 */
+			if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
 			{
 				LWLockRelease(content_lock);
 				UnpinBuffer(buf_hdr);
-				goto again;
+				continue;
 			}
-		}
 
-		/* OK, do the I/O */
-		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LWLockRelease(content_lock);
+			/* OK, do the I/O */
+			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+			LWLockRelease(content_lock);
 
-		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-									  &buf_hdr->tag);
-	}
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &buf_hdr->tag);
+		}
 
 
-	if (buf_state & BM_VALID)
-	{
+		if (buf_state & BM_VALID)
+		{
+			/*
+			 * When a BufferAccessStrategy is in use, blocks evicted from
+			 * shared buffers are counted as IOOP_EVICT in the corresponding
+			 * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+			 * by a strategy in two cases: 1) while initially claiming buffers
+			 * for the strategy ring 2) to replace an existing strategy ring
+			 * buffer because it is pinned or in use and cannot be reused.
+			 *
+			 * Blocks evicted from buffers already in the strategy ring are
+			 * counted as IOOP_REUSE in the corresponding strategy context.
+			 *
+			 * At this point, we can accurately count evictions and reuses,
+			 * because we have successfully claimed the valid buffer.
+			 * Previously, we may have been forced to release the buffer due
+			 * to concurrent pinners or erroring out.
+			 */
+			pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+							   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+		}
+
 		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer. Previously,
-		 * we may have been forced to release the buffer due to concurrent
-		 * pinners or erroring out.
+		 * If the buffer has an entry in the buffer mapping table, delete it.
+		 * This can fail because another backend could have pinned or dirtied
+		 * the buffer.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+		if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+		{
+			UnpinBuffer(buf_hdr);
+			continue;
+		}
 
-	/*
-	 * If the buffer has an entry in the buffer mapping table, delete it. This
-	 * can fail because another backend could have pinned or dirtied the
-	 * buffer.
-	 */
-	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
-	{
-		UnpinBuffer(buf_hdr);
-		goto again;
+		break;
 	}
 
 	/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7fe34d3ef4c..b76be264eb5 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
@@ -779,12 +780,21 @@ IOContextForStrategy(BufferAccessStrategy strategy)
  * be written out and doing so would require flushing WAL too.  This gives us
  * a chance to choose a different victim.
  *
+ * The buffer must be pinned and content locked and the buffer header spinlock
+ * must not be held. We must hold the content lock to examine the LSN.
+ *
  * Returns true if buffer manager should ask for a new victim, and false
  * if this buffer should be written and re-used.
  */
 bool
 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+
+	if (!strategy)
+		return false;
+
 	/* We only do this in bulkread mode */
 	if (strategy->btype != BAS_BULKREAD)
 		return false;
@@ -794,11 +804,17 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
 		return false;
 
+	buf_state = LockBufHdr(buf);
+	lsn = BufferGetLSN(buf);
+	UnlockBufHdr(buf, buf_state);
+
+	if (!XLogNeedsFlush(lsn))
+		return false;
+
 	/*
-	 * Remove the dirty buffer from the ring; necessary to prevent infinite
+	 * Remove the dirty buffer from the ring; necessary to prevent an infinite
 	 * loop if all ring members are dirty.
 	 */
 	strategy->buffers[strategy->current] = InvalidBuffer;
-
 	return true;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c1206a46aba..7e258383048 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -421,6 +421,12 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 /*
  * Internal buffer management routines
  */
+
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
+
 /* bufmgr.c */
 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
-- 
2.43.0

v5-0002-Split-FlushBuffer-into-two-parts.patchtext/x-patch; charset=US-ASCII; name=v5-0002-Split-FlushBuffer-into-two-parts.patchDownload
From 6dfe3036229f25b9708109c460ce9c1425111650 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:54:19 -0400
Subject: [PATCH v5 2/7] Split FlushBuffer() into two parts

Before adding write combining to write a batch of blocks when flushing
dirty buffers, refactor FlushBuffer() into the preparatory step and
actual buffer flushing step. This separation provides symmetry with
future code for batch flushing which necessarily separates these steps,
as it must prepare multiple buffers before flushing them together.

These steps are moved into a new FlushBuffer() helper function,
CleanVictimBuffer(), which will contain both the batch flushing and
single flush code in future commits.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 143 +++++++++++++++++++---------
 1 file changed, 100 insertions(+), 43 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 453fa16de84..769138a5373 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -533,6 +533,12 @@ static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
+						  IOObject io_object, IOContext io_context,
+						  XLogRecPtr buffer_lsn);
+static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
+							  IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2394,12 +2400,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			/* OK, do the I/O */
-			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-			LWLockRelease(content_lock);
-
-			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-										  &buf_hdr->tag);
+			/* Content lock is released inside CleanVictimBuffer */
+			CleanVictimBuffer(buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4276,53 +4278,67 @@ static void
 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 			IOContext io_context)
 {
-	XLogRecPtr	recptr;
-	ErrorContextCallback errcallback;
-	instr_time	io_start;
-	Block		bufBlock;
-	char	   *bufToWrite;
-	uint32		buf_state;
+	XLogRecPtr	lsn;
 
-	/*
-	 * Try to start an I/O operation.  If StartBufferIO returns false, then
-	 * someone else flushed the buffer before we could, so we need not do
-	 * anything.
-	 */
-	if (!StartBufferIO(buf, false, false))
-		return;
+	if (PrepareFlushBuffer(buf, &lsn))
+		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
+}
 
-	/* Setup error traceback support for ereport() */
-	errcallback.callback = shared_buffer_write_error_callback;
-	errcallback.arg = buf;
-	errcallback.previous = error_context_stack;
-	error_context_stack = &errcallback;
+/*
+ * Prepare and write out a dirty victim buffer.
+ *
+ * Buffer must be pinned, the content lock must be held, and the buffer header
+ * spinlock must not be held. The content lock is released and the buffer is
+ * returned pinned but not locked.
+ *
+ * bufdesc may be modified.
+ */
+static void
+CleanVictimBuffer(BufferDesc *bufdesc,
+				  bool from_ring, IOContext io_context)
+{
 
-	/* Find smgr relation for buffer */
-	if (reln == NULL)
-		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
+	LWLock	   *content_lock;
 
-	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
-										buf->tag.blockNum,
-										reln->smgr_rlocator.locator.spcOid,
-										reln->smgr_rlocator.locator.dbOid,
-										reln->smgr_rlocator.locator.relNumber);
+	Assert(pg_atomic_read_u32(&bufdesc->state) & BM_DIRTY);
 
-	buf_state = LockBufHdr(buf);
+	/* Set up this victim buffer to be flushed */
+	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
+		return;
+
+	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	LWLockRelease(content_lock);
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
+
+/*
+ * Prepare the buffer with bufdesc for writing. Returns true if the buffer
+ * actually needs writing and false otherwise. lsn returns the buffer's LSN if
+ * the relation is logged.
+ */
+static bool
+PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state;
 
 	/*
-	 * Run PageGetLSN while holding header lock, since we don't have the
-	 * buffer locked exclusively in all cases.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
-	recptr = BufferGetLSN(buf);
+	if (!StartBufferIO(bufdesc, false, false))
+		return false;
 
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	buf_state &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf, buf_state);
+	*lsn = InvalidXLogRecPtr;
+	buf_state = LockBufHdr(bufdesc);
 
 	/*
-	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
-	 * rule that log updates must hit disk before any of the data-file changes
-	 * they describe do.
+	 * Record the buffer's LSN. We will force XLOG flush up to buffer's LSN.
+	 * This implements the basic WAL rule that log updates must hit disk
+	 * before any of the data-file changes they describe do.
 	 *
 	 * However, this rule does not apply to unlogged relations, which will be
 	 * lost after a crash anyway.  Most unlogged relation pages do not bear
@@ -4335,9 +4351,50 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * happen, attempting to flush WAL through that location would fail, with
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
+	 *
+	 * We must hold the buffer header lock when examining the page LSN, since
+	 * we don't have the buffer exclusively locked in all cases.
 	 */
 	if (buf_state & BM_PERMANENT)
-		XLogFlush(recptr);
+		*lsn = BufferGetLSN(bufdesc);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, buf_state);
+	return true;
+}
+
+/*
+ * Actually do the write I/O to clean a buffer. buf and reln may be modified.
+ */
+static void
+DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+			  IOContext io_context, XLogRecPtr buffer_lsn)
+{
+	ErrorContextCallback errcallback;
+	instr_time	io_start;
+	Block		bufBlock;
+	char	   *bufToWrite;
+
+	/* Setup error traceback support for ereport() */
+	errcallback.callback = shared_buffer_write_error_callback;
+	errcallback.arg = buf;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* Find smgr relation for buffer */
+	if (reln == NULL)
+		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+
+	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
+										buf->tag.blockNum,
+										reln->smgr_rlocator.locator.spcOid,
+										reln->smgr_rlocator.locator.dbOid,
+										reln->smgr_rlocator.locator.relNumber);
+
+	/* Force XLOG flush up to buffer's LSN */
+	if (!XLogRecPtrIsInvalid(buffer_lsn))
+		XLogFlush(buffer_lsn);
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
-- 
2.43.0
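
As a reading aid, here is a toy, self-contained C sketch of the prepare/write split the patch above applies to FlushBuffer(). Every name in it (ToyBuffer, ToyLsn, toy_wal_flush, and so on) is invented for the example; it only models the division of labor between PrepareFlushBuffer() and DoFlushBuffer() under the WAL-before-data rule, not the real bufmgr code or its locking.

/*
 * Toy sketch only: invented types and names, not PostgreSQL code.
 */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long ToyLsn;

typedef struct ToyBuffer
{
	int			id;
	bool		dirty;
	bool		permanent;		/* belongs to a logged relation? */
	ToyLsn		page_lsn;
} ToyBuffer;

static ToyLsn toy_wal_flushed_up_to = 100;

/* Stand-in for a WAL flush up to the given LSN. */
static void
toy_wal_flush(ToyLsn lsn)
{
	if (lsn > toy_wal_flushed_up_to)
		toy_wal_flushed_up_to = lsn;
}

/*
 * "Prepare" step: decide whether the buffer needs writing at all and capture
 * the LSN that must be durable before the data write may happen.
 */
static bool
toy_prepare_flush(ToyBuffer *buf, ToyLsn *lsn)
{
	if (!buf->dirty)
		return false;
	*lsn = buf->permanent ? buf->page_lsn : 0;
	return true;
}

/*
 * "Do" step: honor the WAL-before-data rule, then write the block.
 */
static void
toy_do_flush(ToyBuffer *buf, ToyLsn lsn)
{
	if (lsn != 0)
		toy_wal_flush(lsn);
	printf("wrote buffer %d (WAL durable through %llu)\n",
		   buf->id, toy_wal_flushed_up_to);
	buf->dirty = false;
}

int
main(void)
{
	ToyBuffer	buf = {.id = 1, .dirty = true, .permanent = true, .page_lsn = 140};
	ToyLsn		lsn;

	if (toy_prepare_flush(&buf, &lsn))
		toy_do_flush(&buf, lsn);
	return 0;
}

The value of the split is that the prepare step can be run over several buffers to collect their LSNs before any data is written, which is what the batch-flushing patches later in the thread build on.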

v5-0003-Eagerly-flush-bulkwrite-strategy-ring.patchtext/x-patch; charset=US-ASCII; name=v5-0003-Eagerly-flush-bulkwrite-strategy-ring.patchDownload
From 6caf8e9cd9d2d442695f8c90eea6cf82971c3e2b Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 13:15:43 -0400
Subject: [PATCH v5 3/7] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse them. By
eagerly flushing the buffers in a larger run, we encourage larger writes
at the kernel level and less interleaving of WAL flushes and data file
writes. The effect is mainly noticeable with multiple parallel COPY
FROMs. In this case, client backends achieve higher write throughput and
end up spending less time waiting on acquiring the lock to flush WAL.
Larger flush operations also mean less time waiting for flush operations
at the kernel level.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which do not require a WAL flush.

This patch is also a step toward AIO writes.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Earlier version Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 238 +++++++++++++++++++++++++-
 src/backend/storage/buffer/freelist.c |  47 +++++
 src/include/storage/buf_internals.h   |   4 +
 3 files changed, 281 insertions(+), 8 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 769138a5373..7a553b8cdd2 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -531,14 +531,25 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
 static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
+
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
-static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *NextStratBufToFlush(BufferAccessStrategy strategy,
+									   Buffer sweep_end,
+									   XLogRecPtr *lsn, int *sweep_cursor);
+
+static bool BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+												   RelFileLocator *rlocator, bool skip_pinned,
+												   XLogRecPtr *max_lsn);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc,
+							   XLogRecPtr *lsn);
 static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						  IOObject io_object, IOContext io_context,
 						  XLogRecPtr buffer_lsn);
-static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
-							  IOContext io_context);
+static void CleanVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *bufdesc,
+							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2401,7 +2412,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 			}
 
 			/* Content lock is released inside CleanVictimBuffer */
-			CleanVictimBuffer(buf_hdr, from_ring, io_context);
+			CleanVictimBuffer(strategy, buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4284,6 +4295,61 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
 }
 
+/*
+ * Returns true if the buffer needs WAL flushed before it can be written out.
+ * Caller must not already hold the buffer header spinlock.
+ */
+static bool
+BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state = LockBufHdr(bufdesc);
+
+	*lsn = BufferGetLSN(bufdesc);
+
+	UnlockBufHdr(bufdesc, buf_state);
+
+	/*
+	 * See buffer flushing code for more details on why we condition this on
+	 * the relation being logged.
+	 */
+	return buf_state & BM_PERMANENT && XLogNeedsFlush(*lsn);
+}
+
+
+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or NULL when there are no further buffers to consider
+ * writing out.
+ */
+static BufferDesc *
+NextStratBufToFlush(BufferAccessStrategy strategy,
+					Buffer sweep_end,
+					XLogRecPtr *lsn, int *sweep_cursor)
+{
+	Buffer		bufnum;
+	BufferDesc *bufdesc;
+
+	while ((bufnum =
+			StrategySweepNextBuffer(strategy, sweep_cursor)) != sweep_end)
+	{
+		/*
+		 * For BAS_BULKWRITE, once you hit an InvalidBuffer, the remaining
+		 * buffers in the ring will be invalid.
+		 */
+		if (!BufferIsValid(bufnum))
+			break;
+
+		if ((bufdesc = PrepareOrRejectEagerFlushBuffer(bufnum,
+													   InvalidBlockNumber,
+													   NULL,
+													   true,
+													   lsn)) != NULL)
+			return bufdesc;
+	}
+
+	return NULL;
+}
+
 /*
  * Prepare and write out a dirty victim buffer.
  *
@@ -4294,12 +4360,14 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
  * bufdesc may be modified.
  */
 static void
-CleanVictimBuffer(BufferDesc *bufdesc,
+CleanVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *bufdesc,
 				  bool from_ring, IOContext io_context)
 {
 
 	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
 	LWLock	   *content_lock;
+	bool		first_buffer = true;
 
 	Assert(pg_atomic_read_u32(&bufdesc->state) & BM_DIRTY);
 
@@ -4307,11 +4375,165 @@ CleanVictimBuffer(BufferDesc *bufdesc,
 	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
 		return;
 
-	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	if (from_ring && StrategySupportsEagerFlush(strategy))
+	{
+		Buffer		sweep_end = BufferDescriptorGetBuffer(bufdesc);
+		int			cursor = StrategySweepStart(strategy);
+
+		/* Clean victim buffer and find more to flush opportunistically */
+		do
+		{
+			DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+			content_lock = BufferDescriptorGetContentLock(bufdesc);
+			LWLockRelease(content_lock);
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &bufdesc->tag);
+			/* We leave the first buffer pinned for the caller */
+			if (!first_buffer)
+				UnpinBuffer(bufdesc);
+			first_buffer = false;
+		} while ((bufdesc = NextStratBufToFlush(strategy, sweep_end,
+												&max_lsn, &cursor)) != NULL);
+	}
+	else
+	{
+		DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+		content_lock = BufferDescriptorGetContentLock(bufdesc);
+		LWLockRelease(content_lock);
+		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+									  &bufdesc->tag);
+	}
+}
+
+/*
+ * Prepare bufdesc for eager flushing.
+ *
+ * Given bufnum, return the buffer descriptor of the buffer to eagerly flush,
+ * pinned and locked, or NULL if this buffer does not contain a block that
+ * should be flushed.
+ *
+ * require is the BlockNumber required by the caller. Some callers may require
+ * a specific BlockNumber to be in bufnum because they are assembling a
+ * contiguous run of blocks.
+ *
+ * If the caller needs the block to be from a specific relation, rlocator will
+ * be provided.
+ */
+static BufferDesc *
+PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+								RelFileLocator *rlocator, bool skip_pinned,
+								XLogRecPtr *max_lsn)
+{
+	BufferDesc *bufdesc;
+	uint32		old_buf_state;
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	BlockNumber blknum;
+	LWLock	   *content_lock;
+
+	if (!BufferIsValid(bufnum))
+		return NULL;
+
+	Assert(!BufferIsLocal(bufnum));
+
+	bufdesc = GetBufferDescriptor(bufnum - 1);
+
+	/* Block may need to be in a specific relation */
+	if (rlocator &&
+		!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufdesc->tag),
+							  *rlocator))
+		return NULL;
+
+	/*
+	 * Ensure that there's a free refcount entry and resource owner slot for
+	 * the pin before pinning the buffer. While this may leak a refcount and
+	 * slot if we return without a buffer, we should use that slot the next
+	 * time we try to reserve a spot.
+	 */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	/*
+	 * Check whether the buffer can be used and pin it if so. Do this using a
+	 * CAS loop, to avoid having to lock the buffer header. We have to lock
+	 * the buffer header later if we succeed in pinning the buffer here, but
+	 * avoiding locking the buffer header if the buffer is in use is worth it.
+	 */
+	old_buf_state = pg_atomic_read_u32(&bufdesc->state);
+
+	for (;;)
+	{
+		buf_state = old_buf_state;
+
+		if (!(buf_state & BM_DIRTY) || !(buf_state & BM_VALID))
+			return NULL;
+
+		/* We don't eagerly flush buffers used by others */
+		if (skip_pinned &&
+			(BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+			 BUF_STATE_GET_USAGECOUNT(buf_state) > 1))
+			return NULL;
+
+		if (unlikely(buf_state & BM_LOCKED))
+		{
+			old_buf_state = WaitBufHdrUnlocked(bufdesc);
+			continue;
+		}
+
+		/* pin the buffer if the CAS succeeds */
+		buf_state += BUF_REFCOUNT_ONE;
+
+		if (pg_atomic_compare_exchange_u32(&bufdesc->state, &old_buf_state,
+										   buf_state))
+		{
+			TrackNewBufferPin(BufferDescriptorGetBuffer(bufdesc));
+			break;
+		}
+	}
+
+	CheckBufferIsPinnedOnce(bufnum);
+
+	blknum = BufferGetBlockNumber(bufnum);
+	Assert(BlockNumberIsValid(blknum));
+
+	/* We only include contiguous blocks in the run */
+	if (BlockNumberIsValid(require) && blknum != require)
+		goto except_unpin_buffer;
+
+	/* Don't eagerly flush buffers requiring WAL flush */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
 	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		goto except_unpin_buffer;
+
+	/*
+	 * Now that we have the content lock, we need to recheck if we need to
+	 * flush WAL.
+	 */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
+	/* Try to start an I/O operation */
+	if (!StartBufferIO(bufdesc, false, true))
+		goto except_unlock_content;
+
+	if (lsn > *max_lsn)
+		*max_lsn = lsn;
+
+	buf_state = LockBufHdr(bufdesc);
+	buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, buf_state);
+
+	return bufdesc;
+
+except_unlock_content:
 	LWLockRelease(content_lock);
-	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-								  &bufdesc->tag);
+
+except_unpin_buffer:
+	UnpinBuffer(bufdesc);
+	return NULL;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index b76be264eb5..71fed9d6ebd 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -156,6 +156,31 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * buffers in the ring before they are needed. This can lead to better I/O
+ * patterns than lazily flushing buffers immediately before reusing them.
+ */
+bool
+StrategySupportsEagerFlush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -307,6 +332,28 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * Return the next buffer in the ring
+ */
+Buffer
+StrategySweepNextBuffer(BufferAccessStrategy strategy, int *sweep_cursor)
+{
+	if (++(*sweep_cursor) >= strategy->nbuffers)
+		*sweep_cursor = 0;
+
+	return strategy->buffers[*sweep_cursor];
+}
+
+/*
+ * Return the starting cursor position for a sweep of the strategy ring
+ */
+int
+StrategySweepStart(BufferAccessStrategy strategy)
+{
+	return strategy->current;
+}
+
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 7e258383048..b48dece3e63 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -442,6 +442,10 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag
 
 
 /* freelist.c */
+extern bool StrategySupportsEagerFlush(BufferAccessStrategy strategy);
+extern Buffer StrategySweepNextBuffer(BufferAccessStrategy strategy,
+									  int *sweep_cursor);
+extern int	StrategySweepStart(BufferAccessStrategy strategy);
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state, bool *from_ring);
-- 
2.43.0
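
The heuristic added above can be modelled in a few lines: sweep the ring once, write out dirty, unused members whose page LSN is already covered by durable WAL, and skip anything that would force a WAL flush. The sketch below is a toy with invented types and helpers (ToyBuf, toy_wal_needs_flush); it mirrors only the control flow of NextStratBufToFlush() and PrepareOrRejectEagerFlushBuffer(), not their pinning or locking.

/*
 * Toy sketch only: invented types and names, not PostgreSQL code.
 */
#include <stdbool.h>
#include <stdio.h>

#define RING_SIZE 8

typedef unsigned long long ToyLsn;

typedef struct ToyBuf
{
	bool		dirty;
	int			refcount;
	ToyLsn		page_lsn;
} ToyBuf;

static ToyLsn toy_wal_flushed_up_to = 100;

/* Stand-in for XLogNeedsFlush(): writing this page would require a WAL flush. */
static bool
toy_wal_needs_flush(ToyLsn lsn)
{
	return lsn > toy_wal_flushed_up_to;
}

/*
 * Starting just after the victim slot, walk the ring once and write out every
 * buffer that is dirty, unused, and already covered by durable WAL.
 */
static int
toy_eager_flush_sweep(ToyBuf ring[], int victim)
{
	int			flushed = 0;

	for (int i = 1; i < RING_SIZE; i++)
	{
		ToyBuf	   *buf = &ring[(victim + i) % RING_SIZE];

		if (!buf->dirty || buf->refcount > 0)
			continue;			/* nothing to do, or in use by others */
		if (toy_wal_needs_flush(buf->page_lsn))
			continue;			/* would force a WAL flush: skip */
		buf->dirty = false;		/* "write" it */
		flushed++;
	}
	return flushed;
}

int
main(void)
{
	ToyBuf		ring[RING_SIZE] = {
		{true, 0, 90}, {true, 0, 120}, {true, 1, 80}, {false, 0, 0},
		{true, 0, 95}, {true, 0, 101}, {false, 0, 0}, {true, 0, 50},
	};

	printf("eagerly flushed %d ring buffers\n", toy_eager_flush_sweep(ring, 0));
	return 0;
}

In the real patch the same skip conditions show up as the BM_DIRTY/refcount checks and the BufferNeedsWALFlush() test.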

#9Melanie Plageman
melanieplageman@gmail.com
In reply to: Melanie Plageman (#8)
3 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

On Wed, Oct 15, 2025 at 5:00 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:

Ah, thanks. I think I fixed that. I also rebased the set and attached v5.

I found an incorrect assert in CleanVictimBuffer() that was tripping
in CI. Attached v6 is updated with it removed.

- Melanie

Attachments:

v6-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchtext/x-patch; charset=US-ASCII; name=v6-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchDownload
From 85896f09d712adbc2b3e4ce1aa569170ae5c4c45 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:53:48 -0400
Subject: [PATCH v6 1/7] Refactor goto into for loop in GetVictimBuffer()

GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with
standard for loop flow control.

This commit is only a refactor and does not introduce any new
functionality.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 189 ++++++++++++--------------
 src/backend/storage/buffer/freelist.c |  20 ++-
 src/include/storage/buf_internals.h   |   6 +
 3 files changed, 112 insertions(+), 103 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e8544acb784..1fadeddf505 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
 #include "utils/timestamp.h"
 
 
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
-
 /* Note: this macro only works on local buffers, not shared ones! */
 #define LocalBufHdrGetBlock(bufHdr) \
 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2325,125 +2321,116 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned pinned and owned by
-	 * this backend.
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
-	/*
-	 * We shouldn't have any other pins for this buffer.
-	 */
-	CheckBufferIsPinnedOnce(buf);
-
-	/*
-	 * If the buffer was dirty, try to write it out.  There is a race
-	 * condition here, in that someone might dirty it after we released the
-	 * buffer header lock above, or even while we are writing it out (since
-	 * our share-lock won't prevent hint-bit updates).  We will recheck the
-	 * dirty bit after re-locking the buffer header.
-	 */
-	if (buf_state & BM_DIRTY)
+	/* Select a victim buffer using an optimistic locking scheme. */
+	for (;;)
 	{
-		LWLock	   *content_lock;
 
-		Assert(buf_state & BM_TAG_VALID);
-		Assert(buf_state & BM_VALID);
+		/* Attempt to claim a victim buffer. Buffer is returned pinned. */
+		buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+		buf = BufferDescriptorGetBuffer(buf_hdr);
 
 		/*
-		 * We need a share-lock on the buffer contents to write it out (else
-		 * we might write invalid data, eg because someone else is compacting
-		 * the page contents while we write).  We must use a conditional lock
-		 * acquisition here to avoid deadlock.  Even though the buffer was not
-		 * pinned (and therefore surely not locked) when StrategyGetBuffer
-		 * returned it, someone else could have pinned and exclusive-locked it
-		 * by the time we get here. If we try to get the lock unconditionally,
-		 * we'd block waiting for them; if they later block waiting for us,
-		 * deadlock ensues. (This has been observed to happen when two
-		 * backends are both trying to split btree index pages, and the second
-		 * one just happens to be trying to split the page the first one got
-		 * from StrategyGetBuffer.)
+		 * We shouldn't have any other pins for this buffer.
 		 */
-		content_lock = BufferDescriptorGetContentLock(buf_hdr);
-		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
-		{
-			/*
-			 * Someone else has locked the buffer, so give it up and loop back
-			 * to get another one.
-			 */
-			UnpinBuffer(buf_hdr);
-			goto again;
-		}
+		CheckBufferIsPinnedOnce(buf);
 
 		/*
-		 * If using a nondefault strategy, and writing the buffer would
-		 * require a WAL flush, let the strategy decide whether to go ahead
-		 * and write/reuse the buffer or to choose another victim.  We need a
-		 * lock to inspect the page LSN, so this can't be done inside
-		 * StrategyGetBuffer.
+		 * If the buffer was dirty, try to write it out.  There is a race
+		 * condition here, in that someone might dirty it after we released
+		 * the buffer header lock above, or even while we are writing it out
+		 * (since our share-lock won't prevent hint-bit updates).  We will
+		 * recheck the dirty bit after re-locking the buffer header.
 		 */
-		if (strategy != NULL)
+		if (buf_state & BM_DIRTY)
 		{
-			XLogRecPtr	lsn;
+			LWLock	   *content_lock;
 
-			/* Read the LSN while holding buffer header lock */
-			buf_state = LockBufHdr(buf_hdr);
-			lsn = BufferGetLSN(buf_hdr);
-			UnlockBufHdr(buf_hdr, buf_state);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(buf_state & BM_VALID);
 
-			if (XLogNeedsFlush(lsn)
-				&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+			/*
+			 * We need a share-lock on the buffer contents to write it out
+			 * (else we might write invalid data, eg because someone else is
+			 * compacting the page contents while we write).  We must use a
+			 * conditional lock acquisition here to avoid deadlock.  Even
+			 * though the buffer was not pinned (and therefore surely not
+			 * locked) when StrategyGetBuffer returned it, someone else could
+			 * have pinned and exclusive-locked it by the time we get here. If
+			 * we try to get the lock unconditionally, we'd block waiting for
+			 * them; if they later block waiting for us, deadlock ensues.
+			 * (This has been observed to happen when two backends are both
+			 * trying to split btree index pages, and the second one just
+			 * happens to be trying to split the page the first one got from
+			 * StrategyGetBuffer.)
+			 */
+			content_lock = BufferDescriptorGetContentLock(buf_hdr);
+			if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+			{
+				/*
+				 * Someone else has locked the buffer, so give it up and loop
+				 * back to get another one.
+				 */
+				UnpinBuffer(buf_hdr);
+				continue;
+			}
+
+			/*
+			 * If using a nondefault strategy, and writing the buffer would
+			 * require a WAL flush, let the strategy decide whether to go
+			 * ahead and write/reuse the buffer or to choose another victim.
+			 * We need the content lock to inspect the page LSN, so this can't
+			 * be done inside StrategyGetBuffer.
+			 */
+			if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
 			{
 				LWLockRelease(content_lock);
 				UnpinBuffer(buf_hdr);
-				goto again;
+				continue;
 			}
-		}
 
-		/* OK, do the I/O */
-		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LWLockRelease(content_lock);
+			/* OK, do the I/O */
+			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+			LWLockRelease(content_lock);
 
-		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-									  &buf_hdr->tag);
-	}
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &buf_hdr->tag);
+		}
 
 
-	if (buf_state & BM_VALID)
-	{
+		if (buf_state & BM_VALID)
+		{
+			/*
+			 * When a BufferAccessStrategy is in use, blocks evicted from
+			 * shared buffers are counted as IOOP_EVICT in the corresponding
+			 * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+			 * by a strategy in two cases: 1) while initially claiming buffers
+			 * for the strategy ring 2) to replace an existing strategy ring
+			 * buffer because it is pinned or in use and cannot be reused.
+			 *
+			 * Blocks evicted from buffers already in the strategy ring are
+			 * counted as IOOP_REUSE in the corresponding strategy context.
+			 *
+			 * At this point, we can accurately count evictions and reuses,
+			 * because we have successfully claimed the valid buffer.
+			 * Previously, we may have been forced to release the buffer due
+			 * to concurrent pinners or erroring out.
+			 */
+			pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+							   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+		}
+
 		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer. Previously,
-		 * we may have been forced to release the buffer due to concurrent
-		 * pinners or erroring out.
+		 * If the buffer has an entry in the buffer mapping table, delete it.
+		 * This can fail because another backend could have pinned or dirtied
+		 * the buffer.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+		if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+		{
+			UnpinBuffer(buf_hdr);
+			continue;
+		}
 
-	/*
-	 * If the buffer has an entry in the buffer mapping table, delete it. This
-	 * can fail because another backend could have pinned or dirtied the
-	 * buffer.
-	 */
-	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
-	{
-		UnpinBuffer(buf_hdr);
-		goto again;
+		break;
 	}
 
 	/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7fe34d3ef4c..b76be264eb5 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
@@ -779,12 +780,21 @@ IOContextForStrategy(BufferAccessStrategy strategy)
  * be written out and doing so would require flushing WAL too.  This gives us
  * a chance to choose a different victim.
  *
+ * The buffer must be pinned and content locked and the buffer header spinlock
+ * must not be held. We must hold the content lock to examine the LSN.
+ *
  * Returns true if buffer manager should ask for a new victim, and false
  * if this buffer should be written and re-used.
  */
 bool
 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+
+	if (!strategy)
+		return false;
+
 	/* We only do this in bulkread mode */
 	if (strategy->btype != BAS_BULKREAD)
 		return false;
@@ -794,11 +804,17 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
 		return false;
 
+	buf_state = LockBufHdr(buf);
+	lsn = BufferGetLSN(buf);
+	UnlockBufHdr(buf, buf_state);
+
+	if (!XLogNeedsFlush(lsn))
+		return false;
+
 	/*
-	 * Remove the dirty buffer from the ring; necessary to prevent infinite
+	 * Remove the dirty buffer from the ring; necessary to prevent an infinite
 	 * loop if all ring members are dirty.
 	 */
 	strategy->buffers[strategy->current] = InvalidBuffer;
-
 	return true;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c1206a46aba..7e258383048 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -421,6 +421,12 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 /*
  * Internal buffer management routines
  */
+
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
+
 /* bufmgr.c */
 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
-- 
2.43.0

v6-0002-Split-FlushBuffer-into-two-parts.patchtext/x-patch; charset=US-ASCII; name=v6-0002-Split-FlushBuffer-into-two-parts.patchDownload
From c6ded53d41ae967ab335508aae7c82d95689165b Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:54:19 -0400
Subject: [PATCH v6 2/7] Split FlushBuffer() into two parts

Before adding write combining to write a batch of blocks when flushing
dirty buffers, refactor FlushBuffer() into the preparatory step and
actual buffer flushing step. This separation provides symmetry with
future code for batch flushing, which necessarily separates these steps,
as it must prepare multiple buffers before flushing them together.

These steps are moved into a new FlushBuffer() helper function,
CleanVictimBuffer(), which will contain both the batch flushing and
single flush code in future commits.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 141 +++++++++++++++++++---------
 1 file changed, 98 insertions(+), 43 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1fadeddf505..474afdcb4fe 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -533,6 +533,12 @@ static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
+						  IOObject io_object, IOContext io_context,
+						  XLogRecPtr buffer_lsn);
+static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
+							  IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2388,12 +2394,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			/* OK, do the I/O */
-			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-			LWLockRelease(content_lock);
-
-			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-										  &buf_hdr->tag);
+			/* Content lock is released inside CleanVictimBuffer */
+			CleanVictimBuffer(buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4271,53 +4273,65 @@ static void
 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 			IOContext io_context)
 {
-	XLogRecPtr	recptr;
-	ErrorContextCallback errcallback;
-	instr_time	io_start;
-	Block		bufBlock;
-	char	   *bufToWrite;
-	uint32		buf_state;
+	XLogRecPtr	lsn;
 
-	/*
-	 * Try to start an I/O operation.  If StartBufferIO returns false, then
-	 * someone else flushed the buffer before we could, so we need not do
-	 * anything.
-	 */
-	if (!StartBufferIO(buf, false, false))
-		return;
+	if (PrepareFlushBuffer(buf, &lsn))
+		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
+}
 
-	/* Setup error traceback support for ereport() */
-	errcallback.callback = shared_buffer_write_error_callback;
-	errcallback.arg = buf;
-	errcallback.previous = error_context_stack;
-	error_context_stack = &errcallback;
+/*
+ * Prepare and write out a dirty victim buffer.
+ *
+ * Buffer must be pinned, the content lock must be held exclusively, and the
+ * buffer header spinlock must not be held. The exclusive lock is released and
+ * the buffer is returned pinned but not locked.
+ *
+ * bufdesc may be modified.
+ */
+static void
+CleanVictimBuffer(BufferDesc *bufdesc,
+				  bool from_ring, IOContext io_context)
+{
 
-	/* Find smgr relation for buffer */
-	if (reln == NULL)
-		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
+	LWLock	   *content_lock;
 
-	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
-										buf->tag.blockNum,
-										reln->smgr_rlocator.locator.spcOid,
-										reln->smgr_rlocator.locator.dbOid,
-										reln->smgr_rlocator.locator.relNumber);
+	/* Set up this victim buffer to be flushed */
+	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
+		return;
 
-	buf_state = LockBufHdr(buf);
+	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	LWLockRelease(content_lock);
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
+
+/*
+ * Prepare the buffer with bufdesc for writing. Returns true if the buffer
+ * actually needs writing and false otherwise. lsn returns the buffer's LSN if
+ * the relation is logged.
+ */
+static bool
+PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state;
 
 	/*
-	 * Run PageGetLSN while holding header lock, since we don't have the
-	 * buffer locked exclusively in all cases.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
-	recptr = BufferGetLSN(buf);
+	if (!StartBufferIO(bufdesc, false, false))
+		return false;
 
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	buf_state &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf, buf_state);
+	*lsn = InvalidXLogRecPtr;
+	buf_state = LockBufHdr(bufdesc);
 
 	/*
-	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
-	 * rule that log updates must hit disk before any of the data-file changes
-	 * they describe do.
+	 * Record the buffer's LSN. We will force XLOG flush up to buffer's LSN.
+	 * This implements the basic WAL rule that log updates must hit disk
+	 * before any of the data-file changes they describe do.
 	 *
 	 * However, this rule does not apply to unlogged relations, which will be
 	 * lost after a crash anyway.  Most unlogged relation pages do not bear
@@ -4330,9 +4344,50 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * happen, attempting to flush WAL through that location would fail, with
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
+	 *
+	 * We must hold the buffer header lock when examining the page LSN, since
+	 * we don't have the buffer exclusively locked in all cases.
 	 */
 	if (buf_state & BM_PERMANENT)
-		XLogFlush(recptr);
+		*lsn = BufferGetLSN(bufdesc);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, buf_state);
+	return true;
+}
+
+/*
+ * Actually do the write I/O to clean a buffer. buf and reln may be modified.
+ */
+static void
+DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+			  IOContext io_context, XLogRecPtr buffer_lsn)
+{
+	ErrorContextCallback errcallback;
+	instr_time	io_start;
+	Block		bufBlock;
+	char	   *bufToWrite;
+
+	/* Setup error traceback support for ereport() */
+	errcallback.callback = shared_buffer_write_error_callback;
+	errcallback.arg = buf;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* Find smgr relation for buffer */
+	if (reln == NULL)
+		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+
+	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
+										buf->tag.blockNum,
+										reln->smgr_rlocator.locator.spcOid,
+										reln->smgr_rlocator.locator.dbOid,
+										reln->smgr_rlocator.locator.relNumber);
+
+	/* Force XLOG flush up to buffer's LSN */
+	if (!XLogRecPtrIsInvalid(buffer_lsn))
+		XLogFlush(buffer_lsn);
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
-- 
2.43.0

v6-0003-Eagerly-flush-bulkwrite-strategy-ring.patchtext/x-patch; charset=US-ASCII; name=v6-0003-Eagerly-flush-bulkwrite-strategy-ring.patchDownload
From 6c8be932d24a54d1208b2356960f6352d9808b7d Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 13:15:43 -0400
Subject: [PATCH v6 3/7] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse them. By
eagerly flushing the buffers in a larger run, we encourage larger writes
at the kernel level and less interleaving of WAL flushes and data file
writes. The effect is mainly noticeable with multiple parallel COPY
FROMs. In this case, client backends achieve higher write throughput and
end up spending less time waiting on acquiring the lock to flush WAL.
Larger flush operations also mean less time waiting for flush operations
at the kernel level.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which do not require a WAL flush.

This patch is also a step toward AIO writes.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Earlier version Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 238 +++++++++++++++++++++++++-
 src/backend/storage/buffer/freelist.c |  47 +++++
 src/include/storage/buf_internals.h   |   4 +
 3 files changed, 281 insertions(+), 8 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 474afdcb4fe..41690fd9165 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -531,14 +531,25 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
 static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
+
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
-static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *NextStratBufToFlush(BufferAccessStrategy strategy,
+									   Buffer sweep_end,
+									   XLogRecPtr *lsn, int *sweep_cursor);
+
+static bool BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+												   RelFileLocator *rlocator, bool skip_pinned,
+												   XLogRecPtr *max_lsn);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc,
+							   XLogRecPtr *lsn);
 static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						  IOObject io_object, IOContext io_context,
 						  XLogRecPtr buffer_lsn);
-static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
-							  IOContext io_context);
+static void CleanVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *bufdesc,
+							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2395,7 +2406,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 			}
 
 			/* Content lock is released inside CleanVictimBuffer */
-			CleanVictimBuffer(buf_hdr, from_ring, io_context);
+			CleanVictimBuffer(strategy, buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4279,6 +4290,61 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
 }
 
+/*
+ * Returns true if the buffer needs WAL flushed before it can be written out.
+ * Caller must not already hold the buffer header spinlock.
+ */
+static bool
+BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state = LockBufHdr(bufdesc);
+
+	*lsn = BufferGetLSN(bufdesc);
+
+	UnlockBufHdr(bufdesc, buf_state);
+
+	/*
+	 * See buffer flushing code for more details on why we condition this on
+	 * the relation being logged.
+	 */
+	return buf_state & BM_PERMANENT && XLogNeedsFlush(*lsn);
+}
+
+
+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or NULL when there are no further buffers to consider
+ * writing out.
+ */
+static BufferDesc *
+NextStratBufToFlush(BufferAccessStrategy strategy,
+					Buffer sweep_end,
+					XLogRecPtr *lsn, int *sweep_cursor)
+{
+	Buffer		bufnum;
+	BufferDesc *bufdesc;
+
+	while ((bufnum =
+			StrategySweepNextBuffer(strategy, sweep_cursor)) != sweep_end)
+	{
+		/*
+		 * For BAS_BULKWRITE, once you hit an InvalidBuffer, the remaining
+		 * buffers in the ring will be invalid.
+		 */
+		if (!BufferIsValid(bufnum))
+			break;
+
+		if ((bufdesc = PrepareOrRejectEagerFlushBuffer(bufnum,
+													   InvalidBlockNumber,
+													   NULL,
+													   true,
+													   lsn)) != NULL)
+			return bufdesc;
+	}
+
+	return NULL;
+}
+
 /*
  * Prepare and write out a dirty victim buffer.
  *
@@ -4289,22 +4355,178 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
  * bufdesc may be modified.
  */
 static void
-CleanVictimBuffer(BufferDesc *bufdesc,
+CleanVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *bufdesc,
 				  bool from_ring, IOContext io_context)
 {
 
 	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
 	LWLock	   *content_lock;
+	bool		first_buffer = true;
 
 	/* Set up this victim buffer to be flushed */
 	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
 		return;
 
-	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	if (from_ring && StrategySupportsEagerFlush(strategy))
+	{
+		Buffer		sweep_end = BufferDescriptorGetBuffer(bufdesc);
+		int			cursor = StrategySweepStart(strategy);
+
+		/* Clean victim buffer and find more to flush opportunistically */
+		do
+		{
+			DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+			content_lock = BufferDescriptorGetContentLock(bufdesc);
+			LWLockRelease(content_lock);
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &bufdesc->tag);
+			/* We leave the first buffer pinned for the caller */
+			if (!first_buffer)
+				UnpinBuffer(bufdesc);
+			first_buffer = false;
+		} while ((bufdesc = NextStratBufToFlush(strategy, sweep_end,
+												&max_lsn, &cursor)) != NULL);
+	}
+	else
+	{
+		DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+		content_lock = BufferDescriptorGetContentLock(bufdesc);
+		LWLockRelease(content_lock);
+		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+									  &bufdesc->tag);
+	}
+}
+
+/*
+ * Prepare bufdesc for eager flushing.
+ *
+ * Given bufnum, return the buffer descriptor of the buffer to eagerly flush,
+ * pinned and locked, or NULL if this buffer does not contain a block that
+ * should be flushed.
+ *
+ * require is the BlockNumber required by the caller. Some callers may require
+ * a specific BlockNumber to be in bufnum because they are assembling a
+ * contiguous run of blocks.
+ *
+ * If the caller needs the block to be from a specific relation, rlocator will
+ * be provided.
+ */
+static BufferDesc *
+PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+								RelFileLocator *rlocator, bool skip_pinned,
+								XLogRecPtr *max_lsn)
+{
+	BufferDesc *bufdesc;
+	uint32		old_buf_state;
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	BlockNumber blknum;
+	LWLock	   *content_lock;
+
+	if (!BufferIsValid(bufnum))
+		return NULL;
+
+	Assert(!BufferIsLocal(bufnum));
+
+	bufdesc = GetBufferDescriptor(bufnum - 1);
+
+	/* Block may need to be in a specific relation */
+	if (rlocator &&
+		!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufdesc->tag),
+							  *rlocator))
+		return NULL;
+
+	/*
+	 * Ensure that there's a free refcount entry and resource owner slot for
+	 * the pin before pinning the buffer. While this may leak a refcount and
+	 * slot if we return without a buffer, we should use that slot the next
+	 * time we try to reserve a spot.
+	 */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	/*
+	 * Check whether the buffer can be used and pin it if so. Do this using a
+	 * CAS loop, to avoid having to lock the buffer header. We have to lock
+	 * the buffer header later if we succeed in pinning the buffer here, but
+	 * avoiding locking the buffer header if the buffer is in use is worth it.
+	 */
+	old_buf_state = pg_atomic_read_u32(&bufdesc->state);
+
+	for (;;)
+	{
+		buf_state = old_buf_state;
+
+		if (!(buf_state & BM_DIRTY) || !(buf_state & BM_VALID))
+			return NULL;
+
+		/* We don't eagerly flush buffers used by others */
+		if (skip_pinned &&
+			(BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+			 BUF_STATE_GET_USAGECOUNT(buf_state) > 1))
+			return NULL;
+
+		if (unlikely(buf_state & BM_LOCKED))
+		{
+			old_buf_state = WaitBufHdrUnlocked(bufdesc);
+			continue;
+		}
+
+		/* pin the buffer if the CAS succeeds */
+		buf_state += BUF_REFCOUNT_ONE;
+
+		if (pg_atomic_compare_exchange_u32(&bufdesc->state, &old_buf_state,
+										   buf_state))
+		{
+			TrackNewBufferPin(BufferDescriptorGetBuffer(bufdesc));
+			break;
+		}
+	}
+
+	CheckBufferIsPinnedOnce(bufnum);
+
+	blknum = BufferGetBlockNumber(bufnum);
+	Assert(BlockNumberIsValid(blknum));
+
+	/* We only include contiguous blocks in the run */
+	if (BlockNumberIsValid(require) && blknum != require)
+		goto except_unpin_buffer;
+
+	/* Don't eagerly flush buffers requiring WAL flush */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
 	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		goto except_unpin_buffer;
+
+	/*
+	 * Now that we have the content lock, we need to recheck if we need to
+	 * flush WAL.
+	 */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
+	/* Try to start an I/O operation */
+	if (!StartBufferIO(bufdesc, false, true))
+		goto except_unlock_content;
+
+	if (lsn > *max_lsn)
+		*max_lsn = lsn;
+
+	buf_state = LockBufHdr(bufdesc);
+	buf_state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(bufdesc, buf_state);
+
+	return bufdesc;
+
+except_unlock_content:
 	LWLockRelease(content_lock);
-	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-								  &bufdesc->tag);
+
+except_unpin_buffer:
+	UnpinBuffer(bufdesc);
+	return NULL;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index b76be264eb5..71fed9d6ebd 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -156,6 +156,31 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * buffers in the ring before they are needed. This can lead to better I/O
+ * patterns than lazily flushing buffers immediately before reusing them.
+ */
+bool
+StrategySupportsEagerFlush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -307,6 +332,28 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * Return the next buffer in the ring
+ */
+Buffer
+StrategySweepNextBuffer(BufferAccessStrategy strategy, int *sweep_cursor)
+{
+	if (++(*sweep_cursor) >= strategy->nbuffers)
+		*sweep_cursor = 0;
+
+	return strategy->buffers[*sweep_cursor];
+}
+
+/*
+ * Return the starting cursor position for a sweep of the strategy ring
+ */
+int
+StrategySweepStart(BufferAccessStrategy strategy)
+{
+	return strategy->current;
+}
+
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 7e258383048..b48dece3e63 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -442,6 +442,10 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag
 
 
 /* freelist.c */
+extern bool StrategySupportsEagerFlush(BufferAccessStrategy strategy);
+extern Buffer StrategySweepNextBuffer(BufferAccessStrategy strategy,
+									  int *sweep_cursor);
+extern int	StrategySweepStart(BufferAccessStrategy strategy);
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state, bool *from_ring);
-- 
2.43.0
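
The trickiest part of PrepareOrRejectEagerFlushBuffer() is the lock-free conditional pin: read the state word, bail out if the buffer is unusable, otherwise try to bump the refcount with a compare-and-swap and re-read the state on failure. Below is a minimal C11 sketch of that loop using an invented packed state word rather than PostgreSQL's buffer header; it is an illustration, not the patch's code.

/*
 * Toy sketch only: invented state-word layout, not PostgreSQL code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_DIRTY			(1u << 31)
#define TOY_VALID			(1u << 30)
#define TOY_REFCOUNT_MASK	0x0000ffffu

/*
 * Try to take a pin (increment the refcount) only if the buffer is valid,
 * dirty, and currently unpinned; otherwise give up without blocking.
 */
static bool
toy_try_pin_for_eager_flush(atomic_uint *state)
{
	unsigned	old_state = atomic_load(state);

	for (;;)
	{
		if (!(old_state & TOY_DIRTY) || !(old_state & TOY_VALID))
			return false;		/* nothing to flush here */
		if ((old_state & TOY_REFCOUNT_MASK) > 0)
			return false;		/* someone else is using it: skip */

		/*
		 * On failure, compare_exchange refreshes old_state with the current
		 * value and the loop re-checks the conditions above.
		 */
		if (atomic_compare_exchange_weak(state, &old_state, old_state + 1))
			return true;
	}
}

int
main(void)
{
	atomic_uint	state = TOY_DIRTY | TOY_VALID;	/* valid, dirty, unpinned */

	if (toy_try_pin_for_eager_flush(&state))
		printf("pinned; refcount is now %u\n",
			   atomic_load(&state) & TOY_REFCOUNT_MASK);
	return 0;
}

On a CAS failure (including a spurious one from the weak variant), old_state is refreshed with the current value, so the dirty/valid/refcount checks are simply re-applied before retrying, much as the patch re-evaluates buf_state in its loop.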

#10Melanie Plageman
melanieplageman@gmail.com
In reply to: Melanie Plageman (#9)
3 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

On Mon, Nov 3, 2025 at 3:06 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:

I found an incorrect assert in CleanVictimBuffer() that was tripping
in CI. Attached v6 is updated with it removed.

v7 rebased over recent changes in bufmgr.c

- Melanie

Attachments:

v7-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchtext/x-patch; charset=US-ASCII; name=v7-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patchDownload
From 59432c8962cd1d7866493492149963485f6e63e1 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:53:48 -0400
Subject: [PATCH v7 1/7] Refactor goto into for loop in GetVictimBuffer()

GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with
standard for loop flow control.

This commit is only a refactor and does not introduce any new
functionality.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 189 ++++++++++++--------------
 src/backend/storage/buffer/freelist.c |  19 ++-
 src/include/storage/buf_internals.h   |   5 +
 3 files changed, 110 insertions(+), 103 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 327ddb7adc8..90c24b8d93d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
 #include "utils/timestamp.h"
 
 
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
-
 /* Note: this macro only works on local buffers, not shared ones! */
 #define LocalBufHdrGetBlock(bufHdr) \
 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2331,125 +2327,116 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned pinned and owned by
-	 * this backend.
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
-	/*
-	 * We shouldn't have any other pins for this buffer.
-	 */
-	CheckBufferIsPinnedOnce(buf);
-
-	/*
-	 * If the buffer was dirty, try to write it out.  There is a race
-	 * condition here, in that someone might dirty it after we released the
-	 * buffer header lock above, or even while we are writing it out (since
-	 * our share-lock won't prevent hint-bit updates).  We will recheck the
-	 * dirty bit after re-locking the buffer header.
-	 */
-	if (buf_state & BM_DIRTY)
+	/* Select a victim buffer using an optimistic locking scheme. */
+	for (;;)
 	{
-		LWLock	   *content_lock;
 
-		Assert(buf_state & BM_TAG_VALID);
-		Assert(buf_state & BM_VALID);
+		/* Attempt to claim a victim buffer. Buffer is returned pinned. */
+		buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+		buf = BufferDescriptorGetBuffer(buf_hdr);
 
 		/*
-		 * We need a share-lock on the buffer contents to write it out (else
-		 * we might write invalid data, eg because someone else is compacting
-		 * the page contents while we write).  We must use a conditional lock
-		 * acquisition here to avoid deadlock.  Even though the buffer was not
-		 * pinned (and therefore surely not locked) when StrategyGetBuffer
-		 * returned it, someone else could have pinned and exclusive-locked it
-		 * by the time we get here. If we try to get the lock unconditionally,
-		 * we'd block waiting for them; if they later block waiting for us,
-		 * deadlock ensues. (This has been observed to happen when two
-		 * backends are both trying to split btree index pages, and the second
-		 * one just happens to be trying to split the page the first one got
-		 * from StrategyGetBuffer.)
+		 * We shouldn't have any other pins for this buffer.
 		 */
-		content_lock = BufferDescriptorGetContentLock(buf_hdr);
-		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
-		{
-			/*
-			 * Someone else has locked the buffer, so give it up and loop back
-			 * to get another one.
-			 */
-			UnpinBuffer(buf_hdr);
-			goto again;
-		}
+		CheckBufferIsPinnedOnce(buf);
 
 		/*
-		 * If using a nondefault strategy, and writing the buffer would
-		 * require a WAL flush, let the strategy decide whether to go ahead
-		 * and write/reuse the buffer or to choose another victim.  We need a
-		 * lock to inspect the page LSN, so this can't be done inside
-		 * StrategyGetBuffer.
+		 * If the buffer was dirty, try to write it out.  There is a race
+		 * condition here, in that someone might dirty it after we released
+		 * the buffer header lock above, or even while we are writing it out
+		 * (since our share-lock won't prevent hint-bit updates).  We will
+		 * recheck the dirty bit after re-locking the buffer header.
 		 */
-		if (strategy != NULL)
+		if (buf_state & BM_DIRTY)
 		{
-			XLogRecPtr	lsn;
+			LWLock	   *content_lock;
 
-			/* Read the LSN while holding buffer header lock */
-			buf_state = LockBufHdr(buf_hdr);
-			lsn = BufferGetLSN(buf_hdr);
-			UnlockBufHdr(buf_hdr);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(buf_state & BM_VALID);
+
+			/*
+			 * We need a share-lock on the buffer contents to write it out
+			 * (else we might write invalid data, eg because someone else is
+			 * compacting the page contents while we write).  We must use a
+			 * conditional lock acquisition here to avoid deadlock.  Even
+			 * though the buffer was not pinned (and therefore surely not
+			 * locked) when StrategyGetBuffer returned it, someone else could
+			 * have pinned and exclusive-locked it by the time we get here. If
+			 * we try to get the lock unconditionally, we'd block waiting for
+			 * them; if they later block waiting for us, deadlock ensues.
+			 * (This has been observed to happen when two backends are both
+			 * trying to split btree index pages, and the second one just
+			 * happens to be trying to split the page the first one got from
+			 * StrategyGetBuffer.)
+			 */
+			content_lock = BufferDescriptorGetContentLock(buf_hdr);
+			if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+			{
+				/*
+				 * Someone else has locked the buffer, so give it up and loop
+				 * back to get another one.
+				 */
+				UnpinBuffer(buf_hdr);
+				continue;
+			}
 
-			if (XLogNeedsFlush(lsn)
-				&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+			/*
+			 * If using a nondefault strategy, and writing the buffer would
+			 * require a WAL flush, let the strategy decide whether to go
+			 * ahead and write/reuse the buffer or to choose another victim.
+			 * We need the content lock to inspect the page LSN, so this can't
+			 * be done inside StrategyGetBuffer.
+			 */
+			if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
 			{
 				LWLockRelease(content_lock);
 				UnpinBuffer(buf_hdr);
-				goto again;
+				continue;
 			}
-		}
 
-		/* OK, do the I/O */
-		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LWLockRelease(content_lock);
+			/* OK, do the I/O */
+			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+			LWLockRelease(content_lock);
 
-		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-									  &buf_hdr->tag);
-	}
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &buf_hdr->tag);
+		}
 
 
-	if (buf_state & BM_VALID)
-	{
+		if (buf_state & BM_VALID)
+		{
+			/*
+			 * When a BufferAccessStrategy is in use, blocks evicted from
+			 * shared buffers are counted as IOOP_EVICT in the corresponding
+			 * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+			 * by a strategy in two cases: 1) while initially claiming buffers
+			 * for the strategy ring 2) to replace an existing strategy ring
+			 * buffer because it is pinned or in use and cannot be reused.
+			 *
+			 * Blocks evicted from buffers already in the strategy ring are
+			 * counted as IOOP_REUSE in the corresponding strategy context.
+			 *
+			 * At this point, we can accurately count evictions and reuses,
+			 * because we have successfully claimed the valid buffer.
+			 * Previously, we may have been forced to release the buffer due
+			 * to concurrent pinners or erroring out.
+			 */
+			pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+							   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+		}
+
 		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer. Previously,
-		 * we may have been forced to release the buffer due to concurrent
-		 * pinners or erroring out.
+		 * If the buffer has an entry in the buffer mapping table, delete it.
+		 * This can fail because another backend could have pinned or dirtied
+		 * the buffer.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+		if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+		{
+			UnpinBuffer(buf_hdr);
+			continue;
+		}
 
-	/*
-	 * If the buffer has an entry in the buffer mapping table, delete it. This
-	 * can fail because another backend could have pinned or dirtied the
-	 * buffer.
-	 */
-	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
-	{
-		UnpinBuffer(buf_hdr);
-		goto again;
+		break;
 	}
 
 	/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 28d952b3534..1465984b141 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
@@ -780,12 +781,20 @@ IOContextForStrategy(BufferAccessStrategy strategy)
  * be written out and doing so would require flushing WAL too.  This gives us
  * a chance to choose a different victim.
  *
+ * The buffer must be pinned and content locked and the buffer header spinlock
+ * must not be held. We must hold the content lock to examine the LSN.
+ *
  * Returns true if buffer manager should ask for a new victim, and false
  * if this buffer should be written and re-used.
  */
 bool
 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
+	XLogRecPtr	lsn;
+
+	if (!strategy)
+		return false;
+
 	/* We only do this in bulkread mode */
 	if (strategy->btype != BAS_BULKREAD)
 		return false;
@@ -795,11 +804,17 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
 		return false;
 
+	LockBufHdr(buf);
+	lsn = BufferGetLSN(buf);
+	UnlockBufHdr(buf);
+
+	if (XLogNeedsFlush(lsn))
+		return false;
+
 	/*
-	 * Remove the dirty buffer from the ring; necessary to prevent infinite
+	 * Remove the dirty buffer from the ring; necessary to prevent an infinite
 	 * loop if all ring members are dirty.
 	 */
 	strategy->buffers[strategy->current] = InvalidBuffer;
-
 	return true;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 5400c56a965..04fdea64f83 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -486,6 +486,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 /*
  * Internal buffer management routines
  */
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
+
 /* bufmgr.c */
 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
-- 
2.43.0

v7-0002-Split-FlushBuffer-into-two-parts.patch (text/x-patch; charset=US-ASCII)
From 8dff2f6ba609909f2820acde080228b799f3fb02 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:54:19 -0400
Subject: [PATCH v7 2/7] Split FlushBuffer() into two parts

Before adding write combining to write a batch of blocks when flushing
dirty buffers, refactor FlushBuffer() into the preparatory step and
actual buffer flushing step. This separation provides symmetry with
future code for batch flushing which necessarily separates these steps,
as it must prepare multiple buffers before flushing them together.

These steps are moved into a new FlushBuffer() helper function,
CleanVictimBuffer(), which will contain both the batch flushing and
single flush code in future commits.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 142 +++++++++++++++++++---------
 1 file changed, 98 insertions(+), 44 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 90c24b8d93d..235e261738b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -533,6 +533,12 @@ static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
+						  IOObject io_object, IOContext io_context,
+						  XLogRecPtr buffer_lsn);
+static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
+							  IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2394,12 +2400,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			/* OK, do the I/O */
-			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-			LWLockRelease(content_lock);
-
-			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-										  &buf_hdr->tag);
+			/* Content lock is released inside CleanVictimBuffer */
+			CleanVictimBuffer(buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4270,54 +4272,64 @@ static void
 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 			IOContext io_context)
 {
-	XLogRecPtr	recptr;
-	ErrorContextCallback errcallback;
-	instr_time	io_start;
-	Block		bufBlock;
-	char	   *bufToWrite;
-	uint32		buf_state;
+	XLogRecPtr	lsn;
 
-	/*
-	 * Try to start an I/O operation.  If StartBufferIO returns false, then
-	 * someone else flushed the buffer before we could, so we need not do
-	 * anything.
-	 */
-	if (!StartBufferIO(buf, false, false))
-		return;
+	if (PrepareFlushBuffer(buf, &lsn))
+		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
+}
 
-	/* Setup error traceback support for ereport() */
-	errcallback.callback = shared_buffer_write_error_callback;
-	errcallback.arg = buf;
-	errcallback.previous = error_context_stack;
-	error_context_stack = &errcallback;
+/*
+ * Prepare and write out a dirty victim buffer.
+ *
+ * Buffer must be pinned, the content lock must be held exclusively, and the
+ * buffer header spinlock must not be held. The exclusive lock is released and
+ * the buffer is returned pinned but not locked.
+ *
+ * bufdesc may be modified.
+ */
+static void
+CleanVictimBuffer(BufferDesc *bufdesc,
+				  bool from_ring, IOContext io_context)
+{
+	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
+	LWLock	   *content_lock;
 
-	/* Find smgr relation for buffer */
-	if (reln == NULL)
-		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+	/* Set up this victim buffer to be flushed */
+	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
+		return;
 
-	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
-										buf->tag.blockNum,
-										reln->smgr_rlocator.locator.spcOid,
-										reln->smgr_rlocator.locator.dbOid,
-										reln->smgr_rlocator.locator.relNumber);
+	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	LWLockRelease(content_lock);
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
 
-	buf_state = LockBufHdr(buf);
+/*
+ * Prepare the buffer with bufdesc for writing. Returns true if the buffer
+ * actually needs writing and false otherwise. lsn returns the buffer's LSN if
+ * the table is logged.
+ */
+static bool
+PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state;
 
 	/*
-	 * Run PageGetLSN while holding header lock, since we don't have the
-	 * buffer locked exclusively in all cases.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
-	recptr = BufferGetLSN(buf);
+	if (!StartBufferIO(bufdesc, false, false))
+		return false;
 
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	UnlockBufHdrExt(buf, buf_state,
-					0, BM_JUST_DIRTIED,
-					0);
+	*lsn = InvalidXLogRecPtr;
+	buf_state = LockBufHdr(bufdesc);
 
 	/*
-	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
-	 * rule that log updates must hit disk before any of the data-file changes
-	 * they describe do.
+	 * Record the buffer's LSN. We will force XLOG flush up to buffer's LSN.
+	 * This implements the basic WAL rule that log updates must hit disk
+	 * before any of the data-file changes they describe do.
 	 *
 	 * However, this rule does not apply to unlogged relations, which will be
 	 * lost after a crash anyway.  Most unlogged relation pages do not bear
@@ -4330,9 +4342,51 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * happen, attempting to flush WAL through that location would fail, with
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
+	 *
+	 * We must hold the buffer header lock when examining the page LSN since
+	 * we don't have the buffer exclusively locked in all cases.
 	 */
 	if (buf_state & BM_PERMANENT)
-		XLogFlush(recptr);
+		*lsn = BufferGetLSN(bufdesc);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	UnlockBufHdrExt(bufdesc, buf_state,
+					0, BM_JUST_DIRTIED,
+					0);
+	return true;
+}
+
+/*
+ * Actually do the write I/O to clean a buffer. buf and reln may be modified.
+ */
+static void
+DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+			  IOContext io_context, XLogRecPtr buffer_lsn)
+{
+	ErrorContextCallback errcallback;
+	instr_time	io_start;
+	Block		bufBlock;
+	char	   *bufToWrite;
+
+	/* Setup error traceback support for ereport() */
+	errcallback.callback = shared_buffer_write_error_callback;
+	errcallback.arg = buf;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* Find smgr relation for buffer */
+	if (reln == NULL)
+		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+
+	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
+										buf->tag.blockNum,
+										reln->smgr_rlocator.locator.spcOid,
+										reln->smgr_rlocator.locator.dbOid,
+										reln->smgr_rlocator.locator.relNumber);
+
+	/* Force XLOG flush up to buffer's LSN */
+	if (XLogRecPtrIsValid(buffer_lsn))
+		XLogFlush(buffer_lsn);
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
-- 
2.43.0

v7-0003-Eagerly-flush-bulkwrite-strategy-ring.patch (text/x-patch; charset=US-ASCII)
From d4d815c9ef5a270d8e9c7dde59e2121d3c1586a7 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 13:15:43 -0400
Subject: [PATCH v7 3/7] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse them. By
eagerly flushing the buffers in a larger run, we encourage larger writes
at the kernel level and less interleaving of WAL flushes and data file
writes. The effect is mainly noticeable with multiple parallel COPY
FROMs. In this case, client backends achieve higher write throughput and
end up spending less time waiting on acquiring the lock to flush WAL.
Larger flush operations also mean less time waiting for flush operations
at the kernel level.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which do not require a WAL flush.

This patch also is a step toward AIO writes, as it lines up multiple
buffers that can be issued asynchronously once the infrastructure
exists.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Earlier version Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 237 +++++++++++++++++++++++++-
 src/backend/storage/buffer/freelist.c |  48 ++++++
 src/include/storage/buf_internals.h   |   4 +
 3 files changed, 281 insertions(+), 8 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 235e261738b..57a3eae865e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -531,14 +531,25 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
 static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
+
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
-static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *NextStrategyBufToFlush(BufferAccessStrategy strategy,
+										  Buffer sweep_end,
+										  XLogRecPtr *lsn, int *sweep_cursor);
+
+static bool BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+												   RelFileLocator *rlocator, bool skip_pinned,
+												   XLogRecPtr *max_lsn);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc,
+							   XLogRecPtr *lsn);
 static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						  IOObject io_object, IOContext io_context,
 						  XLogRecPtr buffer_lsn);
-static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
-							  IOContext io_context);
+static void CleanVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *bufdesc,
+							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2401,7 +2412,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 			}
 
 			/* Content lock is released inside CleanVictimBuffer */
-			CleanVictimBuffer(buf_hdr, from_ring, io_context);
+			CleanVictimBuffer(strategy, buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4278,6 +4289,61 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
 }
 
+/*
+ * Returns true if the buffer needs WAL flushed before it can be written out.
+ * Caller must not already hold the buffer header spinlock.
+ */
+static bool
+BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state = LockBufHdr(bufdesc);
+
+	*lsn = BufferGetLSN(bufdesc);
+
+	UnlockBufHdr(bufdesc);
+
+	/*
+	 * See buffer flushing code for more details on why we condition this on
+	 * the relation being logged.
+	 */
+	return buf_state & BM_PERMANENT && XLogNeedsFlush(*lsn);
+}
+
+
+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or NULL when there are no further buffers to consider
+ * writing out.
+ */
+static BufferDesc *
+NextStrategyBufToFlush(BufferAccessStrategy strategy,
+					   Buffer sweep_end,
+					   XLogRecPtr *lsn, int *sweep_cursor)
+{
+	Buffer		bufnum;
+	BufferDesc *bufdesc;
+
+	while ((bufnum =
+			StrategyNextBuffer(strategy, sweep_cursor)) != sweep_end)
+	{
+		/*
+		 * For BAS_BULKWRITE, once you hit an InvalidBuffer, the remaining
+		 * buffers in the ring will be invalid.
+		 */
+		if (!BufferIsValid(bufnum))
+			break;
+
+		if ((bufdesc = PrepareOrRejectEagerFlushBuffer(bufnum,
+													   InvalidBlockNumber,
+													   NULL,
+													   true,
+													   lsn)) != NULL)
+			return bufdesc;
+	}
+
+	return NULL;
+}
+
 /*
  * Prepare and write out a dirty victim buffer.
  *
@@ -4288,21 +4354,176 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
  * bufdesc may be modified.
  */
 static void
-CleanVictimBuffer(BufferDesc *bufdesc,
+CleanVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *bufdesc,
 				  bool from_ring, IOContext io_context)
 {
 	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
 	LWLock	   *content_lock;
+	bool		first_buffer = true;
 
 	/* Set up this victim buffer to be flushed */
 	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
 		return;
 
-	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	if (from_ring && StrategySupportsEagerFlush(strategy))
+	{
+		Buffer		sweep_end = BufferDescriptorGetBuffer(bufdesc);
+		int			cursor = StrategyGetCurrentIndex(strategy);
+
+		/* Clean victim buffer and find more to flush opportunistically */
+		do
+		{
+			DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+			content_lock = BufferDescriptorGetContentLock(bufdesc);
+			LWLockRelease(content_lock);
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &bufdesc->tag);
+			/* We leave the first buffer pinned for the caller */
+			if (!first_buffer)
+				UnpinBuffer(bufdesc);
+			first_buffer = false;
+		} while ((bufdesc = NextStrategyBufToFlush(strategy, sweep_end,
+												   &max_lsn, &cursor)) != NULL);
+	}
+	else
+	{
+		DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+		content_lock = BufferDescriptorGetContentLock(bufdesc);
+		LWLockRelease(content_lock);
+		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+									  &bufdesc->tag);
+	}
+}
+
+/*
+ * Prepare bufdesc for eager flushing.
+ *
+ * Given bufnum, return the buffer descriptor of the buffer to eagerly flush,
+ * pinned and locked, or NULL if this buffer does not contain a block that
+ * should be flushed.
+ *
+ * require is the BlockNumber required by the caller. Some callers may require
+ * a specific BlockNumber to be in bufnum because they are assembling a
+ * contiguous run of blocks.
+ *
+ * If the caller needs the block to be from a specific relation, rlocator will
+ * be provided.
+ */
+static BufferDesc *
+PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+								RelFileLocator *rlocator, bool skip_pinned,
+								XLogRecPtr *max_lsn)
+{
+	BufferDesc *bufdesc;
+	uint32		old_buf_state;
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	BlockNumber blknum;
+	LWLock	   *content_lock;
+
+	if (!BufferIsValid(bufnum))
+		return NULL;
+
+	Assert(!BufferIsLocal(bufnum));
+
+	bufdesc = GetBufferDescriptor(bufnum - 1);
+
+	/* Block may need to be in a specific relation */
+	if (rlocator &&
+		!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufdesc->tag),
+							  *rlocator))
+		return NULL;
+
+	/*
+	 * Ensure that there's a free refcount entry and resource owner slot for
+	 * the pin before pinning the buffer. While this may leak a refcount and
+	 * slot if we return without a buffer, we should use that slot the next
+	 * time we try to reserve a spot.
+	 */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	/*
+	 * Check whether the buffer can be used and pin it if so. Do this using a
+	 * CAS loop, to avoid having to lock the buffer header. We have to lock
+	 * the buffer header later if we succeed in pinning the buffer here, but
+	 * avoiding locking the buffer header if the buffer is in use is worth it.
+	 */
+	old_buf_state = pg_atomic_read_u32(&bufdesc->state);
+
+	for (;;)
+	{
+		buf_state = old_buf_state;
+
+		if (!(buf_state & BM_DIRTY) || !(buf_state & BM_VALID))
+			return NULL;
+
+		/* We don't eagerly flush buffers used by others */
+		if (skip_pinned &&
+			(BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+			 BUF_STATE_GET_USAGECOUNT(buf_state) > 1))
+			return NULL;
+
+		if (unlikely(buf_state & BM_LOCKED))
+		{
+			old_buf_state = WaitBufHdrUnlocked(bufdesc);
+			continue;
+		}
+
+		/* pin the buffer if the CAS succeeds */
+		buf_state += BUF_REFCOUNT_ONE;
+
+		if (pg_atomic_compare_exchange_u32(&bufdesc->state, &old_buf_state,
+										   buf_state))
+		{
+			TrackNewBufferPin(BufferDescriptorGetBuffer(bufdesc));
+			break;
+		}
+	}
+
+	CheckBufferIsPinnedOnce(bufnum);
+
+	blknum = BufferGetBlockNumber(bufnum);
+	Assert(BlockNumberIsValid(blknum));
+
+	/* We only include contiguous blocks in the run */
+	if (BlockNumberIsValid(require) && blknum != require)
+		goto except_unpin_buffer;
+
+	/* Don't eagerly flush buffers requiring WAL flush */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
 	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		goto except_unpin_buffer;
+
+	/*
+	 * Now that we have the content lock, we need to recheck if we need to
+	 * flush WAL.
+	 */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
+	/* Try to start an I/O operation */
+	if (!StartBufferIO(bufdesc, false, true))
+		goto except_unlock_content;
+
+	if (lsn > *max_lsn)
+		*max_lsn = lsn;
+
+	buf_state = LockBufHdr(bufdesc);
+	UnlockBufHdrExt(bufdesc, buf_state, 0, BM_JUST_DIRTIED, 0);
+
+	return bufdesc;
+
+except_unlock_content:
 	LWLockRelease(content_lock);
-	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-								  &bufdesc->tag);
+
+except_unpin_buffer:
+	UnpinBuffer(bufdesc);
+	return NULL;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 1465984b141..10301f4aab2 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -156,6 +156,31 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * buffers in the ring before they are needed. This can lead to better I/O
+ * patterns than lazily flushing buffers immediately before reusing them.
+ */
+bool
+StrategySupportsEagerFlush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -307,6 +332,29 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * Given a position in the ring, cursor, increment the position, and return
+ * the buffer at this position.
+ */
+Buffer
+StrategyNextBuffer(BufferAccessStrategy strategy, int *cursor)
+{
+	if (++(*cursor) >= strategy->nbuffers)
+		*cursor = 0;
+
+	return strategy->buffers[*cursor];
+}
+
+/*
+ * Return the current slot in the strategy ring.
+ */
+int
+StrategyGetCurrentIndex(BufferAccessStrategy strategy)
+{
+	return strategy->current;
+}
+
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 04fdea64f83..c07e309a288 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -506,6 +506,10 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag
 
 
 /* freelist.c */
+extern bool StrategySupportsEagerFlush(BufferAccessStrategy strategy);
+extern Buffer StrategyNextBuffer(BufferAccessStrategy strategy,
+								 int *cursor);
+extern int	StrategyGetCurrentIndex(BufferAccessStrategy strategy);
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state, bool *from_ring);
-- 
2.43.0

#11Chao Li
li.evan.chao@gmail.com
In reply to: Melanie Plageman (#10)
Re: Eagerly evict bulkwrite strategy ring

Hi Melanie,

I reviewed an earlier version of this patch, but revisiting v7, I ran into one point of confusion that I'd like to clarify.

On Nov 19, 2025, at 03:13, Melanie Plageman <melanieplageman@gmail.com> wrote:

On Mon, Nov 3, 2025 at 3:06 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:

I found an incorrect assert in CleanVictimBuffer() that was tripping
in CI. Attached v6 is updated with it removed.

v7 rebased over recent changes in bufmgr.c

- Melanie
<v7-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patch><v7-0002-Split-FlushBuffer-into-two-parts.patch><v7-0003-Eagerly-flush-bulkwrite-strategy-ring.patch>

0001 only changes the "goto" into a "for" loop, so it is not supposed to introduce any logic change.

In the old code:
```
if (strategy != NULL)
{
	XLogRecPtr	lsn;

	/* Read the LSN while holding buffer header lock */
	buf_state = LockBufHdr(buf_hdr);
	lsn = BufferGetLSN(buf_hdr);
	UnlockBufHdr(buf_hdr);

	if (XLogNeedsFlush(lsn)
		&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
	{
		LWLockRelease(content_lock);
		UnpinBuffer(buf_hdr);
		goto again;
	}
}
```
It only retries when XLogNeedsFlush(lsn) is true.

In the patch, everything is merged into StrategyRejectBuffer():
```
if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
{
	LWLockRelease(content_lock);
	UnpinBuffer(buf_hdr);
	continue;
}
```
When StrategyRejectBuffer(strategy, buf_hdr, from_ring) returns true, a retry happens. However, looking into the function:
```
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
{
	XLogRecPtr	lsn;

	if (!strategy)
		return false;

	/* We only do this in bulkread mode */
	if (strategy->btype != BAS_BULKREAD)
		return false;

	/* Don't muck with behavior of normal buffer-replacement strategy */
	if (!from_ring ||
		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
		return false;

	LockBufHdr(buf);
	lsn = BufferGetLSN(buf);
	UnlockBufHdr(buf);

	if (XLogNeedsFlush(lsn))
		return false;
```

When XLogNeedsFlush(lsn) is true, StrategyRejectBuffer returns false, so no retry happens. That is different from the old logic. Is this an intentional change?
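
To make the difference concrete, here is a minimal standalone model of the retry decision (not from the patch; plain C with no PostgreSQL headers, and it assumes the preconditions hold: a dirty, from-ring BAS_BULKREAD buffer, so the earlier early-returns in StrategyRejectBuffer() do not apply):

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Old GetVictimBuffer(): retry only if writing the buffer would require a
 * WAL flush and the strategy rejects it (true under the assumed
 * preconditions).
 */
static bool
old_retries(bool xlog_needs_flush)
{
	bool		strategy_rejects = true;	/* assumed preconditions hold */

	return xlog_needs_flush && strategy_rejects;
}

/*
 * v7 StrategyRejectBuffer() for the same buffer: the early return fires
 * exactly when a WAL flush would be needed.
 */
static bool
v7_retries(bool xlog_needs_flush)
{
	if (xlog_needs_flush)
		return false;
	return true;			/* buffer is dropped from the ring and rejected */
}

int
main(void)
{
	for (int flush = 0; flush <= 1; flush++)
		printf("XLogNeedsFlush=%d: old retries=%d, v7 retries=%d\n",
			   flush, (int) old_retries(flush), (int) v7_retries(flush));
	return 0;
}
```

It prints the opposite answer for both inputs, which is what made me wonder whether the inversion is intentional.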

Best regards,
--
Chao Li (Evan)
HighGo Software Co., Ltd.
https://www.highgo.com/

#12Melanie Plageman
melanieplageman@gmail.com
In reply to: Chao Li (#11)
3 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

On Tue, Nov 18, 2025 at 8:46 PM Chao Li <li.evan.chao@gmail.com> wrote:

When XLogNeedsFlush(lsn) is true, StrategyRejectBuffer returns false, so no retry happens. That is different from the old logic. Is this an intentional change?

No, this is a mistake. You are correct. I thought I had fixed this in
an earlier version, but somehow it is still like this.
I've gone with correcting it like this (in the attached v8):

	if (!XLogNeedsFlush(lsn))
		return false;

	/*
	 * Remove the dirty buffer from the ring; necessary to prevent an
	 * infinite loop if all ring members are dirty.
	 */
	strategy->buffers[strategy->current] = InvalidBuffer;
	return true;

But perhaps the suggestion I think you made earlier is better, dunno

	if (XLogNeedsFlush(lsn))
	{
		/*
		 * Remove the dirty buffer from the ring; necessary to prevent an
		 * infinite loop if all ring members are dirty.
		 */
		strategy->buffers[strategy->current] = InvalidBuffer;
		return true;
	}
	return false;
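
Either spelling makes the same decision, so it is purely a question of
style. As a quick sanity check, a throwaway standalone program (plain C,
no PostgreSQL headers; the two function names are invented just for the
comparison) can assert that both spellings agree for either result of
XLogNeedsFlush():

#include <assert.h>
#include <stdbool.h>

/* First spelling: bail out early when no WAL flush is needed. */
static bool
reject_style_a(bool xlog_needs_flush)
{
	if (!xlog_needs_flush)
		return false;
	return true;			/* would drop the buffer from the ring */
}

/* Second spelling: reject inside the positive branch. */
static bool
reject_style_b(bool xlog_needs_flush)
{
	if (xlog_needs_flush)
		return true;		/* would drop the buffer from the ring */
	return false;
}

int
main(void)
{
	assert(reject_style_a(false) == reject_style_b(false));
	assert(reject_style_a(true) == reject_style_b(true));
	return 0;
}

Functionally it makes no difference; the attached v8 uses the first spelling.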

- Melanie

Attachments:

v8-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patch (text/x-patch; charset=US-ASCII)
From 98b5bf6872b5c8b1e2d4f42c7d8a7e01a1c7858f Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:53:48 -0400
Subject: [PATCH v8 1/7] Refactor goto into for loop in GetVictimBuffer()

GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with
standard for loop flow control.

This commit is only a refactor and does not introduce any new
functionality.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 189 ++++++++++++--------------
 src/backend/storage/buffer/freelist.c |  19 ++-
 src/include/storage/buf_internals.h   |   5 +
 3 files changed, 110 insertions(+), 103 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 327ddb7adc8..90c24b8d93d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
 #include "utils/timestamp.h"
 
 
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
-
 /* Note: this macro only works on local buffers, not shared ones! */
 #define LocalBufHdrGetBlock(bufHdr) \
 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2331,125 +2327,116 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned pinned and owned by
-	 * this backend.
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
-	/*
-	 * We shouldn't have any other pins for this buffer.
-	 */
-	CheckBufferIsPinnedOnce(buf);
-
-	/*
-	 * If the buffer was dirty, try to write it out.  There is a race
-	 * condition here, in that someone might dirty it after we released the
-	 * buffer header lock above, or even while we are writing it out (since
-	 * our share-lock won't prevent hint-bit updates).  We will recheck the
-	 * dirty bit after re-locking the buffer header.
-	 */
-	if (buf_state & BM_DIRTY)
+	/* Select a victim buffer using an optimistic locking scheme. */
+	for (;;)
 	{
-		LWLock	   *content_lock;
 
-		Assert(buf_state & BM_TAG_VALID);
-		Assert(buf_state & BM_VALID);
+		/* Attempt to claim a victim buffer. Buffer is returned pinned. */
+		buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+		buf = BufferDescriptorGetBuffer(buf_hdr);
 
 		/*
-		 * We need a share-lock on the buffer contents to write it out (else
-		 * we might write invalid data, eg because someone else is compacting
-		 * the page contents while we write).  We must use a conditional lock
-		 * acquisition here to avoid deadlock.  Even though the buffer was not
-		 * pinned (and therefore surely not locked) when StrategyGetBuffer
-		 * returned it, someone else could have pinned and exclusive-locked it
-		 * by the time we get here. If we try to get the lock unconditionally,
-		 * we'd block waiting for them; if they later block waiting for us,
-		 * deadlock ensues. (This has been observed to happen when two
-		 * backends are both trying to split btree index pages, and the second
-		 * one just happens to be trying to split the page the first one got
-		 * from StrategyGetBuffer.)
+		 * We shouldn't have any other pins for this buffer.
 		 */
-		content_lock = BufferDescriptorGetContentLock(buf_hdr);
-		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
-		{
-			/*
-			 * Someone else has locked the buffer, so give it up and loop back
-			 * to get another one.
-			 */
-			UnpinBuffer(buf_hdr);
-			goto again;
-		}
+		CheckBufferIsPinnedOnce(buf);
 
 		/*
-		 * If using a nondefault strategy, and writing the buffer would
-		 * require a WAL flush, let the strategy decide whether to go ahead
-		 * and write/reuse the buffer or to choose another victim.  We need a
-		 * lock to inspect the page LSN, so this can't be done inside
-		 * StrategyGetBuffer.
+		 * If the buffer was dirty, try to write it out.  There is a race
+		 * condition here, in that someone might dirty it after we released
+		 * the buffer header lock above, or even while we are writing it out
+		 * (since our share-lock won't prevent hint-bit updates).  We will
+		 * recheck the dirty bit after re-locking the buffer header.
 		 */
-		if (strategy != NULL)
+		if (buf_state & BM_DIRTY)
 		{
-			XLogRecPtr	lsn;
+			LWLock	   *content_lock;
 
-			/* Read the LSN while holding buffer header lock */
-			buf_state = LockBufHdr(buf_hdr);
-			lsn = BufferGetLSN(buf_hdr);
-			UnlockBufHdr(buf_hdr);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(buf_state & BM_VALID);
+
+			/*
+			 * We need a share-lock on the buffer contents to write it out
+			 * (else we might write invalid data, eg because someone else is
+			 * compacting the page contents while we write).  We must use a
+			 * conditional lock acquisition here to avoid deadlock.  Even
+			 * though the buffer was not pinned (and therefore surely not
+			 * locked) when StrategyGetBuffer returned it, someone else could
+			 * have pinned and exclusive-locked it by the time we get here. If
+			 * we try to get the lock unconditionally, we'd block waiting for
+			 * them; if they later block waiting for us, deadlock ensues.
+			 * (This has been observed to happen when two backends are both
+			 * trying to split btree index pages, and the second one just
+			 * happens to be trying to split the page the first one got from
+			 * StrategyGetBuffer.)
+			 */
+			content_lock = BufferDescriptorGetContentLock(buf_hdr);
+			if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+			{
+				/*
+				 * Someone else has locked the buffer, so give it up and loop
+				 * back to get another one.
+				 */
+				UnpinBuffer(buf_hdr);
+				continue;
+			}
 
-			if (XLogNeedsFlush(lsn)
-				&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+			/*
+			 * If using a nondefault strategy, and writing the buffer would
+			 * require a WAL flush, let the strategy decide whether to go
+			 * ahead and write/reuse the buffer or to choose another victim.
+			 * We need the content lock to inspect the page LSN, so this can't
+			 * be done inside StrategyGetBuffer.
+			 */
+			if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
 			{
 				LWLockRelease(content_lock);
 				UnpinBuffer(buf_hdr);
-				goto again;
+				continue;
 			}
-		}
 
-		/* OK, do the I/O */
-		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LWLockRelease(content_lock);
+			/* OK, do the I/O */
+			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+			LWLockRelease(content_lock);
 
-		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-									  &buf_hdr->tag);
-	}
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &buf_hdr->tag);
+		}
 
 
-	if (buf_state & BM_VALID)
-	{
+		if (buf_state & BM_VALID)
+		{
+			/*
+			 * When a BufferAccessStrategy is in use, blocks evicted from
+			 * shared buffers are counted as IOOP_EVICT in the corresponding
+			 * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+			 * by a strategy in two cases: 1) while initially claiming buffers
+			 * for the strategy ring 2) to replace an existing strategy ring
+			 * buffer because it is pinned or in use and cannot be reused.
+			 *
+			 * Blocks evicted from buffers already in the strategy ring are
+			 * counted as IOOP_REUSE in the corresponding strategy context.
+			 *
+			 * At this point, we can accurately count evictions and reuses,
+			 * because we have successfully claimed the valid buffer.
+			 * Previously, we may have been forced to release the buffer due
+			 * to concurrent pinners or erroring out.
+			 */
+			pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+							   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+		}
+
 		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer. Previously,
-		 * we may have been forced to release the buffer due to concurrent
-		 * pinners or erroring out.
+		 * If the buffer has an entry in the buffer mapping table, delete it.
+		 * This can fail because another backend could have pinned or dirtied
+		 * the buffer.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+		if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+		{
+			UnpinBuffer(buf_hdr);
+			continue;
+		}
 
-	/*
-	 * If the buffer has an entry in the buffer mapping table, delete it. This
-	 * can fail because another backend could have pinned or dirtied the
-	 * buffer.
-	 */
-	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
-	{
-		UnpinBuffer(buf_hdr);
-		goto again;
+		break;
 	}
 
 	/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 28d952b3534..acbabeb3c3b 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
@@ -780,12 +781,20 @@ IOContextForStrategy(BufferAccessStrategy strategy)
  * be written out and doing so would require flushing WAL too.  This gives us
  * a chance to choose a different victim.
  *
+ * The buffer must be pinned and content locked and the buffer header spinlock
+ * must not be held. We must hold the content lock to examine the LSN.
+ *
  * Returns true if buffer manager should ask for a new victim, and false
  * if this buffer should be written and re-used.
  */
 bool
 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
+	XLogRecPtr	lsn;
+
+	if (!strategy)
+		return false;
+
 	/* We only do this in bulkread mode */
 	if (strategy->btype != BAS_BULKREAD)
 		return false;
@@ -795,11 +804,17 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
 		return false;
 
+	LockBufHdr(buf);
+	lsn = BufferGetLSN(buf);
+	UnlockBufHdr(buf);
+
+	if (!XLogNeedsFlush(lsn))
+		return false;
+
 	/*
-	 * Remove the dirty buffer from the ring; necessary to prevent infinite
+	 * Remove the dirty buffer from the ring; necessary to prevent an infinite
 	 * loop if all ring members are dirty.
 	 */
 	strategy->buffers[strategy->current] = InvalidBuffer;
-
 	return true;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 5400c56a965..04fdea64f83 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -486,6 +486,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 /*
  * Internal buffer management routines
  */
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
+
 /* bufmgr.c */
 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
-- 
2.43.0

v8-0002-Split-FlushBuffer-into-two-parts.patch (text/x-patch; charset=US-ASCII)
From 7bf89e41ee440bf7515d49058abdc3c6bc639828 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:54:19 -0400
Subject: [PATCH v8 2/7] Split FlushBuffer() into two parts

Before adding write combining to write a batch of blocks when flushing
dirty buffers, refactor FlushBuffer() into the preparatory step and
actual buffer flushing step. This separation provides symmetry with
future code for batch flushing which necessarily separates these steps,
as it must prepare multiple buffers before flushing them together.

These steps are moved into a new FlushBuffer() helper function,
CleanVictimBuffer(), which will contain both the batch flushing and
single flush code in future commits.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 142 +++++++++++++++++++---------
 1 file changed, 98 insertions(+), 44 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 90c24b8d93d..235e261738b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -533,6 +533,12 @@ static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
+						  IOObject io_object, IOContext io_context,
+						  XLogRecPtr buffer_lsn);
+static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
+							  IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2394,12 +2400,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			/* OK, do the I/O */
-			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-			LWLockRelease(content_lock);
-
-			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-										  &buf_hdr->tag);
+			/* Content lock is released inside CleanVictimBuffer */
+			CleanVictimBuffer(buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4270,54 +4272,64 @@ static void
 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 			IOContext io_context)
 {
-	XLogRecPtr	recptr;
-	ErrorContextCallback errcallback;
-	instr_time	io_start;
-	Block		bufBlock;
-	char	   *bufToWrite;
-	uint32		buf_state;
+	XLogRecPtr	lsn;
 
-	/*
-	 * Try to start an I/O operation.  If StartBufferIO returns false, then
-	 * someone else flushed the buffer before we could, so we need not do
-	 * anything.
-	 */
-	if (!StartBufferIO(buf, false, false))
-		return;
+	if (PrepareFlushBuffer(buf, &lsn))
+		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
+}
 
-	/* Setup error traceback support for ereport() */
-	errcallback.callback = shared_buffer_write_error_callback;
-	errcallback.arg = buf;
-	errcallback.previous = error_context_stack;
-	error_context_stack = &errcallback;
+/*
+ * Prepare and write out a dirty victim buffer.
+ *
+ * Buffer must be pinned, the content lock must be held exclusively, and the
+ * buffer header spinlock must not be held. The exclusive lock is released and
+ * the buffer is returned pinned but not locked.
+ *
+ * bufdesc may be modified.
+ */
+static void
+CleanVictimBuffer(BufferDesc *bufdesc,
+				  bool from_ring, IOContext io_context)
+{
+	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
+	LWLock	   *content_lock;
 
-	/* Find smgr relation for buffer */
-	if (reln == NULL)
-		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+	/* Set up this victim buffer to be flushed */
+	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
+		return;
 
-	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
-										buf->tag.blockNum,
-										reln->smgr_rlocator.locator.spcOid,
-										reln->smgr_rlocator.locator.dbOid,
-										reln->smgr_rlocator.locator.relNumber);
+	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	LWLockRelease(content_lock);
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
 
-	buf_state = LockBufHdr(buf);
+/*
+ * Prepare the buffer with bufdesc for writing. Returns true if the buffer
+ * actually needs writing and false otherwise. lsn returns the buffer's LSN if
+ * the table is logged.
+ */
+static bool
+PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state;
 
 	/*
-	 * Run PageGetLSN while holding header lock, since we don't have the
-	 * buffer locked exclusively in all cases.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
-	recptr = BufferGetLSN(buf);
+	if (!StartBufferIO(bufdesc, false, false))
+		return false;
 
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	UnlockBufHdrExt(buf, buf_state,
-					0, BM_JUST_DIRTIED,
-					0);
+	*lsn = InvalidXLogRecPtr;
+	buf_state = LockBufHdr(bufdesc);
 
 	/*
-	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
-	 * rule that log updates must hit disk before any of the data-file changes
-	 * they describe do.
+	 * Record the buffer's LSN. We will force XLOG flush up to buffer's LSN.
+	 * This implements the basic WAL rule that log updates must hit disk
+	 * before any of the data-file changes they describe do.
 	 *
 	 * However, this rule does not apply to unlogged relations, which will be
 	 * lost after a crash anyway.  Most unlogged relation pages do not bear
@@ -4330,9 +4342,51 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * happen, attempting to flush WAL through that location would fail, with
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
+	 *
+	 * We must hold the buffer header lock when examining the page LSN since
+	 * we don't have the buffer exclusively locked in all cases.
 	 */
 	if (buf_state & BM_PERMANENT)
-		XLogFlush(recptr);
+		*lsn = BufferGetLSN(bufdesc);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	UnlockBufHdrExt(bufdesc, buf_state,
+					0, BM_JUST_DIRTIED,
+					0);
+	return true;
+}
+
+/*
+ * Actually do the write I/O to clean a buffer. buf and reln may be modified.
+ */
+static void
+DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+			  IOContext io_context, XLogRecPtr buffer_lsn)
+{
+	ErrorContextCallback errcallback;
+	instr_time	io_start;
+	Block		bufBlock;
+	char	   *bufToWrite;
+
+	/* Setup error traceback support for ereport() */
+	errcallback.callback = shared_buffer_write_error_callback;
+	errcallback.arg = buf;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* Find smgr relation for buffer */
+	if (reln == NULL)
+		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+
+	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
+										buf->tag.blockNum,
+										reln->smgr_rlocator.locator.spcOid,
+										reln->smgr_rlocator.locator.dbOid,
+										reln->smgr_rlocator.locator.relNumber);
+
+	/* Force XLOG flush up to buffer's LSN */
+	if (XLogRecPtrIsValid(buffer_lsn))
+		XLogFlush(buffer_lsn);
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
-- 
2.43.0

v8-0003-Eagerly-flush-bulkwrite-strategy-ring.patch (text/x-patch; charset=US-ASCII)
From cc26976f233bc9bb02915c8dbd6deb5dc28244ea Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 13:15:43 -0400
Subject: [PATCH v8 3/7] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse them. By
eagerly flushing the buffers in a larger run, we encourage larger writes
at the kernel level and less interleaving of WAL flushes and data file
writes. The effect is mainly noticeable with multiple parallel COPY
FROMs. In this case, client backends achieve higher write throughput and
end up spending less time waiting on acquiring the lock to flush WAL.
Larger flush operations also mean less time waiting for flush operations
at the kernel level.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which do not require a WAL flush.

This patch also is a step toward AIO writes, as it lines up multiple
buffers that can be issued asynchronously once the infrastructure
exists.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Earlier version Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 237 +++++++++++++++++++++++++-
 src/backend/storage/buffer/freelist.c |  48 ++++++
 src/include/storage/buf_internals.h   |   4 +
 3 files changed, 281 insertions(+), 8 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 235e261738b..57a3eae865e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -531,14 +531,25 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
 static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
+
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
-static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *NextStrategyBufToFlush(BufferAccessStrategy strategy,
+										  Buffer sweep_end,
+										  XLogRecPtr *lsn, int *sweep_cursor);
+
+static bool BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+												   RelFileLocator *rlocator, bool skip_pinned,
+												   XLogRecPtr *max_lsn);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc,
+							   XLogRecPtr *lsn);
 static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						  IOObject io_object, IOContext io_context,
 						  XLogRecPtr buffer_lsn);
-static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
-							  IOContext io_context);
+static void CleanVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *bufdesc,
+							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2401,7 +2412,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 			}
 
 			/* Content lock is released inside CleanVictimBuffer */
-			CleanVictimBuffer(buf_hdr, from_ring, io_context);
+			CleanVictimBuffer(strategy, buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4278,6 +4289,61 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
 }
 
+/*
+ * Returns true if the buffer needs WAL flushed before it can be written out.
+ * Caller must not already hold the buffer header spinlock.
+ */
+static bool
+BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state = LockBufHdr(bufdesc);
+
+	*lsn = BufferGetLSN(bufdesc);
+
+	UnlockBufHdr(bufdesc);
+
+	/*
+	 * See buffer flushing code for more details on why we condition this on
+	 * the relation being logged.
+	 */
+	return buf_state & BM_PERMANENT && XLogNeedsFlush(*lsn);
+}
+
+
+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or NULL when there are no further buffers to consider
+ * writing out.
+ */
+static BufferDesc *
+NextStrategyBufToFlush(BufferAccessStrategy strategy,
+					   Buffer sweep_end,
+					   XLogRecPtr *lsn, int *sweep_cursor)
+{
+	Buffer		bufnum;
+	BufferDesc *bufdesc;
+
+	while ((bufnum =
+			StrategyNextBuffer(strategy, sweep_cursor)) != sweep_end)
+	{
+		/*
+		 * For BAS_BULKWRITE, once you hit an InvalidBuffer, the remaining
+		 * buffers in the ring will be invalid.
+		 */
+		if (!BufferIsValid(bufnum))
+			break;
+
+		if ((bufdesc = PrepareOrRejectEagerFlushBuffer(bufnum,
+													   InvalidBlockNumber,
+													   NULL,
+													   true,
+													   lsn)) != NULL)
+			return bufdesc;
+	}
+
+	return NULL;
+}
+
 /*
  * Prepare and write out a dirty victim buffer.
  *
@@ -4288,21 +4354,176 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
  * bufdesc may be modified.
  */
 static void
-CleanVictimBuffer(BufferDesc *bufdesc,
+CleanVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *bufdesc,
 				  bool from_ring, IOContext io_context)
 {
 	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
 	LWLock	   *content_lock;
+	bool		first_buffer = true;
 
 	/* Set up this victim buffer to be flushed */
 	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
 		return;
 
-	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	if (from_ring && StrategySupportsEagerFlush(strategy))
+	{
+		Buffer		sweep_end = BufferDescriptorGetBuffer(bufdesc);
+		int			cursor = StrategyGetCurrentIndex(strategy);
+
+		/* Clean victim buffer and find more to flush opportunistically */
+		do
+		{
+			DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+			content_lock = BufferDescriptorGetContentLock(bufdesc);
+			LWLockRelease(content_lock);
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &bufdesc->tag);
+			/* We leave the first buffer pinned for the caller */
+			if (!first_buffer)
+				UnpinBuffer(bufdesc);
+			first_buffer = false;
+		} while ((bufdesc = NextStrategyBufToFlush(strategy, sweep_end,
+												   &max_lsn, &cursor)) != NULL);
+	}
+	else
+	{
+		DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+		content_lock = BufferDescriptorGetContentLock(bufdesc);
+		LWLockRelease(content_lock);
+		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+									  &bufdesc->tag);
+	}
+}
+
+/*
+ * Prepare bufdesc for eager flushing.
+ *
+ * Given bufnum, return the buffer descriptor of the buffer to eagerly flush,
+ * pinned and locked, or NULL if this buffer does not contain a block that
+ * should be flushed.
+ *
+ * require is the BlockNumber required by the caller. Some callers may require
+ * a specific BlockNumber to be in bufnum because they are assembling a
+ * contiguous run of blocks.
+ *
+ * If the caller needs the block to be from a specific relation, rlocator will
+ * be provided.
+ */
+static BufferDesc *
+PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+								RelFileLocator *rlocator, bool skip_pinned,
+								XLogRecPtr *max_lsn)
+{
+	BufferDesc *bufdesc;
+	uint32		old_buf_state;
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	BlockNumber blknum;
+	LWLock	   *content_lock;
+
+	if (!BufferIsValid(bufnum))
+		return NULL;
+
+	Assert(!BufferIsLocal(bufnum));
+
+	bufdesc = GetBufferDescriptor(bufnum - 1);
+
+	/* Block may need to be in a specific relation */
+	if (rlocator &&
+		!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufdesc->tag),
+							  *rlocator))
+		return NULL;
+
+	/*
+	 * Ensure that there's a free refcount entry and resource owner slot for
+	 * the pin before pinning the buffer. While this may leak a refcount and
+	 * slot if we return without a buffer, we should use that slot the next
+	 * time we try to reserve a spot.
+	 */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	/*
+	 * Check whether the buffer can be used and pin it if so. Do this using a
+	 * CAS loop, to avoid having to lock the buffer header. We have to lock
+	 * the buffer header later if we succeed in pinning the buffer here, but
+	 * avoiding locking the buffer header if the buffer is in use is worth it.
+	 */
+	old_buf_state = pg_atomic_read_u32(&bufdesc->state);
+
+	for (;;)
+	{
+		buf_state = old_buf_state;
+
+		if (!(buf_state & BM_DIRTY) || !(buf_state & BM_VALID))
+			return NULL;
+
+		/* We don't eagerly flush buffers used by others */
+		if (skip_pinned &&
+			(BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+			 BUF_STATE_GET_USAGECOUNT(buf_state) > 1))
+			return NULL;
+
+		if (unlikely(buf_state & BM_LOCKED))
+		{
+			old_buf_state = WaitBufHdrUnlocked(bufdesc);
+			continue;
+		}
+
+		/* pin the buffer if the CAS succeeds */
+		buf_state += BUF_REFCOUNT_ONE;
+
+		if (pg_atomic_compare_exchange_u32(&bufdesc->state, &old_buf_state,
+										   buf_state))
+		{
+			TrackNewBufferPin(BufferDescriptorGetBuffer(bufdesc));
+			break;
+		}
+	}
+
+	CheckBufferIsPinnedOnce(bufnum);
+
+	blknum = BufferGetBlockNumber(bufnum);
+	Assert(BlockNumberIsValid(blknum));
+
+	/* We only include contiguous blocks in the run */
+	if (BlockNumberIsValid(require) && blknum != require)
+		goto except_unpin_buffer;
+
+	/* Don't eagerly flush buffers requiring WAL flush */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
 	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		goto except_unpin_buffer;
+
+	/*
+	 * Now that we have the content lock, we need to recheck if we need to
+	 * flush WAL.
+	 */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
+	/* Try to start an I/O operation */
+	if (!StartBufferIO(bufdesc, false, true))
+		goto except_unlock_content;
+
+	if (lsn > *max_lsn)
+		*max_lsn = lsn;
+
+	buf_state = LockBufHdr(bufdesc);
+	UnlockBufHdrExt(bufdesc, buf_state, 0, BM_JUST_DIRTIED, 0);
+
+	return bufdesc;
+
+except_unlock_content:
 	LWLockRelease(content_lock);
-	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-								  &bufdesc->tag);
+
+except_unpin_buffer:
+	UnpinBuffer(bufdesc);
+	return NULL;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index acbabeb3c3b..4a3009d190c 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -156,6 +156,31 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * buffers in the ring before they are needed. This can lead to better I/O
+ * patterns than lazily flushing buffers immediately before reusing them.
+ */
+bool
+StrategySupportsEagerFlush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -307,6 +332,29 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * Given a position in the ring, cursor, advance the position and return
+ * the buffer at the new position.
+ */
+Buffer
+StrategyNextBuffer(BufferAccessStrategy strategy, int *cursor)
+{
+	if (++(*cursor) >= strategy->nbuffers)
+		*cursor = 0;
+
+	return strategy->buffers[*cursor];
+}
+
+/*
+ * Return the current slot in the strategy ring.
+ */
+int
+StrategyGetCurrentIndex(BufferAccessStrategy strategy)
+{
+	return strategy->current;
+}
+
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 04fdea64f83..c07e309a288 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -506,6 +506,10 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag
 
 
 /* freelist.c */
+extern bool StrategySupportsEagerFlush(BufferAccessStrategy strategy);
+extern Buffer StrategyNextBuffer(BufferAccessStrategy strategy,
+								 int *cursor);
+extern int	StrategyGetCurrentIndex(BufferAccessStrategy strategy);
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state, bool *from_ring);
-- 
2.43.0

#13Melanie Plageman
melanieplageman@gmail.com
In reply to: Melanie Plageman (#12)
3 attachment(s)
Re: Eagerly evict bulkwrite strategy ring

Chao Li found a mistake I made in not releasing the buffer content
lock in one code path, which he reported in another thread [1]. This
patch set shares some patches with that thread, so attached is a
corrected v9; the fix is sketched below.

- Melanie

[1]: /messages/by-id/20E76846-4816-45BC-84F0-D8836D9AFA4C@gmail.com
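
For anyone skimming the thread: the code path in question is the early
return in CleanVictimBuffer() when PrepareFlushBuffer() finds nothing
to flush, and v9 now releases the content lock before returning (see
0002 below). A minimal sketch of the corrected path, using the
function names from the patch:

    /* Set up this victim buffer to be flushed */
    if (!PrepareFlushBuffer(bufdesc, &max_lsn))
    {
        /*
         * Someone else flushed the buffer before we could, so there is
         * nothing for us to write. Release the content lock we still
         * hold before returning, rather than leaving it held.
         */
        LWLockRelease(BufferDescriptorGetContentLock(bufdesc));
        return;
    }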

Attachments:

v9-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patch (text/x-patch)
From 171fc1e7c2d4a1d4b8fccf338a1203d7fcff0b7f Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:53:48 -0400
Subject: [PATCH v9 1/7] Refactor goto into for loop in GetVictimBuffer()

GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with
standard for loop flow control.

This commit is only a refactor and does not introduce any new
functionality.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 189 ++++++++++++--------------
 src/backend/storage/buffer/freelist.c |  19 ++-
 src/include/storage/buf_internals.h   |   5 +
 3 files changed, 110 insertions(+), 103 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 327ddb7adc8..90c24b8d93d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
 #include "utils/timestamp.h"
 
 
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
-
 /* Note: this macro only works on local buffers, not shared ones! */
 #define LocalBufHdrGetBlock(bufHdr) \
 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2331,125 +2327,116 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned pinned and owned by
-	 * this backend.
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
-	/*
-	 * We shouldn't have any other pins for this buffer.
-	 */
-	CheckBufferIsPinnedOnce(buf);
-
-	/*
-	 * If the buffer was dirty, try to write it out.  There is a race
-	 * condition here, in that someone might dirty it after we released the
-	 * buffer header lock above, or even while we are writing it out (since
-	 * our share-lock won't prevent hint-bit updates).  We will recheck the
-	 * dirty bit after re-locking the buffer header.
-	 */
-	if (buf_state & BM_DIRTY)
+	/* Select a victim buffer using an optimistic locking scheme. */
+	for (;;)
 	{
-		LWLock	   *content_lock;
 
-		Assert(buf_state & BM_TAG_VALID);
-		Assert(buf_state & BM_VALID);
+		/* Attempt to claim a victim buffer. Buffer is returned pinned. */
+		buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+		buf = BufferDescriptorGetBuffer(buf_hdr);
 
 		/*
-		 * We need a share-lock on the buffer contents to write it out (else
-		 * we might write invalid data, eg because someone else is compacting
-		 * the page contents while we write).  We must use a conditional lock
-		 * acquisition here to avoid deadlock.  Even though the buffer was not
-		 * pinned (and therefore surely not locked) when StrategyGetBuffer
-		 * returned it, someone else could have pinned and exclusive-locked it
-		 * by the time we get here. If we try to get the lock unconditionally,
-		 * we'd block waiting for them; if they later block waiting for us,
-		 * deadlock ensues. (This has been observed to happen when two
-		 * backends are both trying to split btree index pages, and the second
-		 * one just happens to be trying to split the page the first one got
-		 * from StrategyGetBuffer.)
+		 * We shouldn't have any other pins for this buffer.
 		 */
-		content_lock = BufferDescriptorGetContentLock(buf_hdr);
-		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
-		{
-			/*
-			 * Someone else has locked the buffer, so give it up and loop back
-			 * to get another one.
-			 */
-			UnpinBuffer(buf_hdr);
-			goto again;
-		}
+		CheckBufferIsPinnedOnce(buf);
 
 		/*
-		 * If using a nondefault strategy, and writing the buffer would
-		 * require a WAL flush, let the strategy decide whether to go ahead
-		 * and write/reuse the buffer or to choose another victim.  We need a
-		 * lock to inspect the page LSN, so this can't be done inside
-		 * StrategyGetBuffer.
+		 * If the buffer was dirty, try to write it out.  There is a race
+		 * condition here, in that someone might dirty it after we released
+		 * the buffer header lock above, or even while we are writing it out
+		 * (since our share-lock won't prevent hint-bit updates).  We will
+		 * recheck the dirty bit after re-locking the buffer header.
 		 */
-		if (strategy != NULL)
+		if (buf_state & BM_DIRTY)
 		{
-			XLogRecPtr	lsn;
+			LWLock	   *content_lock;
 
-			/* Read the LSN while holding buffer header lock */
-			buf_state = LockBufHdr(buf_hdr);
-			lsn = BufferGetLSN(buf_hdr);
-			UnlockBufHdr(buf_hdr);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(buf_state & BM_VALID);
+
+			/*
+			 * We need a share-lock on the buffer contents to write it out
+			 * (else we might write invalid data, eg because someone else is
+			 * compacting the page contents while we write).  We must use a
+			 * conditional lock acquisition here to avoid deadlock.  Even
+			 * though the buffer was not pinned (and therefore surely not
+			 * locked) when StrategyGetBuffer returned it, someone else could
+			 * have pinned and exclusive-locked it by the time we get here. If
+			 * we try to get the lock unconditionally, we'd block waiting for
+			 * them; if they later block waiting for us, deadlock ensues.
+			 * (This has been observed to happen when two backends are both
+			 * trying to split btree index pages, and the second one just
+			 * happens to be trying to split the page the first one got from
+			 * StrategyGetBuffer.)
+			 */
+			content_lock = BufferDescriptorGetContentLock(buf_hdr);
+			if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+			{
+				/*
+				 * Someone else has locked the buffer, so give it up and loop
+				 * back to get another one.
+				 */
+				UnpinBuffer(buf_hdr);
+				continue;
+			}
 
-			if (XLogNeedsFlush(lsn)
-				&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+			/*
+			 * If using a nondefault strategy, and writing the buffer would
+			 * require a WAL flush, let the strategy decide whether to go
+			 * ahead and write/reuse the buffer or to choose another victim.
+			 * We need the content lock to inspect the page LSN, so this can't
+			 * be done inside StrategyGetBuffer.
+			 */
+			if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
 			{
 				LWLockRelease(content_lock);
 				UnpinBuffer(buf_hdr);
-				goto again;
+				continue;
 			}
-		}
 
-		/* OK, do the I/O */
-		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LWLockRelease(content_lock);
+			/* OK, do the I/O */
+			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+			LWLockRelease(content_lock);
 
-		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-									  &buf_hdr->tag);
-	}
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &buf_hdr->tag);
+		}
 
 
-	if (buf_state & BM_VALID)
-	{
+		if (buf_state & BM_VALID)
+		{
+			/*
+			 * When a BufferAccessStrategy is in use, blocks evicted from
+			 * shared buffers are counted as IOOP_EVICT in the corresponding
+			 * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+			 * by a strategy in two cases: 1) while initially claiming buffers
+			 * for the strategy ring 2) to replace an existing strategy ring
+			 * buffer because it is pinned or in use and cannot be reused.
+			 *
+			 * Blocks evicted from buffers already in the strategy ring are
+			 * counted as IOOP_REUSE in the corresponding strategy context.
+			 *
+			 * At this point, we can accurately count evictions and reuses,
+			 * because we have successfully claimed the valid buffer.
+			 * Previously, we may have been forced to release the buffer due
+			 * to concurrent pinners or erroring out.
+			 */
+			pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+							   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+		}
+
 		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer. Previously,
-		 * we may have been forced to release the buffer due to concurrent
-		 * pinners or erroring out.
+		 * If the buffer has an entry in the buffer mapping table, delete it.
+		 * This can fail because another backend could have pinned or dirtied
+		 * the buffer.
 		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+		if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+		{
+			UnpinBuffer(buf_hdr);
+			continue;
+		}
 
-	/*
-	 * If the buffer has an entry in the buffer mapping table, delete it. This
-	 * can fail because another backend could have pinned or dirtied the
-	 * buffer.
-	 */
-	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
-	{
-		UnpinBuffer(buf_hdr);
-		goto again;
+		break;
 	}
 
 	/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 28d952b3534..acbabeb3c3b 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
@@ -780,12 +781,20 @@ IOContextForStrategy(BufferAccessStrategy strategy)
  * be written out and doing so would require flushing WAL too.  This gives us
  * a chance to choose a different victim.
  *
+ * The buffer must be pinned and content locked and the buffer header spinlock
+ * must not be held. We must hold the content lock to examine the LSN.
+ *
  * Returns true if buffer manager should ask for a new victim, and false
  * if this buffer should be written and re-used.
  */
 bool
 StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
+	XLogRecPtr	lsn;
+
+	if (!strategy)
+		return false;
+
 	/* We only do this in bulkread mode */
 	if (strategy->btype != BAS_BULKREAD)
 		return false;
@@ -795,11 +804,17 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
 		return false;
 
+	LockBufHdr(buf);
+	lsn = BufferGetLSN(buf);
+	UnlockBufHdr(buf);
+
+	if (!XLogNeedsFlush(lsn))
+		return false;
+
 	/*
-	 * Remove the dirty buffer from the ring; necessary to prevent infinite
+	 * Remove the dirty buffer from the ring; necessary to prevent an infinite
 	 * loop if all ring members are dirty.
 	 */
 	strategy->buffers[strategy->current] = InvalidBuffer;
-
 	return true;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 5400c56a965..04fdea64f83 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -486,6 +486,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 /*
  * Internal buffer management routines
  */
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
+
 /* bufmgr.c */
 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
-- 
2.43.0

v9-0002-Split-FlushBuffer-into-two-parts.patch (text/x-patch)
From 8d2ec82321118daaabbac64785f4de5f7ef9d298 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 10:54:19 -0400
Subject: [PATCH v9 2/7] Split FlushBuffer() into two parts

Before adding write combining to write a batch of blocks when flushing
dirty buffers, refactor FlushBuffer() into the preparatory step and
actual buffer flushing step. This separation provides symmetry with
future code for batch flushing which necessarily separates these steps,
as it must prepare multiple buffers before flushing them together.

These steps are moved into a new FlushBuffer() helper function,
CleanVictimBuffer(), which will contain both the batch flushing and
single flush code in future commits.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 143 +++++++++++++++++++---------
 1 file changed, 99 insertions(+), 44 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 90c24b8d93d..c88ff08493c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -533,6 +533,12 @@ static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
+						  IOObject io_object, IOContext io_context,
+						  XLogRecPtr buffer_lsn);
+static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
+							  IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2394,12 +2400,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 				continue;
 			}
 
-			/* OK, do the I/O */
-			FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-			LWLockRelease(content_lock);
-
-			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-										  &buf_hdr->tag);
+			/* Content lock is released inside CleanVictimBuffer */
+			CleanVictimBuffer(buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4270,54 +4272,65 @@ static void
 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 			IOContext io_context)
 {
-	XLogRecPtr	recptr;
-	ErrorContextCallback errcallback;
-	instr_time	io_start;
-	Block		bufBlock;
-	char	   *bufToWrite;
-	uint32		buf_state;
+	XLogRecPtr	lsn;
 
-	/*
-	 * Try to start an I/O operation.  If StartBufferIO returns false, then
-	 * someone else flushed the buffer before we could, so we need not do
-	 * anything.
-	 */
-	if (!StartBufferIO(buf, false, false))
-		return;
+	if (PrepareFlushBuffer(buf, &lsn))
+		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
+}
 
-	/* Setup error traceback support for ereport() */
-	errcallback.callback = shared_buffer_write_error_callback;
-	errcallback.arg = buf;
-	errcallback.previous = error_context_stack;
-	error_context_stack = &errcallback;
+/*
+ * Prepare and write out a dirty victim buffer.
+ *
+ * Buffer must be pinned, the content lock must be held, and the buffer header
+ * spinlock must not be held. The content lock is released and the buffer is
+ * returned pinned but not locked.
+ *
+ * bufdesc may be modified.
+ */
+static void
+CleanVictimBuffer(BufferDesc *bufdesc,
+				  bool from_ring, IOContext io_context)
+{
+	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
 
-	/* Find smgr relation for buffer */
-	if (reln == NULL)
-		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+	/* Set up this victim buffer to be flushed */
+	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
+	{
+		LWLockRelease(BufferDescriptorGetContentLock(bufdesc));
+		return;
+	}
 
-	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
-										buf->tag.blockNum,
-										reln->smgr_rlocator.locator.spcOid,
-										reln->smgr_rlocator.locator.dbOid,
-										reln->smgr_rlocator.locator.relNumber);
+	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+	LWLockRelease(BufferDescriptorGetContentLock(bufdesc));
+	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+								  &bufdesc->tag);
+}
 
-	buf_state = LockBufHdr(buf);
+/*
+ * Prepare the buffer with bufdesc for writing. Returns true if the buffer
+ * actually needs writing and false otherwise. lsn returns the buffer's LSN if
+ * the table is logged.
+ */
+static bool
+PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state;
 
 	/*
-	 * Run PageGetLSN while holding header lock, since we don't have the
-	 * buffer locked exclusively in all cases.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
-	recptr = BufferGetLSN(buf);
+	if (!StartBufferIO(bufdesc, false, false))
+		return false;
 
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	UnlockBufHdrExt(buf, buf_state,
-					0, BM_JUST_DIRTIED,
-					0);
+	*lsn = InvalidXLogRecPtr;
+	buf_state = LockBufHdr(bufdesc);
 
 	/*
-	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
-	 * rule that log updates must hit disk before any of the data-file changes
-	 * they describe do.
+	 * Record the buffer's LSN. We will force XLOG flush up to buffer's LSN.
+	 * This implements the basic WAL rule that log updates must hit disk
+	 * before any of the data-file changes they describe do.
 	 *
 	 * However, this rule does not apply to unlogged relations, which will be
 	 * lost after a crash anyway.  Most unlogged relation pages do not bear
@@ -4330,9 +4343,51 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * happen, attempting to flush WAL through that location would fail, with
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
+	 *
+	 * We must hold the buffer header lock when examining the page LSN since
+	 * we don't have the buffer exclusively locked in all cases.
 	 */
 	if (buf_state & BM_PERMANENT)
-		XLogFlush(recptr);
+		*lsn = BufferGetLSN(bufdesc);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	UnlockBufHdrExt(bufdesc, buf_state,
+					0, BM_JUST_DIRTIED,
+					0);
+	return true;
+}
+
+/*
+ * Actually do the write I/O to clean a buffer. buf and reln may be modified.
+ */
+static void
+DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+			  IOContext io_context, XLogRecPtr buffer_lsn)
+{
+	ErrorContextCallback errcallback;
+	instr_time	io_start;
+	Block		bufBlock;
+	char	   *bufToWrite;
+
+	/* Setup error traceback support for ereport() */
+	errcallback.callback = shared_buffer_write_error_callback;
+	errcallback.arg = buf;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
+	/* Find smgr relation for buffer */
+	if (reln == NULL)
+		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
+
+	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
+										buf->tag.blockNum,
+										reln->smgr_rlocator.locator.spcOid,
+										reln->smgr_rlocator.locator.dbOid,
+										reln->smgr_rlocator.locator.relNumber);
+
+	/* Force XLOG flush up to buffer's LSN */
+	if (XLogRecPtrIsValid(buffer_lsn))
+		XLogFlush(buffer_lsn);
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
-- 
2.43.0

v9-0003-Eagerly-flush-bulkwrite-strategy-ring.patch (text/x-patch)
From ddb6b8e78b56d331707c22c7d37af4087182e83c Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 15 Oct 2025 13:15:43 -0400
Subject: [PATCH v9 3/7] Eagerly flush bulkwrite strategy ring

Operations using BAS_BULKWRITE (COPY FROM and createdb) will inevitably
need to flush buffers in the strategy ring in order to reuse them. By
eagerly flushing the buffers in a larger run, we encourage larger writes
at the kernel level and less interleaving of WAL flushes and data file
writes. The effect is mainly noticeable with multiple parallel COPY
FROMs. In this case, client backends achieve higher write throughput and
end up spending less time waiting on acquiring the lock to flush WAL.
Larger flush operations also mean less time waiting for flush operations
at the kernel level.

The heuristic for eager eviction is to only flush buffers in the
strategy ring which do not require a WAL flush.

This patch is also a step toward AIO writes, as it lines up multiple
buffers that can be issued asynchronously once the infrastructure
exists.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Earlier version Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   | 249 +++++++++++++++++++++++++-
 src/backend/storage/buffer/freelist.c |  48 +++++
 src/include/storage/buf_internals.h   |   4 +
 3 files changed, 292 insertions(+), 9 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index c88ff08493c..9b39129da42 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -531,14 +531,25 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
 static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
 								IOObject io_object, IOContext io_context);
+
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						IOObject io_object, IOContext io_context);
-static bool PrepareFlushBuffer(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *NextStrategyBufToFlush(BufferAccessStrategy strategy,
+										  Buffer sweep_end,
+										  XLogRecPtr *lsn, int *sweep_cursor);
+
+static bool BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn);
+static BufferDesc *PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+												   RelFileLocator *rlocator, bool skip_pinned,
+												   XLogRecPtr *max_lsn);
+static bool PrepareFlushBuffer(BufferDesc *bufdesc,
+							   XLogRecPtr *lsn);
 static void DoFlushBuffer(BufferDesc *buf, SMgrRelation reln,
 						  IOObject io_object, IOContext io_context,
 						  XLogRecPtr buffer_lsn);
-static void CleanVictimBuffer(BufferDesc *bufdesc, bool from_ring,
-							  IOContext io_context);
+static void CleanVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *bufdesc,
+							  bool from_ring, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
 									   BlockNumber nForkBlock,
@@ -2401,7 +2412,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 			}
 
 			/* Content lock is released inside CleanVictimBuffer */
-			CleanVictimBuffer(buf_hdr, from_ring, io_context);
+			CleanVictimBuffer(strategy, buf_hdr, from_ring, io_context);
 		}
 
 
@@ -4278,6 +4289,69 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 		DoFlushBuffer(buf, reln, io_object, io_context, lsn);
 }
 
+/*
+ * Returns true if the buffer needs WAL flushed before it can be written out.
+ * Caller must not already hold the buffer header spinlock. If the buffer is
+ * unlogged, *lsn shouldn't be used by the caller and is set to
+ * InvalidXLogRecPtr.
+ */
+static bool
+BufferNeedsWALFlush(BufferDesc *bufdesc, XLogRecPtr *lsn)
+{
+	uint32		buf_state;
+
+	buf_state = LockBufHdr(bufdesc);
+	*lsn = BufferGetLSN(bufdesc);
+	UnlockBufHdr(bufdesc);
+
+	/*
+	 * See buffer flushing code for more details on why we condition this on
+	 * the relation being logged.
+	 */
+	if (!(buf_state & BM_PERMANENT))
+	{
+		*lsn = InvalidXLogRecPtr;
+		return false;
+	}
+
+	return XLogNeedsFlush(*lsn);
+}
+
+
+/*
+ * Returns the buffer descriptor of the buffer containing the next block we
+ * should eagerly flush or NULL when there are no further buffers to consider
+ * writing out.
+ */
+static BufferDesc *
+NextStrategyBufToFlush(BufferAccessStrategy strategy,
+					   Buffer sweep_end,
+					   XLogRecPtr *lsn, int *sweep_cursor)
+{
+	Buffer		bufnum;
+	BufferDesc *bufdesc;
+
+	while ((bufnum =
+			StrategyNextBuffer(strategy, sweep_cursor)) != sweep_end)
+	{
+		/*
+		 * For BAS_BULKWRITE, once you hit an InvalidBuffer, the remaining
+		 * buffers in the ring will be invalid.
+		 */
+		if (!BufferIsValid(bufnum))
+			break;
+
+		if ((bufdesc = PrepareOrRejectEagerFlushBuffer(bufnum,
+													   InvalidBlockNumber,
+													   NULL,
+													   true,
+													   lsn)) != NULL)
+			return bufdesc;
+	}
+
+	return NULL;
+}
+
 /*
  * Prepare and write out a dirty victim buffer.
  *
@@ -4288,10 +4362,12 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
  * bufdesc may be modified.
  */
 static void
-CleanVictimBuffer(BufferDesc *bufdesc,
+CleanVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *bufdesc,
 				  bool from_ring, IOContext io_context)
 {
 	XLogRecPtr	max_lsn = InvalidXLogRecPtr;
+	bool		first_buffer = true;
 
 	/* Set up this victim buffer to be flushed */
 	if (!PrepareFlushBuffer(bufdesc, &max_lsn))
@@ -4300,10 +4376,162 @@ CleanVictimBuffer(BufferDesc *bufdesc,
 		return;
 	}
 
-	DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
-	LWLockRelease(BufferDescriptorGetContentLock(bufdesc));
-	ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
-								  &bufdesc->tag);
+	if (from_ring && StrategySupportsEagerFlush(strategy))
+	{
+		Buffer		sweep_end = BufferDescriptorGetBuffer(bufdesc);
+		int			cursor = StrategyGetCurrentIndex(strategy);
+
+		/* Clean victim buffer and find more to flush opportunistically */
+		do
+		{
+			DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+			LWLockRelease(BufferDescriptorGetContentLock(bufdesc));
+			ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+										  &bufdesc->tag);
+			/* We leave the first buffer pinned for the caller */
+			if (!first_buffer)
+				UnpinBuffer(bufdesc);
+			first_buffer = false;
+		} while ((bufdesc = NextStrategyBufToFlush(strategy, sweep_end,
+												   &max_lsn, &cursor)) != NULL);
+	}
+	else
+	{
+		DoFlushBuffer(bufdesc, NULL, IOOBJECT_RELATION, io_context, max_lsn);
+		LWLockRelease(BufferDescriptorGetContentLock(bufdesc));
+		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+									  &bufdesc->tag);
+	}
+}
+
+/*
+ * Prepare bufdesc for eager flushing.
+ *
+ * Given bufnum, return the buffer descriptor of the buffer to eagerly flush,
+ * pinned and locked, or NULL if this buffer does not contain a block that
+ * should be flushed.
+ *
+ * require is the BlockNumber required by the caller. Some callers may require
+ * a specific BlockNumber to be in bufnum because they are assembling a
+ * contiguous run of blocks.
+ *
+ * If the caller needs the block to be from a specific relation, rlocator will
+ * be provided.
+ */
+static BufferDesc *
+PrepareOrRejectEagerFlushBuffer(Buffer bufnum, BlockNumber require,
+								RelFileLocator *rlocator, bool skip_pinned,
+								XLogRecPtr *max_lsn)
+{
+	BufferDesc *bufdesc;
+	uint32		old_buf_state;
+	uint32		buf_state;
+	XLogRecPtr	lsn;
+	BlockNumber blknum;
+	LWLock	   *content_lock;
+
+	if (!BufferIsValid(bufnum))
+		return NULL;
+
+	Assert(!BufferIsLocal(bufnum));
+
+	bufdesc = GetBufferDescriptor(bufnum - 1);
+
+	/* Block may need to be in a specific relation */
+	if (rlocator &&
+		!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufdesc->tag),
+							  *rlocator))
+		return NULL;
+
+	/*
+	 * Ensure that there's a free refcount entry and resource owner slot for
+	 * the pin before pinning the buffer. While this may leak a refcount and
+	 * slot if we return without a buffer, we should use that slot the next
+	 * time we try to reserve a spot.
+	 */
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+	ReservePrivateRefCountEntry();
+
+	/*
+	 * Check whether the buffer can be used and pin it if so. Do this using a
+	 * CAS loop, to avoid having to lock the buffer header. We have to lock
+	 * the buffer header later if we succeed in pinning the buffer here, but
+	 * avoiding locking the buffer header if the buffer is in use is worth it.
+	 */
+	old_buf_state = pg_atomic_read_u32(&bufdesc->state);
+
+	for (;;)
+	{
+		buf_state = old_buf_state;
+
+		if (!(buf_state & BM_DIRTY) || !(buf_state & BM_VALID))
+			return NULL;
+
+		/* We don't eagerly flush buffers used by others */
+		if (skip_pinned &&
+			(BUF_STATE_GET_REFCOUNT(buf_state) > 0 ||
+			 BUF_STATE_GET_USAGECOUNT(buf_state) > 1))
+			return NULL;
+
+		if (unlikely(buf_state & BM_LOCKED))
+		{
+			old_buf_state = WaitBufHdrUnlocked(bufdesc);
+			continue;
+		}
+
+		/* pin the buffer if the CAS succeeds */
+		buf_state += BUF_REFCOUNT_ONE;
+
+		if (pg_atomic_compare_exchange_u32(&bufdesc->state, &old_buf_state,
+										   buf_state))
+		{
+			TrackNewBufferPin(BufferDescriptorGetBuffer(bufdesc));
+			break;
+		}
+	}
+
+	CheckBufferIsPinnedOnce(bufnum);
+
+	blknum = BufferGetBlockNumber(bufnum);
+	Assert(BlockNumberIsValid(blknum));
+
+	/* We only include contiguous blocks in the run */
+	if (BlockNumberIsValid(require) && blknum != require)
+		goto except_unpin_buffer;
+
+	/* Don't eagerly flush buffers requiring WAL flush */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
+	content_lock = BufferDescriptorGetContentLock(bufdesc);
+	if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		goto except_unpin_buffer;
+
+	/*
+	 * Now that we have the content lock, we need to recheck if we need to
+	 * flush WAL.
+	 */
+	if (BufferNeedsWALFlush(bufdesc, &lsn))
+		goto except_unpin_buffer;
+
+	/* Try to start an I/O operation */
+	if (!StartBufferIO(bufdesc, false, true))
+		goto except_unlock_content;
+
+	if (lsn > *max_lsn)
+		*max_lsn = lsn;
+
+	buf_state = LockBufHdr(bufdesc);
+	UnlockBufHdrExt(bufdesc, buf_state, 0, BM_JUST_DIRTIED, 0);
+
+	return bufdesc;
+
+except_unlock_content:
+	LWLockRelease(content_lock);
+
+except_unpin_buffer:
+	UnpinBuffer(bufdesc);
+	return NULL;
 }
 
 /*
@@ -4387,7 +4615,10 @@ DoFlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 
 	/* Force XLOG flush up to buffer's LSN */
 	if (XLogRecPtrIsValid(buffer_lsn))
+	{
+		Assert(pg_atomic_read_u32(&buf->state) & BM_PERMANENT);
 		XLogFlush(buffer_lsn);
+	}
 
 	/*
 	 * Now it's safe to write the buffer to disk. Note that no one else should
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index acbabeb3c3b..4a3009d190c 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -156,6 +156,31 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * Some BufferAccessStrategies support eager flushing -- which is flushing
+ * buffers in the ring before they are needed. This can lead to better I/O
+ * patterns than lazily flushing buffers immediately before reusing them.
+ */
+bool
+StrategySupportsEagerFlush(BufferAccessStrategy strategy)
+{
+	Assert(strategy);
+
+	switch (strategy->btype)
+	{
+		case BAS_BULKWRITE:
+			return true;
+		case BAS_VACUUM:
+		case BAS_NORMAL:
+		case BAS_BULKREAD:
+			return false;
+		default:
+			elog(ERROR, "unrecognized buffer access strategy: %d",
+				 (int) strategy->btype);
+			return false;
+	}
+}
+
 /*
  * StrategyGetBuffer
  *
@@ -307,6 +332,29 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * Given a position in the ring, cursor, advance the position and return
+ * the buffer at the new position.
+ */
+Buffer
+StrategyNextBuffer(BufferAccessStrategy strategy, int *cursor)
+{
+	if (++(*cursor) >= strategy->nbuffers)
+		*cursor = 0;
+
+	return strategy->buffers[*cursor];
+}
+
+/*
+ * Return the current slot in the strategy ring.
+ */
+int
+StrategyGetCurrentIndex(BufferAccessStrategy strategy)
+{
+	return strategy->current;
+}
+
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 04fdea64f83..c07e309a288 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -506,6 +506,10 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag
 
 
 /* freelist.c */
+extern bool StrategySupportsEagerFlush(BufferAccessStrategy strategy);
+extern Buffer StrategyNextBuffer(BufferAccessStrategy strategy,
+								 int *cursor);
+extern int	StrategyGetCurrentIndex(BufferAccessStrategy strategy);
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state, bool *from_ring);
-- 
2.43.0