Using read stream in autoprewarm
Hi,
I am working on using the read stream in autoprewarm. I observed ~10%
performance gain with this change. The patch is attached.
The downside of the read stream approach is that a new read stream
object needs to be created for each database, relation and fork. I was
wondering if this would cause a regression, but it did not (at least
according to the results of my testing). Another downside could be the
code getting more complicated.
For the testing,
- I created 50 databases, each of them having 50 tables, and the
size of each table is 520KB.
- patched: 51157 ms
- master: 56769 ms
- I created 5 databases, each of them having 1 table, and the size
of each table is 3GB.
- patched: 32679 ms
- master: 36706 ms
I put a debugging message with timing information in the
autoprewarm_database_main() function, then ran autoprewarm 100 times
(by restarting the server) and cleared the OS cache before each
restart. Also, I ensured that the block number of the buffer returned
from the read stream API is correct. I am not sure if that much
testing is enough for this kind of change.
Any feedback would be appreciated.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
v1-0001-Use-read-stream-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v1-0001-Use-read-stream-in-autoprewarm.patchDownload
From c5e286612912ba6840d967812171162a948153e4 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Wed, 7 Aug 2024 17:27:50 +0300
Subject: [PATCH v1] Use read stream in autoprewarm
Instead of reading blocks with ReadBufferExtended(), create read stream
object for each possible case and use it.
This change provides about 10% performance improvement.
---
contrib/pg_prewarm/autoprewarm.c | 102 +++++++++++++++++++++++++++++--
1 file changed, 97 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index d061731706a..96e93c46f85 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -44,6 +44,7 @@
#include "storage/lwlock.h"
#include "storage/proc.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
@@ -429,6 +430,58 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ bool first_block;
+ int max_pos;
+ int pos;
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks_in_fork;
+
+};
+
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ bool *rs_have_free_buffer = per_buffer_data;
+ BlockInfoRecord *old_blk;
+ BlockInfoRecord *cur_blk;
+
+ *rs_have_free_buffer = true;
+
+ if (!have_free_buffer())
+ {
+ *rs_have_free_buffer = false;
+ return InvalidBlockNumber;
+ }
+
+ if (p->pos == p->max_pos)
+ return InvalidBlockNumber;
+
+ if (p->first_block)
+ {
+ p->first_block = false;
+ return p->block_info[p->pos++].blocknum;
+ }
+
+ old_blk = &(p->block_info[p->pos - 1]);
+ cur_blk = &(p->block_info[p->pos]);
+
+ if (old_blk->database == cur_blk->database &&
+ old_blk->forknum == cur_blk->forknum &&
+ old_blk->filenumber == cur_blk->filenumber &&
+ cur_blk->blocknum < p->nblocks_in_fork)
+ {
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -442,6 +495,9 @@ autoprewarm_database_main(Datum main_arg)
BlockNumber nblocks = 0;
BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ ReadStream *stream = NULL;
+ struct apw_read_stream_private p;
+ bool *rs_have_free_buffer;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -458,13 +514,16 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ p.block_info = block_info;
+ p.max_pos = apw_state->prewarm_stop_idx;
+
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ for (; pos < apw_state->prewarm_stop_idx; pos++)
{
- BlockInfoRecord *blk = &block_info[pos++];
+ BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
CHECK_FOR_INTERRUPTS();
@@ -477,6 +536,18 @@ autoprewarm_database_main(Datum main_arg)
old_blk->database != 0)
break;
+ /*
+ * If stream needs to be created again, end it before closing the old
+ * relation.
+ */
+ if (stream && (old_blk == NULL ||
+ old_blk->filenumber != blk->filenumber ||
+ old_blk->forknum != blk->forknum))
+ {
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
@@ -513,7 +584,10 @@ autoprewarm_database_main(Datum main_arg)
continue;
}
- /* Once per fork, check for fork existence and size. */
+ /*
+ * Once per fork, check for fork existence and size. Then create read
+ * stream if it is suitable.
+ */
if (old_blk == NULL ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
@@ -525,7 +599,21 @@ autoprewarm_database_main(Datum main_arg)
if (blk->forknum > InvalidForkNumber &&
blk->forknum <= MAX_FORKNUM &&
smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+
+ /* Create read stream. */
+ p.nblocks_in_fork = nblocks;
+ p.pos = pos;
+ p.first_block = true;
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ blk->forknum,
+ apw_read_stream_next_block,
+ &p,
+ sizeof(bool));
+ }
else
nblocks = 0;
}
@@ -539,16 +627,20 @@ autoprewarm_database_main(Datum main_arg)
}
/* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
+ buf = read_stream_next_buffer(stream, (void **) &rs_have_free_buffer);
if (BufferIsValid(buf))
{
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+ /* There are no free buffers left in shared buffers, break the loop. */
+ else if (!(*rs_have_free_buffer))
+ break;
old_blk = blk;
}
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
dsm_detach(seg);
--
2.45.2
On 8 Aug 2024, at 11:32, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
Any feedback would be appreciated.
I've taken a look at the patch. It seems to me that you keep adding new block numbers to the read stream as long as there are free buffers. So when there are no more free buffers, you will still have some queued blocks.
Maybe you can change the logic so that the number of free buffers must be enough to allocate all blocks within the look-ahead distance?
Thanks!
Best regards, Andrey Borodin.
Dear Nazir,
At a quick first look, it looks good. I will take a closer look at it
tomorrow. Could you please let me know about the performance tests and
graphics?
Best regards, Stepan Neretin!
Hi,
Thanks for looking into this!
On Thu, 31 Oct 2024 at 21:18, Andrey M. Borodin <x4mmm@yandex-team.ru> wrote:
On 8 Aug 2024, at 11:32, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
Any feedback would be appreciated.
I've taken a look at the patch. It seems to me that you keep adding new block numbers to the read stream as long as there are free buffers. So when there are no more free buffers, you will still have some queued blocks.
Maybe you can change the logic so that the number of free buffers must be enough to allocate all blocks within the look-ahead distance?
I see what you mean. When the have_free_buffer() function returns
false in the callback, there are still queued blocks in the stream
although there are no free buffers in the buffer pool. I think the
best way to solve this is to get the number of free buffers in the
buffer pool by 'BufferStrategyControl.lastFreeBuffer -
BufferStrategyControl.firstFreeBuffer' and then compare it with
'stream->pending_read_nblocks'. When the 'stream->pending_read_nblocks
== number_of_free_buffers_in_buffer_pool', end the stream. The problem
with that is stream->pending_read_nblocks isn't public, also I am not
sure whether 'BufferStrategyControl.lastFreeBuffer -
BufferStrategyControl.firstFreeBuffer' is safe to use.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Hi,
Thanks for looking into this!
On Thu, 31 Oct 2024 at 22:02, Stepan Neretin <sndcppg@gmail.com> wrote:
At a quick first look, it looks good. I will take a closer look at it tomorrow. Could you please let me know about the performance tests and graphics?
Sorry but I didn't understand what you mean by performance tests and
graphics. Do you want something else than the information in the first
email [1]?
[1]: postgr.es/m/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
--
Regards,
Nazir Bilal Yavuz
Microsoft
On 1 Nov 2024, at 12:51, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
am not
sure whether 'BufferStrategyControl.lastFreeBuffer -
BufferStrategyControl.firstFreeBuffer' is safe to use.
Ugh... it will work. But it seems to me too dirty a hack. There's no scalable way to know the size of the free list.
Let's just add a comment that we might read some more buffers if the database does not fit into memory?
Alternatively, we can count the size of the free list at the start.
Best regards, Andrey Borodin.
Hi,
On Fri, 1 Nov 2024 at 21:06, Andrey M. Borodin <x4mmm@yandex-team.ru> wrote:
On 1 Nov 2024, at 12:51, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
am not
sure whether 'BufferStrategyControl.lastFreeBuffer -
BufferStrategyControl.firstFreeBuffer' is safe to use.
Ugh... it will work. But it seems to me too dirty a hack. There's no scalable way to know the size of the free list.
Let's just add a comment that we might read some more buffers if the database does not fit into memory?
Alternatively, we can count the size of the free list at the start.
I agree that it is too dirty a hack. There is a minor problem with
counting the size of the free list at the start. There may be other
processes that fill the buffer pool concurrently, so we can still end
up doing unnecessary I/Os. That said, I believe this approach remains
an improvement.
The first patch includes the comment you suggested, and the second
patch implements counting the size of the free list at the start.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
v2-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchtext/x-patch; charset=US-ASCII; name=v2-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchDownload
From 2905f74049f77ab9c4f406dfe28b29ef4eafb7b4 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 5 Nov 2024 13:00:11 +0300
Subject: [PATCH v2 2/2] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
src/include/storage/buf_internals.h | 1 +
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
contrib/pg_prewarm/autoprewarm.c | 4 ++--
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index eda6c699212..d95050adb23 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -432,6 +432,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index dffdd57e9b5..6cad8540bd9 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuosly changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 8291bf3c427..16174bf3d0f 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -518,13 +518,13 @@ autoprewarm_database_main(Datum main_arg)
pos = apw_state->prewarm_start_idx;
p.block_info = block_info;
- p.max_pos = apw_state->prewarm_stop_idx;
+ p.max_pos = Min(apw_state->prewarm_stop_idx, pos + get_number_of_free_buffers());
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- for (; pos < apw_state->prewarm_stop_idx; pos++)
+ for (; pos < p.max_pos; pos++)
{
BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
--
2.45.2
v2-0001-Use-read-stream-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v2-0001-Use-read-stream-in-autoprewarm.patchDownload
From bcdb55f237945072cb5740392515af701cfdc1d2 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 5 Nov 2024 11:40:14 +0300
Subject: [PATCH v2 1/2] Use read stream in autoprewarm
Instead of reading blocks with ReadBufferExtended(), create read stream
object for each possible case and use it.
This change provides about 10% performance improvement.
---
contrib/pg_prewarm/autoprewarm.c | 112 +++++++++++++++++++++++++++++--
1 file changed, 107 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index fac4051e1aa..8291bf3c427 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -422,6 +423,68 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ bool first_block;
+ int max_pos;
+ int pos;
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks_in_fork;
+
+};
+
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ bool *rs_have_free_buffer = per_buffer_data;
+ BlockInfoRecord *old_blk;
+ BlockInfoRecord *cur_blk;
+
+ *rs_have_free_buffer = true;
+
+ /*
+ * There may still be queued blocks in the stream even when no free
+ * buffers are available in the buffer pool. This can lead to unnecessary
+ * I/O operations and buffer evictions. One possible solution is to
+ * compare the number of free buffers in the buffer pool with the number
+ * of queued blocks in the stream. However, this approach is considered a
+ * workaround and would add complexity with minimal benefit, as only a few
+ * unnecessary I/O operations and buffer evictions are expected.
+ * Therefore, this solution has not been implemented.
+ */
+ if (!have_free_buffer())
+ {
+ *rs_have_free_buffer = false;
+ return InvalidBlockNumber;
+ }
+
+ if (p->pos == p->max_pos)
+ return InvalidBlockNumber;
+
+ if (p->first_block)
+ {
+ p->first_block = false;
+ return p->block_info[p->pos++].blocknum;
+ }
+
+ old_blk = &(p->block_info[p->pos - 1]);
+ cur_blk = &(p->block_info[p->pos]);
+
+ if (old_blk->database == cur_blk->database &&
+ old_blk->forknum == cur_blk->forknum &&
+ old_blk->filenumber == cur_blk->filenumber &&
+ cur_blk->blocknum < p->nblocks_in_fork)
+ {
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -435,6 +498,9 @@ autoprewarm_database_main(Datum main_arg)
BlockNumber nblocks = 0;
BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ ReadStream *stream = NULL;
+ struct apw_read_stream_private p;
+ bool *rs_have_free_buffer;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,13 +517,16 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ p.block_info = block_info;
+ p.max_pos = apw_state->prewarm_stop_idx;
+
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ for (; pos < apw_state->prewarm_stop_idx; pos++)
{
- BlockInfoRecord *blk = &block_info[pos++];
+ BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
CHECK_FOR_INTERRUPTS();
@@ -470,6 +539,18 @@ autoprewarm_database_main(Datum main_arg)
old_blk->database != 0)
break;
+ /*
+ * If stream needs to be created again, end it before closing the old
+ * relation.
+ */
+ if (stream && (old_blk == NULL ||
+ old_blk->filenumber != blk->filenumber ||
+ old_blk->forknum != blk->forknum))
+ {
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
@@ -506,7 +587,10 @@ autoprewarm_database_main(Datum main_arg)
continue;
}
- /* Once per fork, check for fork existence and size. */
+ /*
+ * Once per fork, check for fork existence and size. Then create read
+ * stream if it is suitable.
+ */
if (old_blk == NULL ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
@@ -518,7 +602,21 @@ autoprewarm_database_main(Datum main_arg)
if (blk->forknum > InvalidForkNumber &&
blk->forknum <= MAX_FORKNUM &&
smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+
+ /* Create read stream. */
+ p.nblocks_in_fork = nblocks;
+ p.pos = pos;
+ p.first_block = true;
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ blk->forknum,
+ apw_read_stream_next_block,
+ &p,
+ sizeof(bool));
+ }
else
nblocks = 0;
}
@@ -532,16 +630,20 @@ autoprewarm_database_main(Datum main_arg)
}
/* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
+ buf = read_stream_next_buffer(stream, (void **) &rs_have_free_buffer);
if (BufferIsValid(buf))
{
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+ /* There are no free buffers left in shared buffers, break the loop. */
+ else if (!(*rs_have_free_buffer))
+ break;
old_blk = blk;
}
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
dsm_detach(seg);
--
2.45.2
Hi,
New reviewer here, trying to understand more about the read stream API.
On Tuesday, November 26th, 2024 at 11:07 AM, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
Any feedback would be appreciated.
I've executed the same test of 5 databases with each of them having 1 table of
3GB of size and I've got very similar results.
I've also tested using a single database with 4 tables of ~60GB in size, and
the results were closer to master but still showed an improvement. Note
that I've also increased the default shared_buffers to 7GB to see how it works
with large buffer pools.
- patched: 5.4259 s
- master: 5.53186 s
Not too much to say about the code; I'm currently learning more about the read
stream API and PostgreSQL hacking itself. Just some minor points and questions
about the patches.
v2-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patch
--- a/src/backend/storage/buffer/freelist.c
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuosly changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
typo on continuosly -> continuously
v2-0001-Use-read-stream-in-autoprewarm.patch
+ bool *rs_have_free_buffer = per_buffer_data;
+
+
+ *rs_have_free_buffer = true;
+
Not sure if I understand why this variable is needed; it seems that it is only
written and never read? Just as a comparison, the block_range_read_stream_cb
callback used in pg_prewarm does not seem to use the per_buffer_data parameter.
--
Matheus Alcantara
EDB: https://www.enterprisedb.com
Hi,
Thank you for looking into this!
On Wed, 27 Nov 2024 at 16:50, Matheus Alcantara <mths.dev@pm.me> wrote:
I've executed the same test of 5 databases with each of them having 1 table of
3GB of size and I've got very similar results.
I've also tested using a single database with 4 tables of ~60GB in size, and
the results were closer to master but still showed an improvement. Note
that I've also increased the default shared_buffers to 7GB to see how it works
with large buffer pools.
- patched: 5.4259 s
- master: 5.53186 s
Thanks for the testing.
Not too much to say about the code; I'm currently learning more about the read
stream API and PostgreSQL hacking itself. Just some minor points and questions
about the patches.
v2-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patch
--- a/src/backend/storage/buffer/freelist.c
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuosly changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
typo on continuosly -> continuously
Done.
v2-0001-Use-read-stream-in-autoprewarm.patch
+ bool *rs_have_free_buffer = per_buffer_data;
+
+
+ *rs_have_free_buffer = true;
+
Not sure if I understand why this variable is needed; it seems that it is only
written and never read? Just as a comparison, the block_range_read_stream_cb
callback used in pg_prewarm does not seem to use the per_buffer_data parameter.
Actually, it is read in the main loop of the
autoprewarm_database_main() function:
/* There are no free buffers left in shared buffers, break the loop. */
else if (!(*rs_have_free_buffer))
break;
apw_read_stream_next_block() callback function sets
rs_have_free_buffer's value to false when there is no free buffer left
in the shared buffers. And the code above terminates the main loop in
the autoprewarm_database_main() function when it is set to false.
block_range_read_stream_cb() callback is used when the callback only
needs to loop over the block numbers. However, for the autoprewarm
case; the callback function needs to do additional checks so another
callback and the use of this variable are required.
v3 is attached.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
v3-0001-Use-read-stream-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v3-0001-Use-read-stream-in-autoprewarm.patchDownload
From f5a8faa6ac3f836784c15b776633e7729d95378d Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 5 Nov 2024 11:40:14 +0300
Subject: [PATCH v3 1/2] Use read stream in autoprewarm
Instead of reading blocks with ReadBufferExtended(), create read stream
object for each possible case and use it.
This change provides about 10% performance improvement.
---
contrib/pg_prewarm/autoprewarm.c | 112 +++++++++++++++++++++++++++++--
1 file changed, 107 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index fac4051e1aa..8291bf3c427 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -422,6 +423,68 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ bool first_block;
+ int max_pos;
+ int pos;
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks_in_fork;
+
+};
+
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ bool *rs_have_free_buffer = per_buffer_data;
+ BlockInfoRecord *old_blk;
+ BlockInfoRecord *cur_blk;
+
+ *rs_have_free_buffer = true;
+
+ /*
+ * There may still be queued blocks in the stream even when no free
+ * buffers are available in the buffer pool. This can lead to unnecessary
+ * I/O operations and buffer evictions. One possible solution is to
+ * compare the number of free buffers in the buffer pool with the number
+ * of queued blocks in the stream. However, this approach is considered a
+ * workaround and would add complexity with minimal benefit, as only a few
+ * unnecessary I/O operations and buffer evictions are expected.
+ * Therefore, this solution has not been implemented.
+ */
+ if (!have_free_buffer())
+ {
+ *rs_have_free_buffer = false;
+ return InvalidBlockNumber;
+ }
+
+ if (p->pos == p->max_pos)
+ return InvalidBlockNumber;
+
+ if (p->first_block)
+ {
+ p->first_block = false;
+ return p->block_info[p->pos++].blocknum;
+ }
+
+ old_blk = &(p->block_info[p->pos - 1]);
+ cur_blk = &(p->block_info[p->pos]);
+
+ if (old_blk->database == cur_blk->database &&
+ old_blk->forknum == cur_blk->forknum &&
+ old_blk->filenumber == cur_blk->filenumber &&
+ cur_blk->blocknum < p->nblocks_in_fork)
+ {
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -435,6 +498,9 @@ autoprewarm_database_main(Datum main_arg)
BlockNumber nblocks = 0;
BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ ReadStream *stream = NULL;
+ struct apw_read_stream_private p;
+ bool *rs_have_free_buffer;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,13 +517,16 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ p.block_info = block_info;
+ p.max_pos = apw_state->prewarm_stop_idx;
+
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ for (; pos < apw_state->prewarm_stop_idx; pos++)
{
- BlockInfoRecord *blk = &block_info[pos++];
+ BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
CHECK_FOR_INTERRUPTS();
@@ -470,6 +539,18 @@ autoprewarm_database_main(Datum main_arg)
old_blk->database != 0)
break;
+ /*
+ * If stream needs to be created again, end it before closing the old
+ * relation.
+ */
+ if (stream && (old_blk == NULL ||
+ old_blk->filenumber != blk->filenumber ||
+ old_blk->forknum != blk->forknum))
+ {
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
@@ -506,7 +587,10 @@ autoprewarm_database_main(Datum main_arg)
continue;
}
- /* Once per fork, check for fork existence and size. */
+ /*
+ * Once per fork, check for fork existence and size. Then create read
+ * stream if it is suitable.
+ */
if (old_blk == NULL ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
@@ -518,7 +602,21 @@ autoprewarm_database_main(Datum main_arg)
if (blk->forknum > InvalidForkNumber &&
blk->forknum <= MAX_FORKNUM &&
smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+
+ /* Create read stream. */
+ p.nblocks_in_fork = nblocks;
+ p.pos = pos;
+ p.first_block = true;
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ blk->forknum,
+ apw_read_stream_next_block,
+ &p,
+ sizeof(bool));
+ }
else
nblocks = 0;
}
@@ -532,16 +630,20 @@ autoprewarm_database_main(Datum main_arg)
}
/* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
+ buf = read_stream_next_buffer(stream, (void **) &rs_have_free_buffer);
if (BufferIsValid(buf))
{
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+ /* There are no free buffers left in shared buffers, break the loop. */
+ else if (!(*rs_have_free_buffer))
+ break;
old_blk = blk;
}
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
dsm_detach(seg);
--
2.45.2
v3-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchtext/x-patch; charset=US-ASCII; name=v3-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchDownload
From 15345de76b1006cb421d84abe3b03fbe60c754e4 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Wed, 27 Nov 2024 16:53:58 +0300
Subject: [PATCH v3 2/2] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
src/include/storage/buf_internals.h | 1 +
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
contrib/pg_prewarm/autoprewarm.c | 4 ++--
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index eda6c699212..d95050adb23 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -432,6 +432,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index dffdd57e9b5..3259e6280cf 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuously changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 8291bf3c427..16174bf3d0f 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -518,13 +518,13 @@ autoprewarm_database_main(Datum main_arg)
pos = apw_state->prewarm_start_idx;
p.block_info = block_info;
- p.max_pos = apw_state->prewarm_stop_idx;
+ p.max_pos = Min(apw_state->prewarm_stop_idx, pos + get_number_of_free_buffers());
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- for (; pos < apw_state->prewarm_stop_idx; pos++)
+ for (; pos < p.max_pos; pos++)
{
BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
--
2.45.2
On Wednesday, November 27th, 2024 at 11:19 AM, Nazir Bilal Yavuz
<byavuz81@gmail.com> wrote:
v2-0001-Use-read-stream-in-autoprewarm.patch
+ bool *rs_have_free_buffer = per_buffer_data;
+
+
+ *rs_have_free_buffer = true;
+
Not sure if I understand why this variable is needed, it seems that
it is only
written and never read? Just as comparison, the
block_range_read_stream_cb
callback used on pg_prewarm seems to not use the per_buffer_data
parameter.
Actually, it is read in the main loop of the
autoprewarm_database_main() function:
/* There are no free buffers left in shared buffers, break the loop. */
else if (!(*rs_have_free_buffer))
break;
apw_read_stream_next_block() callback function sets
rs_have_free_buffer's value to false when there is no free buffer left
in the shared buffers. And the code above terminates the main loop in
the autoprewarm_database_main() function when it is set to false.
block_range_read_stream_cb() callback is used when the callback only
needs to loop over the block numbers. However, for the autoprewarm
case; the callback function needs to do additional checks so another
callback and the use of this variable are required.
Ohh, I see, thanks very much for the explanation.
v3 is attached.
Thanks.
I don't know if there is another way that this patch could be tested?
Looking
forward on other reviews on this.
--
Matheus Alcantara
EDB: https://www.enterprisedb.com
On Wed, 27 Nov 2024 at 19:20, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
Hi,
Thank you for looking into this!
On Wed, 27 Nov 2024 at 16:50, Matheus Alcantara <mths.dev@pm.me> wrote:
I've executed the same test of 5 databases with each of them having 1 table of
3GB of size and I've got very similar results.
I've also tested using a single database with 4 tables with ~60GB of size and
the results compared with master were closer but still an improvement. Note
that I've also increased the default shared_buffers to 7GB to see how it works
with large buffer pools.
- patched: 5.4259 s
- master: 5.53186 s
Thanks for the testing.
Not too much to say about the code, I'm currently learning more about the read
stream API and PostgreSQL hacking itself. Just some minor points and questions
about the patches.
v2-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patch
--- a/src/backend/storage/buffer/freelist.c
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuosly changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
typo on continuosly -> continuously
Done.
v2-0001-Use-read-stream-in-autoprewarm.patch
+ bool *rs_have_free_buffer = per_buffer_data;
+
+
+ *rs_have_free_buffer = true;
+
Not sure if I understand why this variable is needed, it seems that it is only
written and never read? Just as comparison, the block_range_read_stream_cb
callback used on pg_prewarm seems to not use the per_buffer_data parameter.
Actually, it is read in the main loop of the
autoprewarm_database_main() function:
/* There are no free buffers left in shared buffers, break the loop. */
else if (!(*rs_have_free_buffer))
break;
apw_read_stream_next_block() callback function sets
rs_have_free_buffer's value to false when there is no free buffer left
in the shared buffers. And the code above terminates the main loop in
the autoprewarm_database_main() function when it is set to false.
block_range_read_stream_cb() callback is used when the callback only
needs to loop over the block numbers. However, for the autoprewarm
case; the callback function needs to do additional checks so another
callback and the use of this variable are required.
v3 is attached.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Hi!
+ old_blk = &(p->block_info[p->pos - 1]);
+ cur_blk = &(p->block_info[p->pos]);
Should we Assert(p->pos > 0 && p->pos < *something*)?
Patch tested with no regression.
--
Best regards,
Kirill Reshke
Hi,
Thank you for looking into this.
On Fri, 29 Nov 2024 at 06:55, Kirill Reshke <reshkekirill@gmail.com> wrote:
+ old_blk = &(p->block_info[p->pos - 1]);
+ cur_blk = &(p->block_info[p->pos]);
Should we Assert(p->pos > 0 && p->pos < *something*)?
I think it is worth adding:
+ Assert(p->pos > 0 && p->pos < p->max_pos);
+
+ old_blk = &(p->block_info[p->pos - 1]);
+ cur_blk = &(p->block_info[p->pos]);
v4 is attached.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
v4-0001-Optimize-autoprewarm-with-read-streams.patchtext/x-patch; charset=US-ASCII; name=v4-0001-Optimize-autoprewarm-with-read-streams.patchDownload
From ffd37cc5010d2253b3f843e891df0e75ee8d27a6 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 5 Nov 2024 11:40:14 +0300
Subject: [PATCH v4 1/2] Optimize autoprewarm with read streams
We've measured 10% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.
---
contrib/pg_prewarm/autoprewarm.c | 114 +++++++++++++++++++++++++++++--
1 file changed, 109 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index fac4051e1aa..e7a8213209a 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -422,6 +423,70 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ bool first_block;
+ int max_pos;
+ int pos;
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks_in_fork;
+
+};
+
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ bool *rs_have_free_buffer = per_buffer_data;
+ BlockInfoRecord *old_blk;
+ BlockInfoRecord *cur_blk;
+
+ *rs_have_free_buffer = true;
+
+ /*
+ * There may still be queued blocks in the stream even when no free
+ * buffers are available in the buffer pool. This can lead to unnecessary
+ * I/O operations and buffer evictions. One possible solution is to
+ * compare the number of free buffers in the buffer pool with the number
+ * of queued blocks in the stream. However, this approach is considered a
+ * workaround and would add complexity with minimal benefit, as only a few
+ * unnecessary I/O operations and buffer evictions are expected.
+ * Therefore, this solution has not been implemented.
+ */
+ if (!have_free_buffer())
+ {
+ *rs_have_free_buffer = false;
+ return InvalidBlockNumber;
+ }
+
+ if (p->pos == p->max_pos)
+ return InvalidBlockNumber;
+
+ if (p->first_block)
+ {
+ p->first_block = false;
+ return p->block_info[p->pos++].blocknum;
+ }
+
+ Assert(p->pos > 0 && p->pos < p->max_pos);
+
+ old_blk = &(p->block_info[p->pos - 1]);
+ cur_blk = &(p->block_info[p->pos]);
+
+ if (old_blk->database == cur_blk->database &&
+ old_blk->forknum == cur_blk->forknum &&
+ old_blk->filenumber == cur_blk->filenumber &&
+ cur_blk->blocknum < p->nblocks_in_fork)
+ {
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -435,6 +500,9 @@ autoprewarm_database_main(Datum main_arg)
BlockNumber nblocks = 0;
BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ ReadStream *stream = NULL;
+ struct apw_read_stream_private p;
+ bool *rs_have_free_buffer;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,13 +519,16 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ p.block_info = block_info;
+ p.max_pos = apw_state->prewarm_stop_idx;
+
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ for (; pos < apw_state->prewarm_stop_idx; pos++)
{
- BlockInfoRecord *blk = &block_info[pos++];
+ BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
CHECK_FOR_INTERRUPTS();
@@ -470,6 +541,18 @@ autoprewarm_database_main(Datum main_arg)
old_blk->database != 0)
break;
+ /*
+ * If stream needs to be created again, end it before closing the old
+ * relation.
+ */
+ if (stream && (old_blk == NULL ||
+ old_blk->filenumber != blk->filenumber ||
+ old_blk->forknum != blk->forknum))
+ {
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
@@ -506,7 +589,10 @@ autoprewarm_database_main(Datum main_arg)
continue;
}
- /* Once per fork, check for fork existence and size. */
+ /*
+ * Once per fork, check for fork existence and size. Then create read
+ * stream if it is suitable.
+ */
if (old_blk == NULL ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
@@ -518,7 +604,21 @@ autoprewarm_database_main(Datum main_arg)
if (blk->forknum > InvalidForkNumber &&
blk->forknum <= MAX_FORKNUM &&
smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+
+ /* Create read stream. */
+ p.nblocks_in_fork = nblocks;
+ p.pos = pos;
+ p.first_block = true;
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ blk->forknum,
+ apw_read_stream_next_block,
+ &p,
+ sizeof(bool));
+ }
else
nblocks = 0;
}
@@ -532,16 +632,20 @@ autoprewarm_database_main(Datum main_arg)
}
/* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
+ buf = read_stream_next_buffer(stream, (void **) &rs_have_free_buffer);
if (BufferIsValid(buf))
{
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+ /* There are no free buffers left in shared buffers, break the loop. */
+ else if (!(*rs_have_free_buffer))
+ break;
old_blk = blk;
}
+ Assert(read_stream_next_buffer(stream, (void **) &rs_have_free_buffer) == InvalidBuffer);
+ read_stream_end(stream);
dsm_detach(seg);
--
2.45.2
v4-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchtext/x-patch; charset=US-ASCII; name=v4-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchDownload
From d35f774b03c8436c0e108b1b445ef61c00eb7180 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Wed, 27 Nov 2024 16:53:58 +0300
Subject: [PATCH v4 2/2] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
src/include/storage/buf_internals.h | 1 +
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
contrib/pg_prewarm/autoprewarm.c | 4 ++--
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index eda6c699212..d95050adb23 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -432,6 +432,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index dffdd57e9b5..3259e6280cf 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuously changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index e7a8213209a..1b5a53e63bf 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -520,13 +520,13 @@ autoprewarm_database_main(Datum main_arg)
pos = apw_state->prewarm_start_idx;
p.block_info = block_info;
- p.max_pos = apw_state->prewarm_stop_idx;
+ p.max_pos = Min(apw_state->prewarm_stop_idx, pos + get_number_of_free_buffers());
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- for (; pos < apw_state->prewarm_stop_idx; pos++)
+ for (; pos < p.max_pos; pos++)
{
BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
--
2.45.2
On Fri, 29 Nov 2024 at 16:19, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
v4 is attached.
Hi!
I feel like we are ready to mark this as RFC, WDYT?
--
Best regards,
Kirill Reshke
On 2 Dec 2024, at 16:16, Kirill Reshke <reshkekirill@gmail.com> wrote:
I feel like we are ready to mark this as RFC, WDYT?
+1
Best regards, Andrey Borodin.
Hi,
On Mon, 2 Dec 2024 at 16:30, Andrey M. Borodin <x4mmm@yandex-team.ru> wrote:
On 2 Dec 2024, at 16:16, Kirill Reshke <reshkekirill@gmail.com> wrote:
I feel like we are ready to mark this as RFC, WDYT?
+1
Done.
--
Regards,
Nazir Bilal Yavuz
Microsoft
On Fri, Nov 29, 2024 at 6:20 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
v4 is attached.
I've started looking at this version. What I don't like about this
structure is that we are forced to increment the position in the array
of BlockInfoRecords in both the callback and the main loop in
autoprewarm_database_main(). There isn't a way around it because we
have to return control to the user when we encounter a new relation
(we can't close the relation and destroy the read stream in the
callback). And in the main loop in autoprewarm_database_main(), we may
fail to open the next relation and then need to keep advancing the
position in the array of BlockInfoRecords.
It isn't just that we have to advance the position in both places --
we also have to have a special case for the first block. All in all,
given that in the current read stream API, a single stream must only
concern itself with a single relation, fork combo, I think there is no
elegant way to deal with this in autoprewarm.
One alternative is to loop through the array of BlockInfoRecords and
get the start and end positions of the blocks in the array for a
single relation/fork combo. Then we could make the read stream and
pass those two positions and the array as callback_private_data. That
would mean we loop through the whole array twice, but I wonder if the
improvement in clarity is worth it?
Some review feedback on your v4: I don't think we need the
rs_have_free_buffer per_buffer_data. We can just check
have_free_buffer() in both the callback and main loop in
autoprewarm_database_main(). I also think you want a comment about why
first_block is needed. And I think you need to guard the
read_stream_end() after the loop -- what if we never made a read
stream because we errored out for all the block's relations or
something?
- Melanie
On Sat, Mar 29, 2025 at 4:09 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
One alternative is to loop through the array of BlockInfoRecords and
get the start and end positions of the blocks in the array for a
single relation/fork combo. Then we could make the read stream and
pass those two positions and the array as callback_private_data. That
would mean we loop through the whole array twice, but I wonder if the
improvement in clarity is worth it?
An alternative to this alternative is to somehow include the length of
each "span" (BlockInfoRecords from a single relation/fork) in the
first BlockInfoRecord of that span when building the array. Dunno how
hard that would be, but then you wouldn't have to loop through it
twice.
- Melanie
Hi,
On 2025-03-29 16:09:56 -0400, Melanie Plageman wrote:
I've started looking at this version. What I don't like about this
structure is that we are forced to increment the position in the array
of BlockInfoRecords in both the callback and the main loop in
autoprewarm_database_main(). There isn't a way around it because we
have to return control to the user when we encounter a new relation
(we can't close the relation and destroy the read stream in the
callback). And in the main loop in autoprewarm_database_main(), we may
fail to open the next relation and then need to keep advancing the
position in the array of BlockInfoRecords.
It isn't just that we have to advance the position in both places --
we also have to have a special case for the first block. All in all,
given that in the current read stream API, a single stream must only
concern itself with a single relation, fork combo, I think there is no
elegant way to deal with this in autoprewarm.
How about having an iterator function operating on a pointer to iterator state
that's used both by the main loop and the read stream callback? If the
iterator reaches the next relation, it returns InvalidBlockNumber and the main
loop starts the next stream?
Greetings,
Andres Freund
On Sat, Mar 29, 2025 at 4:44 PM Andres Freund <andres@anarazel.de> wrote:
How about having an iterator function operating on a pointer to iterator state
that's used both by the main loop and the read stream callback? If the
iterator reaches the next relation, it returns InvalidBlockNumber and the main
loop starts the next stream?
I don't think that removes the need for the first_block special case.
And we still need to duplicate the logic for detecting the next
database, block, or filenumber in both places. It maybe reduces the
potential for error a little bit. But I don't think it improves the
clarity.
- Melanie
Hi,
Thank you for looking into this!
On Sat, 29 Mar 2025 at 23:10, Melanie Plageman
<melanieplageman@gmail.com> wrote:
On Fri, Nov 29, 2024 at 6:20 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
v4 is attached.
I've started looking at this version. What I don't like about this
structure is that we are forced to increment the position in the array
of BlockInfoRecords in both the callback and the main loop in
autoprewarm_database_main(). There isn't a way around it because we
have to return control to the user when we encounter a new relation
(we can't close the relation and destroy the read stream in the
callback). And in the main loop in autoprewarm_database_main(), we may
fail to open the next relation and then need to keep advancing the
position in the array of BlockInfoRecords.
It isn't just that we have to advance the position in both places --
we also have to have a special case for the first block. All in all,
given that in the current read stream API, a single stream must only
concern itself with a single relation, fork combo, I think there is no
elegant way to deal with this in autoprewarm.
One alternative is to loop through the array of BlockInfoRecords and
get the start and end positions of the blocks in the array for a
single relation/fork combo. Then we could make the read stream and
pass those two positions and the array as callback_private_data. That
would mean we loop through the whole array twice, but I wonder if the
improvement in clarity is worth it?
I think this is a good alternative. I will work on this and try to
propose a patch.
Some review feedback on your v4: I don't think we need the
rs_have_free_buffer per_buffer_data. We can just check
have_free_buffer() in both the callback and main loop in
autoprewarm_database_main(). I also think you want a comment about why
first_block is needed. And I think you need to guard the
read_stream_end() after the loop -- what if we never made a read
stream because we errored out for all the block's relations or
something?
All of these are addressed. One extra thing I noticed is we were not
checking if blocknum < number_of_block_in_relation at the first_block
case in the stream callback, this is fixed now.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
v5-0001-Optimize-autoprewarm-with-read-streams.patchapplication/octet-stream; name=v5-0001-Optimize-autoprewarm-with-read-streams.patchDownload
From 8b41e2bb5311072bc3b1d9239049930884e48b87 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 5 Nov 2024 11:40:14 +0300
Subject: [PATCH v5 1/2] Optimize autoprewarm with read streams
We've measured 10% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.
---
contrib/pg_prewarm/autoprewarm.c | 120 +++++++++++++++++++++++++++++--
1 file changed, 115 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..71c4f8a5920 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -422,6 +423,67 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ bool first_block;
+ int max_pos;
+ int pos;
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks_in_fork;
+
+};
+
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ BlockInfoRecord *old_blk;
+ BlockInfoRecord *cur_blk;
+
+ /*
+ * There may still be queued blocks in the stream even when no free
+ * buffers are available in the buffer pool. This can lead to unnecessary
+ * I/O operations and buffer evictions. One possible solution is to
+ * compare the number of free buffers in the buffer pool with the number
+ * of queued blocks in the stream. However, this approach is considered a
+ * workaround and would add complexity with minimal benefit, as only a few
+ * unnecessary I/O operations and buffer evictions are expected.
+ * Therefore, this solution has not been implemented.
+ */
+ if (!have_free_buffer())
+ return InvalidBlockNumber;
+
+ if (p->pos == p->max_pos)
+ return InvalidBlockNumber;
+
+ cur_blk = &(p->block_info[p->pos]);
+
+ if (cur_blk->blocknum >= p->nblocks_in_fork)
+ return InvalidBlockNumber;
+
+ if (p->first_block)
+ {
+ p->first_block = false;
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ Assert(p->pos > 0 && p->pos < p->max_pos);
+ old_blk = &(p->block_info[p->pos - 1]);
+
+ if (old_blk->database == cur_blk->database &&
+ old_blk->forknum == cur_blk->forknum &&
+ old_blk->filenumber == cur_blk->filenumber)
+ {
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -435,6 +497,8 @@ autoprewarm_database_main(Datum main_arg)
BlockNumber nblocks = 0;
BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ ReadStream *stream = NULL;
+ struct apw_read_stream_private p;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,13 +515,16 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ p.block_info = block_info;
+ p.max_pos = apw_state->prewarm_stop_idx;
+
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ for (; pos < apw_state->prewarm_stop_idx; pos++)
{
- BlockInfoRecord *blk = &block_info[pos++];
+ BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
CHECK_FOR_INTERRUPTS();
@@ -470,6 +537,18 @@ autoprewarm_database_main(Datum main_arg)
old_blk->database != 0)
break;
+ /*
+ * If stream needs to be created again, end it before closing the old
+ * relation.
+ */
+ if (stream && (old_blk == NULL ||
+ old_blk->filenumber != blk->filenumber ||
+ old_blk->forknum != blk->forknum))
+ {
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
@@ -506,7 +585,10 @@ autoprewarm_database_main(Datum main_arg)
continue;
}
- /* Once per fork, check for fork existence and size. */
+ /*
+ * Once per fork, check for fork existence and size. Then create read
+ * stream if it is suitable.
+ */
if (old_blk == NULL ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
@@ -518,7 +600,27 @@ autoprewarm_database_main(Datum main_arg)
if (blk->forknum > InvalidForkNumber &&
blk->forknum <= MAX_FORKNUM &&
smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+
+ /* Create read stream. */
+ p.nblocks_in_fork = nblocks;
+ p.pos = pos;
+
+ /*
+ * There is a special case for the first block in the
+ * relation. We can't compare it with the previous block as
+ * there is no previous block yet.
+ */
+ p.first_block = true;
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ blk->forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
+ }
else
nblocks = 0;
}
@@ -532,17 +634,25 @@ autoprewarm_database_main(Datum main_arg)
}
/* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
+ buf = read_stream_next_buffer(stream, NULL);
if (BufferIsValid(buf))
{
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+ /* There are no free buffers left in shared buffers, break the loop. */
+ else if (!have_free_buffer())
+ break;
old_blk = blk;
}
+ if (stream)
+ {
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
dsm_detach(seg);
/* Release lock on previous relation. */
--
2.43.0
v5-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchapplication/octet-stream; name=v5-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchDownload
From ffe876140536c89e2d80df1b9711e0970472b3d8 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Wed, 27 Nov 2024 16:53:58 +0300
Subject: [PATCH v5 2/2] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
contrib/pg_prewarm/autoprewarm.c | 4 ++--
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
src/include/storage/buf_internals.h | 1 +
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 71c4f8a5920..42f2bc88fa9 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -516,13 +516,13 @@ autoprewarm_database_main(Datum main_arg)
pos = apw_state->prewarm_start_idx;
p.block_info = block_info;
- p.max_pos = apw_state->prewarm_stop_idx;
+ p.max_pos = Min(apw_state->prewarm_stop_idx, pos + get_number_of_free_buffers());
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- for (; pos < apw_state->prewarm_stop_idx; pos++)
+ for (; pos < p.max_pos; pos++)
{
BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 336715b6c63..9f26e940426 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuously changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 9327f60c44c..197d2db49f5 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -445,6 +445,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
--
2.43.0
On Sun, Mar 30, 2025 at 10:01 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
Some review feedback on your v4: I don't think we need the
rs_have_free_buffer per_buffer_data. We can just check
have_free_buffer() in both the callback and main loop in
autoprewarm_database_main(). I also think you want a comment about why
first_block is needed. And I think you need to guard the
read_stream_end() after the loop -- what if we never made a read
stream because we errored out for all the block's relations or
something?
All of these are addressed. One extra thing I noticed is we were not
checking if blocknum < number_of_block_in_relation at the first_block
case in the stream callback, this is fixed now.
I'm wondering why you need to check if have_free_buffer() in the else
branch after getting the buffer from the read stream API. Can't you
just put it back in the for loop condition? Seems like it would have
the same effect.
- for (; pos < apw_state->prewarm_stop_idx; pos++)
+ for (; pos < apw_state->prewarm_stop_idx && have_free_buffer(); pos++)
{
BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
@@ -640,9 +640,6 @@ autoprewarm_database_main(Datum main_arg)
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
- /* There are no free buffers left in shared buffers, break the loop. */
- else if (!have_free_buffer())
- break;
Looking at the code some more, I feel stuck on my original point about
incrementing the position in two places.
AFAICT, you are going to end up going through the array twice with this design.
Because you don't set the pos variable in autoprewarm_database_main()
from the p->pos variable which the read stream callback is
incrementing, if the read stream callback increments p->pos by a few
positions yielding those blocks to the read stream machinery to read,
you are then going to iterate over those positions in the array again
in the autoprewarm_database_main() loop.
I think you can get around this by setting pos from p->pos in
autoprewarm_database_main() after read_stream_next_buffer(). Or by
using p->pos in the loop in autoprewarm_database_main() (which is
basically what Andres suggested). I'm not sure, though, if this has
any problems. Like not closing a relation in the right place.
- Melanie
Hi,
Thank you for looking into this!
On Mon, 31 Mar 2025 at 17:42, Melanie Plageman
<melanieplageman@gmail.com> wrote:
On Sun, Mar 30, 2025 at 10:01 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
Some review feedback on your v4: I don't think we need the
rs_have_free_buffer per_buffer_data. We can just check
have_free_buffers() in both the callback and main loop in
autoprewarm_database_main(). I also think you want a comment about why
first_block is needed. And I think you need to guard the
read_stream_end() after the loop -- what if we never made a read
stream because we errored out for all the block's relations or
something?
All of these are addressed. One extra thing I noticed is we were not
checking if blocknum < number_of_block_in_relation at the first_block
case in the stream callback, this is fixed now.
I'm wondering why you need to check if have_free_buffer() in the else
branch after getting the buffer from the read stream API. Can't you
just put it back in the for loop condition? Seems like it would have
the same effect.
- for (; pos < apw_state->prewarm_stop_idx; pos++)
+ for (; pos < apw_state->prewarm_stop_idx && have_free_buffer(); pos++)
{
BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
@@ -640,9 +640,6 @@ autoprewarm_database_main(Datum main_arg)
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
- /* There are no free buffers left in shared buffers, break the loop. */
- else if (!have_free_buffer())
- break;
You are right, done. Attached as v6.
Looking at the code some more, I feel stuck on my original point about
incrementing the position in two places.
AFAICT, you are going to end up going through the array twice with this design.
Because you don't set the pos variable in autoprewarm_database_main()
from the p->pos variable which the read stream callback is
incrementing, if the read stream callback increments p->pos by a few
positions, yielding those blocks to the read stream machinery to read,
you are then going to iterate over those positions in the array again
in the autoprewarm_database_main() loop.
I think you can get around this by setting pos from p->pos in
autoprewarm_database_main() after read_stream_next_buffer(). Or by
using p->pos in the loop in autoprewarm_database_main() (which is
basically what Andres suggested). I'm not sure, though, if this has
any problems. Like not closing a relation in the right place.
I worked on an alternative approach, I refactored code a bit. It does
not traverse the list two times and I think the code is more suitable
to use read streams now. I simply get how many blocks are processed by
read streams and move the list forward by this number, so the actual
loop skips these blocks. This approach is attached with 'alternative'
prefix.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
alternative_0002-Count-free-buffers-at-the-start-of-the-autoprewarm.patchapplication/octet-stream; name=alternative_0002-Count-free-buffers-at-the-start-of-the-autoprewarm.patchDownload
From 811caa0d543383cc5ec5a4e2d938787cb060cf42 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Mon, 31 Mar 2025 12:21:27 +0300
Subject: [PATCH 2/2] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
contrib/pg_prewarm/autoprewarm.c | 7 +++++--
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
src/include/storage/buf_internals.h | 1 +
3 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 142e9d5359b..e96f67adf7c 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -518,6 +518,7 @@ void
autoprewarm_database_main(Datum main_arg)
{
int pos;
+ int stop_idx;
BlockInfoRecord *block_info;
Relation rel = NULL;
dsm_segment *seg;
@@ -539,11 +540,13 @@ autoprewarm_database_main(Datum main_arg)
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ stop_idx = Min(apw_state->prewarm_stop_idx,
+ pos + get_number_of_free_buffers());
cur_database = block_info[pos].database;
/* Loop until we run out of blocks to prewarm. */
- while (pos < apw_state->prewarm_stop_idx)
+ while (pos < stop_idx)
{
BlockInfoRecord *blk = &block_info[pos];
@@ -616,7 +619,7 @@ autoprewarm_database_main(Datum main_arg)
nblocks_in_fork = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
nblocks_processed = autoprewarm_prewarm_relation(rel,
pos,
- apw_state->prewarm_stop_idx,
+ stop_idx,
nblocks_in_fork,
block_info);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 336715b6c63..9f26e940426 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuously changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 9327f60c44c..197d2db49f5 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -445,6 +445,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
--
2.43.0
v6-0001-Optimize-autoprewarm-with-read-streams.patchapplication/octet-stream; name=v6-0001-Optimize-autoprewarm-with-read-streams.patchDownload
From 7e2f610e53cfe097dbc041e09915b8fc5da98c4e Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 5 Nov 2024 11:40:14 +0300
Subject: [PATCH v6 1/2] Optimize autoprewarm with read streams
We've measured 10% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.
---
contrib/pg_prewarm/autoprewarm.c | 117 +++++++++++++++++++++++++++++--
1 file changed, 112 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..a21d9571dff 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -422,6 +423,67 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ bool first_block;
+ int max_pos;
+ int pos;
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks_in_fork;
+
+};
+
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ BlockInfoRecord *old_blk;
+ BlockInfoRecord *cur_blk;
+
+ /*
+ * There may still be queued blocks in the stream even when no free
+ * buffers are available in the buffer pool. This can lead to unnecessary
+ * I/O operations and buffer evictions. One possible solution is to
+ * compare the number of free buffers in the buffer pool with the number
+ * of queued blocks in the stream. However, this approach is considered a
+ * workaround and would add complexity with minimal benefit, as only a few
+ * unnecessary I/O operations and buffer evictions are expected.
+ * Therefore, this solution has not been implemented.
+ */
+ if (!have_free_buffer())
+ return InvalidBlockNumber;
+
+ if (p->pos == p->max_pos)
+ return InvalidBlockNumber;
+
+ cur_blk = &(p->block_info[p->pos]);
+
+ if (cur_blk->blocknum >= p->nblocks_in_fork)
+ return InvalidBlockNumber;
+
+ if (p->first_block)
+ {
+ p->first_block = false;
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ Assert(p->pos > 0 && p->pos < p->max_pos);
+ old_blk = &(p->block_info[p->pos - 1]);
+
+ if (old_blk->database == cur_blk->database &&
+ old_blk->forknum == cur_blk->forknum &&
+ old_blk->filenumber == cur_blk->filenumber)
+ {
+ p->pos++;
+ return cur_blk->blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -435,6 +497,8 @@ autoprewarm_database_main(Datum main_arg)
BlockNumber nblocks = 0;
BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ ReadStream *stream = NULL;
+ struct apw_read_stream_private p;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,13 +515,16 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ p.block_info = block_info;
+ p.max_pos = apw_state->prewarm_stop_idx;
+
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ for (; pos < apw_state->prewarm_stop_idx && have_free_buffer(); pos++)
{
- BlockInfoRecord *blk = &block_info[pos++];
+ BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
CHECK_FOR_INTERRUPTS();
@@ -470,6 +537,18 @@ autoprewarm_database_main(Datum main_arg)
old_blk->database != 0)
break;
+ /*
+ * If stream needs to be created again, end it before closing the old
+ * relation.
+ */
+ if (stream && (old_blk == NULL ||
+ old_blk->filenumber != blk->filenumber ||
+ old_blk->forknum != blk->forknum))
+ {
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
@@ -506,7 +585,10 @@ autoprewarm_database_main(Datum main_arg)
continue;
}
- /* Once per fork, check for fork existence and size. */
+ /*
+ * Once per fork, check for fork existence and size. Then create read
+ * stream if it is suitable.
+ */
if (old_blk == NULL ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
@@ -518,7 +600,27 @@ autoprewarm_database_main(Datum main_arg)
if (blk->forknum > InvalidForkNumber &&
blk->forknum <= MAX_FORKNUM &&
smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+
+ /* Create read stream. */
+ p.nblocks_in_fork = nblocks;
+ p.pos = pos;
+
+ /*
+ * There is a special case for the first block in the
+ * relation. We can't compare it with the previous block as
+ * there is no previous block yet.
+ */
+ p.first_block = true;
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ blk->forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
+ }
else
nblocks = 0;
}
@@ -532,8 +634,7 @@ autoprewarm_database_main(Datum main_arg)
}
/* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
+ buf = read_stream_next_buffer(stream, NULL);
if (BufferIsValid(buf))
{
apw_state->prewarmed_blocks++;
@@ -543,6 +644,12 @@ autoprewarm_database_main(Datum main_arg)
old_blk = blk;
}
+ if (stream)
+ {
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+ }
+
dsm_detach(seg);
/* Release lock on previous relation. */
--
2.43.0
v6-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchapplication/octet-stream; name=v6-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchDownload
From ac727712a7905ee00a8bc98a31939a7de7bf8493 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Wed, 27 Nov 2024 16:53:58 +0300
Subject: [PATCH v6 2/2] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
contrib/pg_prewarm/autoprewarm.c | 4 ++--
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
src/include/storage/buf_internals.h | 1 +
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index a21d9571dff..cc0d45fae2b 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -516,13 +516,13 @@ autoprewarm_database_main(Datum main_arg)
pos = apw_state->prewarm_start_idx;
p.block_info = block_info;
- p.max_pos = apw_state->prewarm_stop_idx;
+ p.max_pos = Min(apw_state->prewarm_stop_idx, pos + get_number_of_free_buffers());
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- for (; pos < apw_state->prewarm_stop_idx && have_free_buffer(); pos++)
+ for (; pos < p.max_pos && have_free_buffer(); pos++)
{
BlockInfoRecord *blk = &block_info[pos];
Buffer buf;
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 336715b6c63..9f26e940426 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuously changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 9327f60c44c..197d2db49f5 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -445,6 +445,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
--
2.43.0
alternative_0001-Optimize-autoprewarm-with-read-streams.patchapplication/octet-stream; name=alternative_0001-Optimize-autoprewarm-with-read-streams.patchDownload
From 4a1d93b379966e939c1932a5fdd486d89bae4c7a Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Sat, 29 Mar 2025 20:17:42 +0300
Subject: [PATCH 1/2] Optimize autoprewarm with read streams
We've measured 10% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.
---
contrib/pg_prewarm/autoprewarm.c | 197 ++++++++++++++++++++++---------
1 file changed, 141 insertions(+), 56 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..142e9d5359b 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -421,6 +422,93 @@ apw_load_buffers(void)
(errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks;
+ BlockNumber max_pos;
+ BlockNumber pos;
+ Oid cur_database;
+ ForkNumber cur_forknum;
+ RelFileNumber cur_filenumber;
+};
+
+static BlockNumber
+awp_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ BlockInfoRecord cur_blk = p->block_info[p->pos];
+ BlockNumber blocknum = InvalidBlockNumber;
+
+ if (have_free_buffer() &&
+ p->pos < p->max_pos &&
+ cur_blk.blocknum < p->nblocks &&
+ cur_blk.database == p->cur_database &&
+ cur_blk.forknum == p->cur_forknum &&
+ cur_blk.filenumber == p->cur_filenumber)
+ {
+ blocknum = cur_blk.blocknum;
+ }
+
+ (p->pos)++;
+ return blocknum;
+}
+
+/*
+ * Helper function to prewarm buffers in a relation by using read streams.
+ */
+static unsigned int
+autoprewarm_prewarm_relation(Relation rel,
+ BlockNumber pos,
+ BlockNumber max_pos,
+ BlockNumber nblocks_in_fork,
+ BlockInfoRecord *block_info)
+{
+ struct apw_read_stream_private p;
+ ReadStream *stream;
+ unsigned int blocks_done = 0;
+ BlockInfoRecord first_block = block_info[pos];
+
+ p.pos = pos;
+ p.max_pos = max_pos;
+ p.block_info = block_info;
+ p.nblocks = nblocks_in_fork;
+ p.cur_database = first_block.database;
+ p.cur_forknum = first_block.forknum;
+ p.cur_filenumber = first_block.filenumber;
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ first_block.forknum,
+ awp_read_stream_next_block,
+ &p,
+ 0);
+
+ while (true)
+ {
+ Buffer buf;
+
+ CHECK_FOR_INTERRUPTS();
+
+ buf = read_stream_next_buffer(stream, NULL);
+ if (BufferIsValid(buf))
+ {
+ ReleaseBuffer(buf);
+ ++blocks_done;
+ }
+ else
+ {
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+ break;
+ }
+ }
+
+ return blocks_done;
+}
/*
* Prewarm all blocks for one database (and possibly also global objects, if
@@ -432,9 +520,10 @@ autoprewarm_database_main(Datum main_arg)
int pos;
BlockInfoRecord *block_info;
Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ Oid cur_database;
+ Oid cur_filenumber = InvalidOid;
+ BlockNumber nblocks_in_fork = InvalidBlockNumber;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,32 +540,44 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
- /*
- * Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
- */
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ cur_database = block_info[pos].database;
+
+ /* Loop until we run out of blocks to prewarm. */
+ while (pos < apw_state->prewarm_stop_idx)
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ BlockInfoRecord *blk = &block_info[pos];
- CHECK_FOR_INTERRUPTS();
+ /* Loop until we run we run out of free buffers. */
+ if (!have_free_buffer())
+ break;
/*
* Quit if we've reached records for another database. If previous
* blocks are of some global objects, then continue pre-warming.
*/
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ if (cur_database != blk->database)
+ {
+ if (cur_database == 0)
+ cur_database = blk->database;
+ else
+ break;
+ }
+
+ /* Check whether blocknum is valid and within fork file size. */
+ while (cur_filenumber == blk->filenumber &&
+ blk->blocknum >= nblocks_in_fork)
+ {
+ /* Move to next forknum. */
+ pos++;
+ continue;
+ }
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
* previously; in that case, there is nothing to close.
*/
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ if (rel && cur_filenumber != blk->filenumber)
{
relation_close(rel, AccessShareLock);
rel = NULL;
@@ -487,70 +588,54 @@ autoprewarm_database_main(Datum main_arg)
* Try to open each new relation, but only once, when we first
* encounter it. If it's been dropped, skip the associated blocks.
*/
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
+ if (!rel && cur_filenumber != blk->filenumber)
{
Oid reloid;
- Assert(rel == NULL);
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
if (OidIsValid(reloid))
rel = try_relation_open(reloid, AccessShareLock);
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
- continue;
- }
-
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
- {
/*
- * smgrexists is not safe for illegal forknum, hence check whether
- * the passed forknum is valid before using it in smgrexists.
+ * Update cur_filenumber although relation may not be opened. If
+ * not updated and if we can't open the relation when the file
+ * number is changed; we will end up unnecessarily trying to open
+ * relation for all the blocks that have the same file number.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ cur_filenumber = blk->filenumber;
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
+ if (!rel)
+ CommitTransactionCommand();
}
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
+ if (rel && smgrexists(RelationGetSmgr(rel), blk->forknum))
{
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
+ unsigned int nblocks_processed;
+
+ nblocks_in_fork = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+ nblocks_processed = autoprewarm_prewarm_relation(rel,
+ pos,
+ apw_state->prewarm_stop_idx,
+ nblocks_in_fork,
+ block_info);
+
+ apw_state->prewarmed_blocks += nblocks_processed;
+ /* Move pos forward by at least one */
+ pos += Max(nblocks_processed, 1);
+ continue;
}
- old_blk = blk;
+ pos++;
}
- dsm_detach(seg);
-
- /* Release lock on previous relation. */
if (rel)
{
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.43.0
On Mon, Mar 31, 2025 at 12:27 PM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
I worked on an alternative approach, I refactored code a bit. It does
not traverse the list two times and I think the code is more suitable
to use read streams now. I simply get how many blocks are processed by
read streams and move the list forward by this number, so the actual
loop skips these blocks. This approach is attached with 'alternative'
prefix.
I am leaning toward the refactored approach because I don't think we
want to go through the array twice and I think it is hard to get it
right with incrementing p.pos in both places and being sure we
correctly close the relation etc.
Looking at your alternative approach, I don't see how the innermost
while loop in autoprewarm_database_main() is correct
/* Check whether blocknum is valid and within fork file size. */
while (cur_filenumber == blk->filenumber &&
blk->blocknum >= nblocks_in_fork)
{
/* Move to next forknum. */
pos++;
continue;
}
Won't this just infinitely loop?
- Melanie
Hi,
On Mon, 31 Mar 2025 at 21:15, Melanie Plageman
<melanieplageman@gmail.com> wrote:
On Mon, Mar 31, 2025 at 12:27 PM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
I worked on an alternative approach, I refactored code a bit. It does
not traverse the list two times and I think the code is more suitable
to use read streams now. I simply get how many blocks are processed by
read streams and move the list forward by this number, so the actual
loop skips these blocks. This approach is attached with 'alternative'
prefix.
I am leaning toward the refactored approach because I don't think we
want to go through the array twice and I think it is hard to get it
right with incrementing p.pos in both places and being sure we
correctly close the relation etc.
I liked it more as well.
Looking at your alternative approach, I don't see how the innermost
while loop in autoprewarm_database_main() is correct:
/* Check whether blocknum is valid and within fork file size. */
while (cur_filenumber == blk->filenumber &&
blk->blocknum >= nblocks_in_fork)
{
/* Move to next forknum. */
pos++;
continue;
}
Won't this just infinitely loop?
Oops, you are right. It should be an if statement instead of a while
loop, fixed now. Also, moved 'dsm_detach(seg);' to its previous place
to reduce diff. Attached as v7. Do you think that I should continue to
attach both approaches?
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
v7-0001-Optimize-autoprewarm-with-read-streams.patchapplication/octet-stream; name=v7-0001-Optimize-autoprewarm-with-read-streams.patchDownload
From 19abe210241cf48142c2e1b2e3ae5a7ba34921e4 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Sat, 29 Mar 2025 20:17:42 +0300
Subject: [PATCH v7 1/2] Optimize autoprewarm with read streams
We've measured 10% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.
---
contrib/pg_prewarm/autoprewarm.c | 192 ++++++++++++++++++++++---------
1 file changed, 139 insertions(+), 53 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..68c99664de5 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -421,6 +422,93 @@ apw_load_buffers(void)
(errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks;
+ BlockNumber max_pos;
+ BlockNumber pos;
+ Oid cur_database;
+ ForkNumber cur_forknum;
+ RelFileNumber cur_filenumber;
+};
+
+static BlockNumber
+awp_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ BlockInfoRecord cur_blk = p->block_info[p->pos];
+ BlockNumber blocknum = InvalidBlockNumber;
+
+ if (have_free_buffer() &&
+ p->pos < p->max_pos &&
+ cur_blk.blocknum < p->nblocks &&
+ cur_blk.database == p->cur_database &&
+ cur_blk.forknum == p->cur_forknum &&
+ cur_blk.filenumber == p->cur_filenumber)
+ {
+ blocknum = cur_blk.blocknum;
+ }
+
+ (p->pos)++;
+ return blocknum;
+}
+
+/*
+ * Helper function to prewarm buffers in a relation by using read streams.
+ */
+static unsigned int
+autoprewarm_prewarm_relation(Relation rel,
+ BlockNumber pos,
+ BlockNumber max_pos,
+ BlockNumber nblocks_in_fork,
+ BlockInfoRecord *block_info)
+{
+ struct apw_read_stream_private p;
+ ReadStream *stream;
+ unsigned int blocks_done = 0;
+ BlockInfoRecord first_block = block_info[pos];
+
+ p.pos = pos;
+ p.max_pos = max_pos;
+ p.block_info = block_info;
+ p.nblocks = nblocks_in_fork;
+ p.cur_database = first_block.database;
+ p.cur_forknum = first_block.forknum;
+ p.cur_filenumber = first_block.filenumber;
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ first_block.forknum,
+ awp_read_stream_next_block,
+ &p,
+ 0);
+
+ while (true)
+ {
+ Buffer buf;
+
+ CHECK_FOR_INTERRUPTS();
+
+ buf = read_stream_next_buffer(stream, NULL);
+ if (BufferIsValid(buf))
+ {
+ ReleaseBuffer(buf);
+ ++blocks_done;
+ }
+ else
+ {
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+ break;
+ }
+ }
+
+ return blocks_done;
+}
/*
* Prewarm all blocks for one database (and possibly also global objects, if
@@ -432,9 +520,10 @@ autoprewarm_database_main(Datum main_arg)
int pos;
BlockInfoRecord *block_info;
Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ Oid cur_database;
+ Oid cur_filenumber = InvalidOid;
+ BlockNumber nblocks_in_fork = InvalidBlockNumber;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,32 +540,44 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
- /*
- * Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
- */
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ cur_database = block_info[pos].database;
+
+ /* Loop until we run out of blocks to prewarm. */
+ while (pos < apw_state->prewarm_stop_idx)
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ BlockInfoRecord *blk = &block_info[pos];
- CHECK_FOR_INTERRUPTS();
+ /* Loop until we run we run out of free buffers. */
+ if (!have_free_buffer())
+ break;
/*
* Quit if we've reached records for another database. If previous
* blocks are of some global objects, then continue pre-warming.
*/
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ if (cur_database != blk->database)
+ {
+ if (cur_database == 0)
+ cur_database = blk->database;
+ else
+ break;
+ }
+
+ /* Check whether blocknum is valid and within fork file size. */
+ if (cur_filenumber == blk->filenumber &&
+ blk->blocknum >= nblocks_in_fork)
+ {
+ /* Move to next forknum. */
+ pos++;
+ continue;
+ }
/*
* As soon as we encounter a block of a new relation, close the old
* relation. Note that rel will be NULL if try_relation_open failed
* previously; in that case, there is nothing to close.
*/
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ if (rel && cur_filenumber != blk->filenumber)
{
relation_close(rel, AccessShareLock);
rel = NULL;
@@ -487,60 +588,45 @@ autoprewarm_database_main(Datum main_arg)
* Try to open each new relation, but only once, when we first
* encounter it. If it's been dropped, skip the associated blocks.
*/
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
+ if (!rel && cur_filenumber != blk->filenumber)
{
Oid reloid;
- Assert(rel == NULL);
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
if (OidIsValid(reloid))
rel = try_relation_open(reloid, AccessShareLock);
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
- continue;
- }
-
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
- {
/*
- * smgrexists is not safe for illegal forknum, hence check whether
- * the passed forknum is valid before using it in smgrexists.
+ * Update cur_filenumber although relation may not be opened. If
+ * not updated and if we can't open the relation when the file
+ * number is changed; we will end up unnecessarily trying to open
+ * relation for all the blocks that have the same file number.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ cur_filenumber = blk->filenumber;
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
+ if (!rel)
+ CommitTransactionCommand();
}
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
+ if (rel && smgrexists(RelationGetSmgr(rel), blk->forknum))
{
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
+ unsigned int nblocks_processed;
+
+ nblocks_in_fork = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+ nblocks_processed = autoprewarm_prewarm_relation(rel,
+ pos,
+ apw_state->prewarm_stop_idx,
+ nblocks_in_fork,
+ block_info);
+
+ apw_state->prewarmed_blocks += nblocks_processed;
+ /* Move pos forward by at least one */
+ pos += Max(nblocks_processed, 1);
+ continue;
}
- old_blk = blk;
+ pos++;
}
dsm_detach(seg);
--
2.43.0
v7-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchapplication/octet-stream; name=v7-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchDownload
From c50d0f17e223e9e85421bc2bc897291383cba74d Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Mon, 31 Mar 2025 12:21:27 +0300
Subject: [PATCH v7 2/2] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
contrib/pg_prewarm/autoprewarm.c | 7 +++++--
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
src/include/storage/buf_internals.h | 1 +
3 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 68c99664de5..df10e02358d 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -518,6 +518,7 @@ void
autoprewarm_database_main(Datum main_arg)
{
int pos;
+ int stop_idx;
BlockInfoRecord *block_info;
Relation rel = NULL;
dsm_segment *seg;
@@ -539,11 +540,13 @@ autoprewarm_database_main(Datum main_arg)
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ stop_idx = Min(apw_state->prewarm_stop_idx,
+ pos + get_number_of_free_buffers());
cur_database = block_info[pos].database;
/* Loop until we run out of blocks to prewarm. */
- while (pos < apw_state->prewarm_stop_idx)
+ while (pos < stop_idx)
{
BlockInfoRecord *blk = &block_info[pos];
@@ -616,7 +619,7 @@ autoprewarm_database_main(Datum main_arg)
nblocks_in_fork = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
nblocks_processed = autoprewarm_prewarm_relation(rel,
pos,
- apw_state->prewarm_stop_idx,
+ stop_idx,
nblocks_in_fork,
block_info);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 336715b6c63..9f26e940426 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuously changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 9327f60c44c..197d2db49f5 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -445,6 +445,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
--
2.43.0
On Mon, Mar 31, 2025 at 2:58 PM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
Do you think that I should continue to
attach both approaches?
No, for now let's try and get this approach to a good place and then
see which one we like.
I think there might be another problem with the code. We only set
cur_database in the loop in autoprewarm_database_main() when it is 0
if (cur_database != blk->database)
{
if (cur_database == 0)
cur_database = blk->database;
I know that the read stream will return InvalidBlockNumber when we
move onto the next database, but I don't see how we will end up
actually stopping prewarming in autoprewarm_database_main() when we
move on to the next database.
Another thing:
I don't know if it is a correctness issue but in
autoprewarm_database_main(), in this case
if (!rel && cur_filenumber != blk->filenumber)
{
you have removed the Assert(rel == NULL) -- I worry that we will end
up with a rel from a previous iteration after failing to open the next
rel. I think we want this assert.
And a last thing
I noticed is that the initial values for cur_database, cur_filenumber,
and nblocks_in_fork in autoprewarm_database_main() are all initialized
to different kinds of initial values for different reasons. I'm
thinking if there is a way to make it consistent.
cur_database = block_info[pos].database;
cur_filenumber = InvalidOid;
nblocks_in_fork = InvalidBlockNumber;
cur_database is set to be the same as the first database in the array
so that we won't hit
if (cur_database != blk->database)
on the first block.
However, we make cur_filenumber InvalidOid because for the first block
we want to hit code that forces us to open a new relation
if (!rel && cur_filenumber != blk->filenumber)
And nblocks_in_fork to InvalidBlockNumber so that 1) we don't have to
get the number before starting the loop and 2) so we would move past
BlockInfoRecords with invalid filenumber and invalid blocknumber
if (cur_filenumber == blk->filenumber &&
blk->blocknum >= nblocks_in_fork)
So, I'm just thinking if it would be correct to initialize
cur_database to InvalidOid and to check for that before skipping a
block, or if that doesn't work when the first blocks' database is
InvalidOid.
- Melanie
On Mon, Mar 31, 2025 at 3:27 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
I think there might be another problem with the code. We only set
cur_database in the loop in autoprewarm_database_main() when it is 0
if (cur_database != blk->database)
{
if (cur_database == 0)
cur_database = blk->database;
I know that the read stream will return InvalidBlockNumber when we
move onto the next database, but I don't see how we will end up
actually stopping prewarming in autoprewarm_database_main() when we
move on to the next database.
Whoops, this isn't right. It does work. I'm going to draft a version
suggesting slightly different variable naming and a couple comments to
make this more clear.
- Melanie
On Mon, Mar 31, 2025 at 3:45 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
Whoops, this isn't right. It does work. I'm going to draft a version
suggesting slightly different variable naming and a couple comments to
make this more clear.
Okay, so after further study, I think there are multiple issues still
with the code. We could end up comparing a blocknumber to nblocks
calculated from a different fork. To address this, you'll need to keep
track of the last fork_number. At that point, you kind of have to
bring back old_blk -- because that is what we are recreating with
multiple local variables.
But, I think, overall, what we actually want to do is be
really explicit about fast-forwarding in the failure cases (when we
want to skip ahead because a relation is invalid or a fork is
invalid). We were trying to use the main loop control and just add
special cases to allow us to do this fast-forwarding. But, I think
instead, we want to just go to a function or loop somewhere and fast
forward through those bad blocks until we get to the next run of
blocks from a different relation or fork.
I've sketched out an idea like this in the attached. I don't think it
is 100% correct. It does pass tests, but I think we might incorrectly
advance pos twice after skipping a run of blocks belonging to a bad
fork or relation -- and thus skip the first good block after some bad
blocks.
It also needs some more refactoring.
Maybe instead of having the skip code like this
skip_forknumber:;
while ((blk = next_record(block_info, &i)) != NULL &&
blk->database == database && blk->filenumber == filenumber &&
blk->forknum == forknum);
we could make it a function, to avoid the back-to-back while loop
conditions (because of the outer do-while loop)?
But the explicit looping for skipping the bad blocks and the nested
loops for rel and fork -- I think these are less error prone.
What do you think?
- Melanie
Attachments:
pgsr-autoprewarm-nested-loops.patchtext/x-patch; charset=US-ASCII; name=pgsr-autoprewarm-nested-loops.patchDownload
From 438f13072af060b30485b0dd871c1b26ee503513 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH] pgsr autoprewarm
---
contrib/pg_prewarm/autoprewarm.c | 218 +++++++++++++++++++------------
1 file changed, 131 insertions(+), 87 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..459d36a75a9 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -421,6 +422,63 @@ apw_load_buffers(void)
(errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ BlockInfoRecord *block_info;
+ int pos;
+ Oid database;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+};
+
+static BlockNumber
+awp_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+
+ for (int i; (i = p->pos++) < apw_state->prewarm_stop_idx;)
+ {
+ BlockInfoRecord cur_blk = p->block_info[i];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ {
+ p->pos = apw_state->prewarm_stop_idx;
+ return InvalidBlockNumber;
+ }
+
+ if (cur_blk.database != p->database)
+ return InvalidBlockNumber;
+
+ if (cur_blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (cur_blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ if (cur_blk.blocknum >= p->nblocks)
+ continue;
+
+ return cur_blk.blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
+static inline BlockInfoRecord *next_record(BlockInfoRecord *block_info, int *pos)
+{
+ int oldpos;
+ if (*pos >= apw_state->prewarm_stop_idx)
+ return NULL;
+
+ oldpos = *pos;
+ (*pos)++;
+ return &block_info[oldpos];
+}
/*
* Prewarm all blocks for one database (and possibly also global objects, if
@@ -429,12 +487,11 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord *blk;
dsm_segment *seg;
+ Oid database;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -449,108 +506,95 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
- /*
- * Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
- */
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
- {
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ i = apw_state->prewarm_start_idx;
+ blk = &block_info[i];
+ database = blk->database;
- CHECK_FOR_INTERRUPTS();
-
- /*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
- */
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ do
+ {
+ RelFileNumber filenumber = blk->filenumber;
+ Oid reloid;
+ Relation rel;
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
- */
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
- {
- relation_close(rel, AccessShareLock);
- rel = NULL;
- CommitTransactionCommand();
- }
+ StartTransactionCommand();
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
- {
- Oid reloid;
+ reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
+ if (!OidIsValid(reloid))
+ goto skip_relation;
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ if ((rel = try_relation_open(reloid, AccessShareLock)) == NULL)
+ goto skip_relation;
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
+ do
{
- old_blk = blk;
- continue;
- }
+ ForkNumber forknum = blk->forknum;
+ BlockNumber nblocks;
+ struct apw_read_stream_private p;
+ ReadStream *stream;
+ Buffer buf;
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
- {
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk->forknum <= InvalidForkNumber)
+ goto skip_forknumber;
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ if (blk->forknum > MAX_FORKNUM)
+ goto skip_forknumber;
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ if (!smgrexists(RelationGetSmgr(rel), blk->forknum))
+ goto skip_forknumber;
- old_blk = blk;
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- dsm_detach(seg);
+ p = (struct apw_read_stream_private)
+ {
+ .block_info = block_info,
+ .pos = i,
+ .database = database,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ awp_read_stream_next_block,
+ &p,
+ 0);
+
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
+ {
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
+ i = p.pos;
+
+ read_stream_end(stream);
+
+ continue;
+
+ skip_forknumber:;
+ while ((blk = next_record(block_info, &i)) != NULL && blk->database == database && blk->filenumber == filenumber && blk->forknum == forknum);
+ } while ((blk = next_record(block_info, &i)) && blk->database == database && blk->filenumber == filenumber);
+
+ /* Release lock on previous relation. */
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
- }
+
+ continue;
+
+ skip_relation:
+ CommitTransactionCommand();
+ while ((blk = next_record(block_info, &i)) != NULL && blk->database == database && blk->filenumber == filenumber);
+ } while ((blk = next_record(block_info, &i)) && blk->database == database);
+
+ dsm_detach(seg);
}
/*
--
2.34.1
Hi,
On Tue, 1 Apr 2025 at 05:14, Melanie Plageman <melanieplageman@gmail.com> wrote:
On Mon, Mar 31, 2025 at 3:45 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
Whoops, this isn't right. It does work. I'm going to draft a version
suggesting slightly different variable naming and a couple comments to
make this more clear.
Okay, so after further study, I think there are multiple issues still
with the code. We could end up comparing a blocknumber to nblocks
calculated from a different fork. To address this, you'll need to keep
track of the last fork_number. At that point, you kind of have to
bring back old_blk -- because that is what we are recreating with
multiple local variables.
Yes, I realized the same.
But, I think, overall, what we actually want to do is actually be
really explicit about fast-forwarding in the failure cases (when we
want to skip ahead because a relation is invalid or a fork is
invalid). We were trying to use the main loop control and just add
special cases to allow us to do this fast-forwarding. But, I think
instead, we want to just go to a function or loop somewhere and fast
forward through those bad blocks until we get to the next run of
blocks from a different relation or fork.
I've sketched out an idea like this in the attached. I don't think it
is 100% correct. It does pass tests, but I think we might incorrectly
advance pos twice after skipping a run of blocks belonging to a bad
fork or relation -- and thus skip the first good block after some bad
blocks.
The test actually does not pass, it prewarms 4 of 211 blocks. It
prewarms all 211 blocks in the master.
From 'pg_prewarm/001_basic/log/001_basic_main.log':
'autoprewarm successfully prewarmed 4 of 211 previously-loaded blocks'
It also needs some more refactoring.
maybe instead of having the skip code like this
skip_forknumber:;
while ((blk = next_record(block_info, &i)) != NULL &&
blk->database == database && blk->filenumber == filenumber &&
blk->forknum == forknum);
we make it a function? to avoid the back-to-back while loop conditions
(because of the outer do while loop).
+1 for using the functions. I think it is hard to follow / maintain
this with the do-while loops and goto statements.
But the explicit looping for skipping the bad blocks and the nested
loops for rel and fork -- I think these are less error prone.
One question in my mind: the outermost loop stops when the database
changes, but we do not check whether it changed from database oid = 0.
Handling this might require some structural changes.
What do you think?
I think what you said is right but the current version of the patch
looks more complicated to me. I may be biased because I do not like
do-while loops and goto statements.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Hi,
On Tue, 1 Apr 2025 at 05:14, Melanie Plageman <melanieplageman@gmail.com> wrote:
On Mon, Mar 31, 2025 at 3:45 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
Whoops, this isn't right. It does work. I'm going to draft a version
suggesting slightly different variable naming and a couple comments to
make this more clear.
Okay, so after further study, I think there are multiple issues still
with the code. We could end up comparing a blocknumber to nblocks
calculated from a different fork. To address this, you'll need to keep
track of the last fork_number. At that point, you kind of have to
bring back old_blk -- because that is what we are recreating with
multiple local variables.
I am attaching v8, which is an updated version of the v7. I tried to
get rid of these local variables and refactored code to make logic
more straightforward instead of going back and forth.
0001 and 0002 are v8. 0003 is another refactoring attempt to make code
more straightforward. I did not squash 0003 to previous patches as you
might not like it.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
v8-0001-Optimize-autoprewarm-with-read-streams.patchapplication/octet-stream; name=v8-0001-Optimize-autoprewarm-with-read-streams.patchDownload
From 7db3df60698da30bfec4b8cca76342dc6e779f81 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Sat, 29 Mar 2025 20:17:42 +0300
Subject: [PATCH v8 1/3] Optimize autoprewarm with read streams
We've measured 10% performance improvement, and this arranges to benefit
automatically from future optimizations to the read_stream subsystem.
---
contrib/pg_prewarm/autoprewarm.c | 201 +++++++++++++++++++++++--------
1 file changed, 148 insertions(+), 53 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..7154303f0ba 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -421,6 +422,93 @@ apw_load_buffers(void)
(errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ BlockInfoRecord *block_info;
+ BlockNumber nblocks;
+ BlockNumber max_pos;
+ BlockNumber pos;
+ Oid cur_database;
+ ForkNumber cur_forknum;
+ RelFileNumber cur_filenumber;
+};
+
+static BlockNumber
+awp_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+ BlockInfoRecord cur_blk = p->block_info[p->pos];
+ BlockNumber blocknum = InvalidBlockNumber;
+
+ if (have_free_buffer() &&
+ p->pos < p->max_pos &&
+ cur_blk.blocknum < p->nblocks &&
+ cur_blk.database == p->cur_database &&
+ cur_blk.filenumber == p->cur_filenumber &&
+ cur_blk.forknum == p->cur_forknum)
+ {
+ blocknum = cur_blk.blocknum;
+ }
+
+ (p->pos)++;
+ return blocknum;
+}
+
+/*
+ * Helper function to prewarm buffers in a relation by using read streams.
+ */
+static unsigned int
+autoprewarm_prewarm_relation(Relation rel,
+ BlockNumber pos,
+ BlockNumber max_pos,
+ BlockNumber nblocks_in_fork,
+ BlockInfoRecord *block_info)
+{
+ struct apw_read_stream_private p;
+ ReadStream *stream;
+ unsigned int blocks_done = 0;
+ BlockInfoRecord first_block = block_info[pos];
+
+ p.pos = pos;
+ p.max_pos = max_pos;
+ p.block_info = block_info;
+ p.nblocks = nblocks_in_fork;
+ p.cur_database = first_block.database;
+ p.cur_forknum = first_block.forknum;
+ p.cur_filenumber = first_block.filenumber;
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ first_block.forknum,
+ awp_read_stream_next_block,
+ &p,
+ 0);
+
+ while (true)
+ {
+ Buffer buf;
+
+ CHECK_FOR_INTERRUPTS();
+
+ buf = read_stream_next_buffer(stream, NULL);
+ if (BufferIsValid(buf))
+ {
+ ReleaseBuffer(buf);
+ ++blocks_done;
+ }
+ else
+ {
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+ break;
+ }
+ }
+
+ return blocks_done;
+}
/*
* Prewarm all blocks for one database (and possibly also global objects, if
@@ -432,9 +520,9 @@ autoprewarm_database_main(Datum main_arg)
int pos;
BlockInfoRecord *block_info;
Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
dsm_segment *seg;
+ Oid cur_database;
+ BlockNumber nblocks_in_fork;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -451,32 +539,34 @@ autoprewarm_database_main(Datum main_arg)
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ cur_database = block_info[pos].database;
+
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
-
- CHECK_FOR_INTERRUPTS();
+ BlockInfoRecord *blk = &block_info[pos];
/*
* Quit if we've reached records for another database. If previous
* blocks are of some global objects, then continue pre-warming.
*/
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ if (cur_database != blk->database)
+ {
+ if (cur_database == 0)
+ cur_database = blk->database;
+ else
+ break;
+ }
/*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
+ * Close the old relation. Note that rel will be NULL if
+ * try_relation_open failed previously; in that case, there is nothing
+ * to close.
*/
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ if (rel)
{
relation_close(rel, AccessShareLock);
rel = NULL;
@@ -487,60 +577,65 @@ autoprewarm_database_main(Datum main_arg)
* Try to open each new relation, but only once, when we first
* encounter it. If it's been dropped, skip the associated blocks.
*/
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
+ if (!rel)
{
Oid reloid;
- Assert(rel == NULL);
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
if (OidIsValid(reloid))
rel = try_relation_open(reloid, AccessShareLock);
if (!rel)
+ {
CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
- continue;
- }
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
- {
- /*
- * smgrexists is not safe for illegal forknum, hence check whether
- * the passed forknum is valid before using it in smgrexists.
- */
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ /* Move to next filenumber. */
+ while (true)
+ {
+ BlockInfoRecord cur_blk = block_info[pos++];
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ if (cur_blk.database != blk->database ||
+ cur_blk.filenumber != blk->filenumber)
+ break;
+ }
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
+ continue;
+ }
+ else if (smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
+ unsigned int nblocks_processed;
+
+ nblocks_in_fork = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+ nblocks_processed = autoprewarm_prewarm_relation(rel,
+ pos,
+ stop_idx,
+ nblocks_in_fork,
+ block_info);
+
+ apw_state->prewarmed_blocks += nblocks_processed;
+
+ /* Move pos forward by at least one. */
+ pos += Max(nblocks_processed, 1);
+
+ /* Move to next forknum. */
+ while (true)
+ {
+ BlockInfoRecord cur_blk = block_info[pos];
+
+ if (cur_blk.database == blk->database &&
+ cur_blk.filenumber == blk->filenumber &&
+ cur_blk.forknum == blk->forknum)
+ pos++;
+ else
+ break;
+ }
+
+ continue;
+ }
}
- old_blk = blk;
+ pos++;
}
dsm_detach(seg);
--
2.43.0
v8-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchapplication/octet-stream; name=v8-0002-Count-free-buffers-at-the-start-of-the-autoprewar.patchDownload
From 0f640c042458b98c6b7fbd9360561c3623b8d9c7 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Mon, 31 Mar 2025 12:21:27 +0300
Subject: [PATCH v8 2/3] Count free buffers at the start of the autoprewarm
Streamified version of the autoprewarm code may do unnecessary I/O and
buffer evicting. To prevent it at least a little bit, count the number
of free buffers in the buffer pool and queue buffers up to that number
in the callback function of the autoprewarm.
---
contrib/pg_prewarm/autoprewarm.c | 5 ++++-
src/backend/storage/buffer/freelist.c | 17 +++++++++++++++++
src/include/storage/buf_internals.h | 1 +
3 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 7154303f0ba..4f635eaf31c 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -518,6 +518,7 @@ void
autoprewarm_database_main(Datum main_arg)
{
int pos;
+ int stop_idx;
BlockInfoRecord *block_info;
Relation rel = NULL;
dsm_segment *seg;
@@ -538,6 +539,8 @@ autoprewarm_database_main(Datum main_arg)
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
pos = apw_state->prewarm_start_idx;
+ stop_idx = Min(apw_state->prewarm_stop_idx,
+ pos + get_number_of_free_buffers());
cur_database = block_info[pos].database;
@@ -545,7 +548,7 @@ autoprewarm_database_main(Datum main_arg)
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (pos < stop_idx && have_free_buffer())
{
BlockInfoRecord *blk = &block_info[pos];
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 336715b6c63..9f26e940426 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -180,6 +180,23 @@ have_free_buffer(void)
return false;
}
+/*
+ * get_number_of_free_buffers -- a lockless way to get the number of free
+ * buffers in buffer pool.
+ *
+ * Note that result continuously changes as free buffers are moved out by other
+ * operations.
+ */
+int
+get_number_of_free_buffers(void)
+{
+ /* All the buffers are free. */
+ if (StrategyControl->firstFreeBuffer < 0)
+ return NBuffers;
+ else
+ return StrategyControl->lastFreeBuffer - StrategyControl->firstFreeBuffer;
+}
+
/*
* StrategyGetBuffer
*
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 72b36a4af26..cdc9936fccd 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -448,6 +448,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
+extern int get_number_of_free_buffers(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
--
2.43.0
v8-0003-Refactor-code-more.patchapplication/octet-stream; name=v8-0003-Refactor-code-more.patchDownload
From 8136a56cb63ff846b30998d88f211ffff68e0e5b Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 1 Apr 2025 15:38:51 +0300
Subject: [PATCH v8 3/3] Refactor code more
---
contrib/pg_prewarm/autoprewarm.c | 116 ++++++++++++++-----------------
1 file changed, 51 insertions(+), 65 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 4f635eaf31c..1f31d6eaa0e 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -550,6 +550,7 @@ autoprewarm_database_main(Datum main_arg)
*/
while (pos < stop_idx && have_free_buffer())
{
+ Oid reloid;
BlockInfoRecord *blk = &block_info[pos];
/*
@@ -564,91 +565,76 @@ autoprewarm_database_main(Datum main_arg)
break;
}
- /*
- * Close the old relation. Note that rel will be NULL if
- * try_relation_open failed previously; in that case, there is nothing
- * to close.
- */
- if (rel)
- {
- relation_close(rel, AccessShareLock);
- rel = NULL;
- CommitTransactionCommand();
- }
+ /* Try to open new relation. */
+ StartTransactionCommand();
+ reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
+ if (OidIsValid(reloid))
+ rel = try_relation_open(reloid, AccessShareLock);
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
if (!rel)
{
- Oid reloid;
-
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ CommitTransactionCommand();
- if (!rel)
+ /* Move to next filenumber. */
+ while (pos < stop_idx && have_free_buffer())
{
- CommitTransactionCommand();
-
- /* Move to next filenumber. */
- while (true)
- {
- BlockInfoRecord cur_blk = block_info[pos++];
+ BlockInfoRecord cur_blk = block_info[pos++];
- if (cur_blk.database != blk->database ||
- cur_blk.filenumber != blk->filenumber)
- break;
- }
-
- continue;
+ if (cur_blk.database != blk->database ||
+ cur_blk.filenumber != blk->filenumber)
+ break;
}
- else if (smgrexists(RelationGetSmgr(rel), blk->forknum))
- {
- unsigned int nblocks_processed;
- nblocks_in_fork = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- nblocks_processed = autoprewarm_prewarm_relation(rel,
- pos,
- stop_idx,
- nblocks_in_fork,
- block_info);
+ continue;
+ }
+ else if (smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
+ unsigned int nblocks_processed;
- apw_state->prewarmed_blocks += nblocks_processed;
+ nblocks_in_fork = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+ nblocks_processed = autoprewarm_prewarm_relation(rel,
+ pos,
+ stop_idx,
+ nblocks_in_fork,
+ block_info);
- /* Move pos forward by at least one. */
- pos += Max(nblocks_processed, 1);
+ apw_state->prewarmed_blocks += nblocks_processed;
- /* Move to next forknum. */
- while (true)
- {
- BlockInfoRecord cur_blk = block_info[pos];
+ /* Move pos forward by at least one. */
+ pos += Max(nblocks_processed, 1);
- if (cur_blk.database == blk->database &&
- cur_blk.filenumber == blk->filenumber &&
- cur_blk.forknum == blk->forknum)
- pos++;
- else
- break;
- }
+ /* Move to next forknum. */
+ while (pos < stop_idx && have_free_buffer())
+ {
+ BlockInfoRecord cur_blk = block_info[pos];
- continue;
+ if (cur_blk.database == blk->database &&
+ cur_blk.filenumber == blk->filenumber &&
+ cur_blk.forknum == blk->forknum)
+ pos++;
+ else
+ break;
}
+
+ /* Close the relation. */
+ relation_close(rel, AccessShareLock);
+ rel = NULL;
+ CommitTransactionCommand();
+
+ continue;
+ }
+ else
+ {
+ /* Close the relation. */
+ relation_close(rel, AccessShareLock);
+ rel = NULL;
+ CommitTransactionCommand();
}
pos++;
}
dsm_detach(seg);
-
- /* Release lock on previous relation. */
- if (rel)
- {
- relation_close(rel, AccessShareLock);
- CommitTransactionCommand();
- }
}
/*
--
2.43.0
On Tue, Apr 1, 2025 at 7:21 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
On Tue, 1 Apr 2025 at 05:14, Melanie Plageman <melanieplageman@gmail.com> wrote:
+1 for using the functions. I think it is hard to follow / maintain
this with the do-while loops and goto statements.
I'll take a look at your downthread proposal in a bit.
But the attached patch is a new version of what I proposed with the
functions. It's still not totally correct, but I wanted to see what
you thought.
But the explicit looping for skipping the bad blocks and the nested
loops for rel and fork -- I think these are less error prone.
One question in my mind is, the outermost loop stops when the database
changes, we do not check if it is changed from the database oid = 0.
Handling this might require some structural changes.
I don't understand why each database has global objects at the
beginning. If there are global objects, they are global to all
databases, so surely the sort function would have put them all at the
beginning? One problem is we need a database connection to prewarm
these, but if the global objects are all at the beginning, then maybe
we can handle those with a special case and not force ourselves to
check for them when trying to load blocks from every database.
- Melanie
Attachments:
pgsr-autoprewarm-loopsv2.patchtext/x-patch; charset=US-ASCII; name=pgsr-autoprewarm-loopsv2.patchDownload
From f1c98580825acc63538372e9f67f6ed2bf68b540 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH] pgsr autoprewarm
---
contrib/pg_prewarm/autoprewarm.c | 244 ++++++++++++++++++++-----------
1 file changed, 160 insertions(+), 84 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..fd3fb228715 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -421,6 +422,96 @@ apw_load_buffers(void)
(errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
apw_state->prewarmed_blocks, num_elements)));
}
+struct apw_read_stream_private
+{
+ BlockInfoRecord *block_info;
+ int pos;
+ Oid database;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+};
+
+static BlockNumber
+awp_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct apw_read_stream_private *p = callback_private_data;
+
+ for (int i; (i = p->pos++) < apw_state->prewarm_stop_idx;)
+ {
+ BlockInfoRecord cur_blk = p->block_info[i];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ {
+ p->pos = apw_state->prewarm_stop_idx;
+ return InvalidBlockNumber;
+ }
+
+ if (cur_blk.database != p->database)
+ return InvalidBlockNumber;
+
+ if (cur_blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (cur_blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ if (cur_blk.blocknum >= p->nblocks)
+ continue;
+
+ return cur_blk.blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
+static int
+next_fork_idx(BlockInfoRecord *block_info, int start, int stop,
+ Oid database, RelFileNumber filenumber, ForkNumber forknum)
+{
+ Assert(block_info[start].forknum == forknum);
+
+ for (int i = start; i < stop; i++)
+ {
+ BlockInfoRecord *blk = &block_info[i];
+
+ if (blk->database != database)
+ return i;
+
+ if (blk->filenumber != filenumber)
+ return i;
+
+ if (blk->forknum != forknum)
+ return i;
+ }
+
+ return stop;
+}
+
+static int
+next_rel_idx(BlockInfoRecord *block_info, int start, int stop,
+ Oid database, RelFileNumber filenumber)
+{
+ Assert(block_info[start].filenumber == filenumber);
+
+ for (int i = start; i < stop; i++)
+ {
+ BlockInfoRecord *blk = &block_info[i];
+
+ if (blk->database != database)
+ return i;
+
+ if (blk->filenumber != filenumber)
+ return i;
+ }
+
+ return stop;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
@@ -429,12 +520,11 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord *blk;
dsm_segment *seg;
+ Oid database;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -449,108 +539,94 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
- /*
- * Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
- */
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
- {
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ i = apw_state->prewarm_start_idx;
+ blk = &block_info[i];
+ database = blk->database;
- CHECK_FOR_INTERRUPTS();
+ while (i < apw_state->prewarm_stop_idx &&
+ blk->database == database)
+ {
+ RelFileNumber filenumber = blk->filenumber;
+ Oid reloid;
+ Relation rel;
- /*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
- */
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ StartTransactionCommand();
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
- */
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
+ if (!OidIsValid(reloid) ||
+ (OidIsValid(reloid) &&
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL))
{
- relation_close(rel, AccessShareLock);
- rel = NULL;
CommitTransactionCommand();
- }
-
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
- {
- Oid reloid;
-
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
-
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ i = next_rel_idx(block_info, i, apw_state->prewarm_stop_idx,
+ blk->database, blk->filenumber);
+ blk = &block_info[i];
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ while (i < apw_state->prewarm_stop_idx &&
+ blk->database == database &&
+ blk->filenumber == filenumber)
{
+ ForkNumber forknum = blk->forknum;
+ BlockNumber nblocks;
+ struct apw_read_stream_private p;
+ ReadStream *stream;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk->forknum <= InvalidForkNumber ||
+ blk->forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
+ i = next_fork_idx(block_info, i, apw_state->prewarm_stop_idx,
+ blk->database, blk->filenumber, blk->forknum);
+ blk = &block_info[i];
+ continue;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ p = (struct apw_read_stream_private)
+ {
+ .block_info = block_info,
+ .pos = i,
+ .database = database,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ awp_read_stream_next_block,
+ &p,
+ 0);
+
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
+ {
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
- old_blk = blk;
- }
+ read_stream_end(stream);
- dsm_detach(seg);
+ i = p.pos;
+ blk = &block_info[i];
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
+ /* Release lock on previous relation. */
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
On Tue, Apr 1, 2025 at 8:50 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
I am attaching v8, which is an updated version of the v7. I tried to
get rid of these local variables and refactored code to make logic
more straightforward instead of going back and forth.
0001 and 0002 are v8. 0003 is another refactoring attempt to make code
more straightforward. I did not squash 0003 to previous patches as you
might not like it.
I looked at the code on your github branch that has all three of these
squashed together.
I think our approaches are converging. I like that you are
fast-forwarding to the next filenumber or fork number explicitly when
there is a bad relation or fork. I've changed my version (see newest
one attached) to do the fast-forwarding inline instead of in a
separate function like yours (the function didn't save many LOC and
actually may have added to cognitive overhead).
Compared to my version, I think you avoided one level of loop nesting with your
if (!rel)
else if (smgrexists(RelationGetSmgr(rel), blk->forknum))
else
but for starters, I don't think you can do this:
else if (smgrexists(RelationGetSmgr(rel), blk->forknum))
because you didn't check if you have a legal forknum first
And, I actually kind of prefer the explicitly nested structure
loop through all relations
loop through all forks
loop through all buffers
While in the old structure, I liked your
autoprewarm_prewarm_relation() function, but I think it is nicer
inlined like in my version. It makes the loop through all buffers
explicit too.
I know you mentioned off-list that you don't like the handling of
global objects in my version, but I prefer doing it this way (even
though we have to check for it in the loop condition) to having to set
the current database once we reach non-shared objects. It feels too
fiddly. This way seems less error prone. Looking at this version, what
do you think? Could we do it better?
Let me know what you think of this version. I think it is the best of
both our approaches. I've separated it into two commits -- the first
does all the refactoring without using the read stream API and the
second one uses the read stream API.
On another topic, what are the minimal places we need to call
have_free_buffer() (in this version)? I haven't even started looking
at the last patch you've been sending that is about checking the
freelist. I'll have to do that next.
- Melanie
Attachments:
0001-Refactor-autoprewarm_database_main-in-preparation-fo.patchtext/x-patch; charset=US-ASCII; name=0001-Refactor-autoprewarm_database_main-in-preparation-fo.patchDownload
From 2d14f37ea824a6ce3605f53b0a9c5622f08bf6e7 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH 1/2] Refactor autoprewarm_database_main() in preparation for
read stream
TODO: write a commit message
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 166 +++++++++++++++++--------------
1 file changed, 89 insertions(+), 77 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..84072587ea0 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -429,12 +429,11 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord *blk;
dsm_segment *seg;
+ Oid database;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -449,108 +448,121 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
+
+ i = apw_state->prewarm_start_idx;
+ database = apw_state->database;
+
+ blk = &block_info[i];
/*
* Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
+ * buffers. We'll quit if we've reached records for another database,
+ * however we do want to prewarm any global objects.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (i < apw_state->prewarm_stop_idx &&
+ (blk->database == database || blk->database == 0) &&
+ have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ RelFileNumber filenumber = blk->filenumber;
+ Oid reloid;
+ Relation rel;
CHECK_FOR_INTERRUPTS();
- /*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
- */
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ StartTransactionCommand();
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
- */
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
+ if (!OidIsValid(reloid) ||
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL)
{
- relation_close(rel, AccessShareLock);
- rel = NULL;
+ /* We failed to open the relation, so there is nothing to close. */
CommitTransactionCommand();
- }
-
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
- {
- Oid reloid;
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ /*
+ * Fast-forward to the next relation. We want to skip all of the
+ * other records referencing this relation since we know we can't
+ * open it. That way, we avoid repeatedly trying and failing to
+ * open the same relation.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = &block_info[i];
+ if ((blk->database != database && blk->database != 0) ||
+ blk->filenumber != filenumber)
+ break;
+ }
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ /* Time to try and open our new found relation */
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ /*
+ * We have a relation; now let's loop until we find a valid fork of
+ * the relation or we run out of free buffers. Once we've read from
+ * all valid forks or run out of options, we'll close the relation and
+ * move on.
+ */
+ while (i < apw_state->prewarm_stop_idx &&
+ (blk->database == database || blk->database == 0) &&
+ blk->filenumber == filenumber &&
+ have_free_buffer())
{
+ ForkNumber forknum = blk->forknum;
+ BlockNumber nblocks;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk->forknum <= InvalidForkNumber ||
+ blk->forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
+ /*
+ * Fast-forward to the next fork. We want to skip all of the
+ * other records referencing this fork since we already know
+ * it's not valid.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = &block_info[i];
+ if ((blk->database != database && blk->database != 0) ||
+ blk->filenumber != filenumber ||
+ blk->forknum != forknum)
+ break;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ continue;
+ }
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- old_blk = blk;
- }
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk->blocknum >= nblocks)
+ {
+ blk = &block_info[++i];
+ continue;
+ }
- dsm_detach(seg);
+ /* Prewarm buffer. */
+ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
+ NULL);
+
+ if (BufferIsValid(buf))
+ {
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+
+ blk = &block_info[++i];
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
0002-streaming-read-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=0002-streaming-read-autoprewarm.patchDownload
From ff77f60d670f1c25699e969a48f7bbd5a577a405 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 1 Apr 2025 18:07:38 -0400
Subject: [PATCH 2/2] streaming read autoprewarm
TODO: write a commit message
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 95 +++++++++++++++++++++++++++-----
src/tools/pgindent/typedefs.list | 1 +
2 files changed, 82 insertions(+), 14 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 84072587ea0..667aa895b18 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -75,6 +76,21 @@ typedef struct AutoPrewarmSharedState
int prewarmed_blocks;
} AutoPrewarmSharedState;
+/*
+ * Private data passed through the read stream API for our use in the
+ * callback.
+ */
+typedef struct AutoPrewarmReadStreamData
+{
+ BlockInfoRecord *block_info;
+ int pos;
+ Oid database;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+} AutoPrewarmReadStreamData;
+
+
PGDLLEXPORT void autoprewarm_main(Datum main_arg);
PGDLLEXPORT void autoprewarm_database_main(Datum main_arg);
@@ -422,6 +438,45 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+/*
+ * Return the next block number of a specific relation and fork to read
+ * according to the array of BlockInfoRecord.
+ */
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ AutoPrewarmReadStreamData *p = callback_private_data;
+ BlockInfoRecord blk;
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ p->pos = apw_state->prewarm_stop_idx;
+
+ if (p->pos >= apw_state->prewarm_stop_idx)
+ return InvalidBlockNumber;
+
+ blk = p->block_info[p->pos];
+
+ if (blk.database != p->database && blk.database != 0)
+ return InvalidBlockNumber;
+
+ if (blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk.blocknum >= p->nblocks)
+ return InvalidBlockNumber;
+
+ p->pos++;
+ return blk.blocknum;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -467,8 +522,6 @@ autoprewarm_database_main(Datum main_arg)
Oid reloid;
Relation rel;
- CHECK_FOR_INTERRUPTS();
-
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
@@ -509,6 +562,8 @@ autoprewarm_database_main(Datum main_arg)
{
ForkNumber forknum = blk->forknum;
BlockNumber nblocks;
+ struct AutoPrewarmReadStreamData p;
+ ReadStream *stream;
Buffer buf;
/*
@@ -538,24 +593,36 @@ autoprewarm_database_main(Datum main_arg)
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
+ p = (struct AutoPrewarmReadStreamData)
{
- blk = &block_info[++i];
- continue;
- }
-
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
-
- if (BufferIsValid(buf))
+ .block_info = block_info,
+ .pos = i,
+ .database = database,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
+
+ /* Receive our prewarmed buffers */
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
{
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
- blk = &block_info[++i];
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+
+ i = p.pos;
+ blk = &block_info[i];
}
relation_close(rel, AccessShareLock);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 449bafc123c..a36e372526c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -175,6 +175,7 @@ AttributeOpts
AuthRequest
AuthToken
AutoPrewarmSharedState
+AutoPrewarmReadStreamData
AutoVacOpts
AutoVacuumShmemStruct
AutoVacuumWorkItem
--
2.34.1
Hi,
On Wed, 2 Apr 2025 at 01:36, Melanie Plageman <melanieplageman@gmail.com> wrote:
On Tue, Apr 1, 2025 at 8:50 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
I am attaching v8, which is an updated version of the v7. I tried to
get rid of these local variables and refactored code to make logic
more straightforward instead of going back and forth.
0001 and 0002 are v8. 0003 is another refactoring attempt to make code
more straightforward. I did not squash 0003 to previous patches as you
might not like it.
I looked at the code on your github branch that has all three of these
squashed together.
Thank you!
I think our approaches are converging. I like that you are
fast-forwarding to the next filenumber or fork number explicitly when
there is a bad relation or fork. I've changed my version (see newest
one attached) to do the fast-forwarding inline instead of in a
separate function like yours (the function didn't save many LOC and
actually may have added to cognitive overhead).
Compared to my version, I think you avoided one level of loop nesting with your
if (!rel)
else if (smgrexists(RelationGetSmgr(rel), blk->forknum))
else
but for starters, I don't think you can do this:
else if (smgrexists(RelationGetSmgr(rel), blk->forknum))
because you didn't check if you have a legal forknum first
You are right, I missed that. I think smgrexists() should return false
if the forknum is invalid, but that is not a topic for this thread.
And, I actually kind of prefer the explicitly nested structure
loop through all relations
loop through all forks
loop through all buffers
I prefer this as well. We know when we opened the relation, so we do
not need to close it in two places like I did.
While in the old structure, I liked your
autoprewarm_prewarm_relation() function, but I think it is nicer
inlined like in my version. It makes the loop through all buffers
explicit too.
Yes, I liked your approach.
I know you mentioned off-list that you don't like the handling of
global objects in my version, but I prefer doing it this way (even
though we have to check for it in the loop condition) to having to set
the current database once we reach non-shared objects. It feels too
fiddly. This way seems less error prone. Looking at this version, what
do you think? Could we do it better?
I think there might be a problem with that approach. Let's say that we
are able to open relation when database oid = 0 and filenumber = 18.
Then we are trying to find a valid fork now. We couldn't find a valid
fork immediately, so we continued looping. Then database oid is
changed from 0 to let's say 1 but filenumber remains the same. We are
still in the valid fork loop, so relation remains from the database
oid = 0. Isn't that wrong?
Let me know what you think of this version. I think it is the best of
both our approaches. I've separated it into two commits -- the first
does all the refactoring without using the read stream API and the
second one uses the read stream API.
Some comments,
0001:
ReadBufferExtended() can be called in its own minimal loop, otherwise
we end up doing unnecessary checks for each ReadBufferExtended() call.
This is not a problem when the 0002 is applied.
0002:
We don't skip blocks whose blocknum is more than nblocks_in_fork. We
can add that either to a stream callback like you did before or after
the read_stream_end. I prefer stream callback because of the reason
below [1].
On another topic, what are the minimal places we need to call
have_free_buffer() (in this version)? I haven't even started looking
at the last patch you've been sending that is about checking the
freelist. I'll have to do that next.
I think its current places are good enough. We may add one after the
read_stream_end if we want to handle blk->blocknum >= nblocks_in_fork
after the read stream finishes. If we handle that in the stream
callback, then there is no need to add have_free_buffer() [1].
Other than these comments, I think the current structure looks good.
--
Regards,
Nazir Bilal Yavuz
Microsoft
On Wed, Apr 2, 2025 at 6:26 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
On Wed, 2 Apr 2025 at 01:36, Melanie Plageman <melanieplageman@gmail.com> wrote:
I know you mentioned off-list that you don't like the handling of
global objects in my version, but I prefer doing it this way (even
though we have to check for it in the loop condition) to having to set
the current database once we reach non-shared objects. It feels too
fiddly. This way seems less error prone. Looking at this version, what
do you think? Could we do it better?
I think there might be a problem with that approach. Let's say that we
are able to open relation when database oid = 0 and filenumber = 18.
Then we are trying to find a valid fork now. We couldn't find a valid
fork immediately, so we continued looping. Then database oid is
changed from 0 to let's say 1 but filenumber remains the same. We are
still in the valid fork loop, so relation remains from the database
oid = 0. Isn't that wrong?
Yep, you are totally right. The code was wrong. We could fix it by
setting current_db to the valid database once we've prewarmed the
global objects, but we need that logic in three places, so that seems
quite undesirable.
In attached v9, I've added a patch to apw_load_buffers() which invokes
autoprewarm_database_main() for the global objects alone but while
connected to the first valid database. It's not the best solution but
I think it is better than having that fiddly logic everywhere about
database 0.
This made me think, I wonder if we could connect to template0 or
template1 to prewarm the global objects. Then we could also prewarm if
only global objects are present (that doesn't seem very important but
it would be a side effect). It might be more clear to connect to
template0/1 instead of the first valid database to prewarm global
objects. I don't know if there is some reason not to do this -- like
maybe bg workers aren't allowed or something?
0001:
ReadBufferExtended() can be called in its own minimal loop, otherwise
we end up doing unnecessary checks for each ReadBufferExtended() call.
This is not a problem when the 0002 is applied.
Could you provide a snippet of example code? If we call
ReadBufferExtended() in a loop, the block won't be changing, so I
don't see how that will help.
0002:
We don't skip blocks whose blocknum is more than nblocks_in_fork. We
can add that either to a stream callback like you did before or after
the read_stream_end. I prefer stream callback because of the reason
below [1].
Yep, I also thought we had to have that logic, but because we sort by
db,rel,fork,blkno, I think blocks with blocknumber >= nblocks_in_fork
will be last and so we just want to move on to the next fork.
On another topic, what are the minimal places we need to call
have_free_buffer() (in this version)? I haven't even started looking
at the last patch you've been sending that is about checking the
freelist. I'll have to do that next.
I think its current places are good enough. We may add one after the
read_stream_end if we want to handle blk->blocknum >= nblocks_in_fork
after the read stream finishes. If we handle that in the stream
callback, then there is no need to add have_free_buffer() [1].
As long as we have it in the callback, I don't think that we need
have_free_buffer() after read_stream_end() since it is in the while
loop condition which we will immediately execute after
read_stream_end().
I was also wondering about the other patch in your earlier set that
set stop_idx from get_number_of_free_buffers(). Could you tell me more
about that? What does it do and why is it needed with the read stream
but wasn't needed before?
- Melanie
Attachments:
v9-0001-Autoprewarm-global-objects-separately.patchtext/x-patch; charset=US-ASCII; name=v9-0001-Autoprewarm-global-objects-separately.patchDownload
From b6e8454d632e22ed39db0b343e7be4e9b91900ea Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 2 Apr 2025 09:52:54 -0400
Subject: [PATCH v9 1/3] Autoprewarm global objects separately
Autoprewarm previously prewarmed global objects while prewarming blocks
from objects from the first valid database it encountered. This was
because you can't read in buffers without being connected to a database.
Prewarming global objects and objects from a single database in the
autoprewarm_database_main() function required a special case. Once we
convert autoprewarm to use a read stream, this special case will have to
be duplicated in multiple places.
Instead, modify apw_load_buffers() to prewarm the shared objects in one
invocation of autoprewarm_database_main() while connected to the first
valid database.
It is a bit fiddly but seems better than the alternative.
---
contrib/pg_prewarm/autoprewarm.c | 51 ++++++++++++++------------------
1 file changed, 22 insertions(+), 29 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..172654fee25 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -347,44 +347,41 @@ apw_load_buffers(void)
apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
apw_state->prewarmed_blocks = 0;
- /* Get the info position of the first block of the next database. */
+ /*
+ * Loop through the records and launch a database worker to process
+ * objects in each database. We'll stop at the boundary of each new
+ * database and prewarm those blocks before moving to the next.
+ */
while (apw_state->prewarm_start_idx < num_elements)
{
int j = apw_state->prewarm_start_idx;
Oid current_db = blkinfo[j].database;
/*
- * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
- * not belong to this database.
+ * Advance the position to the first BlockInfoRecord that does not
+ * belong to the current database.
*/
- j++;
- while (j < num_elements)
+ for (; j < num_elements; j++)
{
- if (current_db != blkinfo[j].database)
- {
- /*
- * Combine BlockInfoRecords for global objects with those of
- * the database.
- */
- if (current_db != InvalidOid)
- break;
- current_db = blkinfo[j].database;
- }
-
- j++;
+ if (blkinfo[j].database != current_db)
+ break;
}
/*
- * If we reach this point with current_db == InvalidOid, then only
- * BlockInfoRecords belonging to global objects exist. We can't
- * prewarm without a database connection, so just bail out.
+ * We can't prewarm without a database connection, so if all of the
+ * records belong to global objects, we have to bail out.
*/
- if (current_db == InvalidOid)
+ if (current_db == InvalidOid && blkinfo[j].database == InvalidOid)
break;
+ /* Connect to the first valid db to prewarm global objects. */
+ if (current_db == InvalidOid)
+ current_db = blkinfo[j].database;
+
/* Configure stop point and database for next per-database worker. */
apw_state->prewarm_stop_idx = j;
apw_state->database = current_db;
+
Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
/* If we've run out of free buffers, don't launch another worker. */
@@ -423,8 +420,8 @@ apw_load_buffers(void)
}
/*
- * Prewarm all blocks for one database (and possibly also global objects, if
- * those got grouped with this database).
+ * Prewarm all blocks for one database or global objects (while connected to a
+ * valid database).
*/
void
autoprewarm_database_main(Datum main_arg)
@@ -462,12 +459,8 @@ autoprewarm_database_main(Datum main_arg)
CHECK_FOR_INTERRUPTS();
- /*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
- */
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
+ /* Quit if we've reached records for another database. */
+ if (old_blk != NULL && old_blk->database != blk->database)
break;
/*
--
2.34.1
v9-0003-Use-streaming-read-I-O-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v9-0003-Use-streaming-read-I-O-in-autoprewarm.patchDownload
From e21c0bf3f16dfdd531d22e7da7167ae9a864c6cd Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 1 Apr 2025 18:07:38 -0400
Subject: [PATCH v9 3/3] Use streaming read I/O in autoprewarm
Make a read stream for each valid fork of each valid relation
represented in the autoprewarm dump file and prewarm those blocks
through the read stream API instead of by directly invoking
ReadBuffer().
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 105 +++++++++++++++++++++++++++----
src/tools/pgindent/typedefs.list | 1 +
2 files changed, 93 insertions(+), 13 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index fb132fbb533..e85931e9c1a 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -75,6 +76,28 @@ typedef struct AutoPrewarmSharedState
int prewarmed_blocks;
} AutoPrewarmSharedState;
+/*
+ * Private data passed through the read stream API for our use in the
+ * callback.
+ */
+typedef struct AutoPrewarmReadStreamData
+{
+ /* The array of records containing the blocks we should prewarm. */
+ BlockInfoRecord *block_info;
+
+ /*
+ * `pos` is the read stream callback's index into block_info. Because the
+ * read stream may read ahead, pos is likely to be ahead of the index in
+ * the main loop in autoprewarm_database_main().
+ */
+ int pos;
+ Oid database;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+} AutoPrewarmReadStreamData;
+
+
PGDLLEXPORT void autoprewarm_main(Datum main_arg);
PGDLLEXPORT void autoprewarm_database_main(Datum main_arg);
@@ -419,6 +442,45 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+/*
+ * Return the next block number of a specific relation and fork to read
+ * according to the array of BlockInfoRecord.
+ */
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ AutoPrewarmReadStreamData *p = callback_private_data;
+ BlockInfoRecord blk;
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ p->pos = apw_state->prewarm_stop_idx;
+
+ if (p->pos >= apw_state->prewarm_stop_idx)
+ return InvalidBlockNumber;
+
+ blk = p->block_info[p->pos];
+
+ if (blk.database != p->database)
+ return InvalidBlockNumber;
+
+ if (blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk.blocknum >= p->nblocks)
+ return InvalidBlockNumber;
+
+ p->pos++;
+ return blk.blocknum;
+}
+
/*
* Prewarm all blocks for one database or global objects (while connected to a
* valid database).
@@ -467,8 +529,6 @@ autoprewarm_database_main(Datum main_arg)
Oid reloid;
Relation rel;
- CHECK_FOR_INTERRUPTS();
-
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
@@ -509,6 +569,8 @@ autoprewarm_database_main(Datum main_arg)
{
ForkNumber forknum = blk->forknum;
BlockNumber nblocks;
+ struct AutoPrewarmReadStreamData p;
+ ReadStream *stream;
Buffer buf;
/*
@@ -538,24 +600,41 @@ autoprewarm_database_main(Datum main_arg)
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
+ p = (struct AutoPrewarmReadStreamData)
{
- blk = &block_info[++i];
- continue;
- }
+ .block_info = block_info,
+ .pos = i,
+ .database = database,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
-
- if (BufferIsValid(buf))
+ /*
+ * Loop until we've prewarmed all the blocks from this fork. The
+ * read stream callback will check that we still have free buffers
+ * before requesting each block from the read stream API.
+ */
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
{
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
- blk = &block_info[++i];
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+
+ /* Advance i past all the blocks just prewarmed. */
+ i = p.pos;
+ blk = &block_info[i];
}
relation_close(rel, AccessShareLock);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 449bafc123c..a36e372526c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -175,6 +175,7 @@ AttributeOpts
AuthRequest
AuthToken
AutoPrewarmSharedState
+AutoPrewarmReadStreamData
AutoVacOpts
AutoVacuumShmemStruct
AutoVacuumWorkItem
--
2.34.1
v9-0002-Refactor-autoprewarm_database_main-in-preparation.patchtext/x-patch; charset=US-ASCII; name=v9-0002-Refactor-autoprewarm_database_main-in-preparation.patchDownload
From 8c539abb2a3d9d23fb97e28bd722b85453cf2897 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH v9 2/3] Refactor autoprewarm_database_main() in preparation
for read stream
The read stream API requires the user to provide a callback that can be
invoked multiple times returning the next block to read.
Autoprewarm uses a sorted array of BlockInfoRecords representing each block we
want to try and prewarm from all databases and tables.
autoprewarm_database_main() prewarms all the blocks from a single
database. It is optimized to ensure we don't try to open the same
relation or fork over and over again if it has been dropped or is
invalid. The main loop handles this by carefully setting various local
variables to sentinel values when a run of blocks should be skipped,
allowing it to still only advance one spot in the array on each
iteration.
This method no longer works when using the read stream API. The callback
must be able to advance the position in the array multiple times. Even
duplicating the entirety of the logic in autoprewarm_database_main() in
the read stream callback was not enough to make this continue to work.
Instead, change autoprewarm_database_main() to explicitly fast-forward
in the array past the blocks belonging to an invalid relation or
fork. This gives autoprewarm_database_main() the following structure:
loop through each valid relation
loop through each valid fork
loop through each valid buffer
This commit only implements the new control flow -- it does not use the
read stream API.
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 165 +++++++++++++++++--------------
1 file changed, 92 insertions(+), 73 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 172654fee25..fb132fbb533 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -426,12 +426,11 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord *blk;
dsm_segment *seg;
+ Oid database;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -446,104 +445,124 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
+
+ i = apw_state->prewarm_start_idx;
+ blk = &block_info[i];
+
+ /*
+ * apw_state->database may differ from blk->database if we are prewarming
+ * blocks from global objects.
+ */
+ database = blk->database;
/*
* Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
+ * buffers. We'll quit if we've reached records for another database,
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (i < apw_state->prewarm_stop_idx &&
+ blk->database == database &&
+ have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ RelFileNumber filenumber = blk->filenumber;
+ Oid reloid;
+ Relation rel;
CHECK_FOR_INTERRUPTS();
- /* Quit if we've reached records for another database. */
- if (old_blk != NULL && old_blk->database != blk->database)
- break;
+ StartTransactionCommand();
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
- */
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
+ if (!OidIsValid(reloid) ||
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL)
{
- relation_close(rel, AccessShareLock);
- rel = NULL;
+ /* We failed to open the relation, so there is nothing to close. */
CommitTransactionCommand();
- }
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
- {
- Oid reloid;
-
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ /*
+ * Fast-forward to the next relation. We want to skip all of the
+ * other records referencing this relation since we know we can't
+ * open it. That way, we avoid repeatedly trying and failing to
+ * open the same relation.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = &block_info[i];
+ if (blk->database != database ||
+ blk->filenumber != filenumber)
+ break;
+ }
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ /* Time to try and open our new found relation */
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ /*
+ * We have a relation; now let's loop until we find a valid fork of
+ * the relation or we run out of free buffers. Once we've read from
+ * all valid forks or run out of options, we'll close the relation and
+ * move on.
+ */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk->database == database &&
+ blk->filenumber == filenumber &&
+ have_free_buffer())
{
+ ForkNumber forknum = blk->forknum;
+ BlockNumber nblocks;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk->forknum <= InvalidForkNumber ||
+ blk->forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
+ /*
+ * Fast-forward to the next fork. We want to skip all of the
+ * other records referencing this fork since we already know
+ * it's not valid.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = &block_info[i];
+ if (blk->database != database ||
+ blk->filenumber != filenumber ||
+ blk->forknum != forknum)
+ break;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ continue;
+ }
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- old_blk = blk;
- }
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk->blocknum >= nblocks)
+ {
+ blk = &block_info[++i];
+ continue;
+ }
- dsm_detach(seg);
+ /* Prewarm buffer. */
+ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
+ NULL);
+
+ if (BufferIsValid(buf))
+ {
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+
+ blk = &block_info[++i];
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
Hi,
On Wed, 2 Apr 2025 at 18:54, Melanie Plageman <melanieplageman@gmail.com> wrote:
On Wed, Apr 2, 2025 at 6:26 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
On Wed, 2 Apr 2025 at 01:36, Melanie Plageman <melanieplageman@gmail.com> wrote:
I know you mentioned off-list that you don't like the handling of
global objects in my version, but I prefer doing it this way (even
though we have to check for it in the loop condition) to having to set
the current database once we reach non-shared objects. It feels too
fiddly. This way seems less error prone. Looking at this version, what
do you think? Could we do it better?
I think there might be a problem with that approach. Let's say that we
are able to open relation when database oid = 0 and filenumber = 18.
Then we are trying to find a valid fork now. We couldn't find a valid
fork immediately, so we continued looping. Then database oid is
changed from 0 to let's say 1 but filenumber remains the same. We are
still in the valid fork loop, so relation remains from the database
oid = 0. Isn't that wrong?
Yep, you are totally right. The code was wrong. We could fix it by
setting current_db to the valid database once we've prewarmed the
global objects, but we need that logic in three places, so that seems
quite undesirable.
In attached v9, I've added a patch to apw_load_buffers() which invokes
autoprewarm_database_main() for the global objects alone but while
connected to the first valid database. It's not the best solution but
I think it is better than having that fiddly logic everywhere about
database 0.
I liked this. I think it is better compared to handling global objects
in autoprewarm_database_main().
This made me think, I wonder if we could connect to template0 or
template1 to prewarm the global objects. Then we could also prewarm if
only global objects are present (that doesn't seem very important but
it would be a side effect). It might be more clear to connect to
template0/1 instead of the first valid database to prewarm global
objects. I don't know if there is some reason not to do this -- like
maybe bg workers aren't allowed or something?
I am not sure but I think the current implementation is good enough.
0001:
ReadBufferExtended() can be called in its own minimal loop, otherwise
we end up doing unnecessary checks for each ReadBufferExtended() call.
This is not a problem when the 0002 is applied.
Could you provide a snippet of example code? If we call
ReadBufferExtended() in a loop, the block won't be changing, so I
don't see how that will help.
I don't have an example code right now. But what I mean is we may call
ReadBufferExtended() in a loop for the blocks in the same fork. We
don't need to call smgrexists() and RelationGetNumberOfBlocksInFork()
for each block, we will call these for each fork not for each block.
However, like I said before, this is not important when the read
stream code is applied.
0002:
We don't skip blocks whose blocknum is more than nblocks_in_fork. We
can add that either to a stream callback like you did before or after
the read_stream_end. I prefer stream callback because of the reason
below [1].
Yep, I also thought we had to have that logic, but because we sort by
db,rel,fork,blkno, I think blocks with blocknumber >= nblocks_in_fork
will be last and so we just want to move on to the next fork.
I agree that they will be the last, but won't we end up creating a
read stream object for each block?
On another topic, what are the minimal places we need to call
have_free_buffers() (in this version)? I haven't even started looking
at the last patch you've been sending that is about checking the
freelist. I'll have to do that next.
I think its current places are good enough. We may add one after the
read_stream_end if we want to handle blk->blocknum >= nblocks_in_fork
after the read stream finishes. If we handle that in the stream
callback then no need to add have_free_buffers() [1].
As long as we have it in the callback, I don't think that we need
have_free_buffers() after read_stream_end() since it is in the while
loop condition which we will immediately execute after
read_stream_end().
This is okay for me.
I was also wondering about the other patch in your earlier set that
set stop_idx from get_number_of_free_buffers(). Could you tell me more
about that? What does it do and why is it needed with the read stream
but wasn't needed before?
In the read stream code, we use callbacks to create bigger I/Os. These
I/Os aren't processed until we reach io_combine_limit or hit a
non-sequential blocknum. In other words, when the have_free_buffer()
function returns false in the callback, there are still queued blocks
in the stream, although there are no free buffers in the buffer pool.
We can end up creating I/Os bigger than free buffers in the shared
buffers.
To mitigate that, we try to get the number of free buffers in
shared buffers and use that actual free buffer count to minimize the
problem above. That optimization has limits: for example, if other
processes fill shared buffers while the read stream is running, it
will not work well.
--
Regards,
Nazir Bilal Yavuz
Microsoft
On Wed, Apr 2, 2025 at 1:20 PM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
On Wed, 2 Apr 2025 at 18:54, Melanie Plageman <melanieplageman@gmail.com> wrote:
On Wed, Apr 2, 2025 at 6:26 AM Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:
I don't have an example code right now. But what I mean is we may call
ReadBufferExtended() in a loop for the blocks in the same fork. We
don't need to call smgrexists() and RelationGetNumberOfBlocksInFork()
for each block, we will call these for each fork not for each block.
However, like I said before, this is not important when the read
stream code is applied.
Ah, you are so right. That was totally messed up in the last version.
I've fixed it in attached v10. I think having it correct in the 0002
patch makes it easier to understand how the read stream callback is
replacing it.
We don't skip blocks whose blocknum is more than nblocks_in_fork. We
can add that either to a stream callback like you did before or after
the read_stream_end. I prefer stream callback because of the reason
below [1].
Yep, I also thought we had to have that logic, but because we sort by
db,rel,fork,blkno, I think blocks with blocknumber >= nblocks_in_fork
will be last and so we just want to move on to the next fork.
I agree that they will be the last, but won't we end up creating a
read stream object for each block?
Ah, yes, you are right. That would have been really broken. I think I
fixed it. See attached. Now we'll only do that for the first block if
it is invalid (which is probably okay IMO).
I was also wondering about the other patch in your earlier set that
set stop_idx from get_number_of_free_buffers(). Could you tell me more
about that? What does it do and why is it needed with the read stream
but wasn't needed before?
In the read stream code, we use callbacks to create bigger I/Os. These
I/Os aren't processed until we reach io_combine_limit or hit a
non-sequential blocknum. In other words, when the have_free_buffer()
function returns false in the callback, there are still queued blocks
in the stream, although there are no free buffers in the buffer pool.
We can end up creating I/Os bigger than free buffers in the shared
buffers.
To solve that a bit, we try to get a number of free buffers in the
shared buffers. So, we try to minimize the problem above by using the
actual free buffer count. That optimization has problems like if other
processes fill shared buffers at the same time while the read stream
is running, then this optimization will not work well.
Hmm. Yea, I do find it confusing that it will get so easily out of
date. Let's circle back to this after getting the other patches to a
good place (but before committing all of this).
- Melanie
Attachments:
v10-0002-Refactor-autoprewarm_database_main-in-preparatio.patchtext/x-patch; charset=US-ASCII; name=v10-0002-Refactor-autoprewarm_database_main-in-preparatio.patchDownload
From 96f754d1c65d95cefaf3568d1f1ea0f8f0a0b69c Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH v10 2/3] Refactor autoprewarm_database_main() in preparation
for read stream
Autoprewarm prewarms blocks from a dump file representing the contents
of shared buffers last time it was dumped. It uses a sorted array of
BlockInfoRecords, each representing a block from one of the cluster's
databases and tables.
autoprewarm_database_main() prewarms all the blocks from a single
database. It is optimized to ensure we don't try to open the same
relation or fork over and over again if it has been dropped or is
invalid. The main loop handled this by carefully setting various local
variables to sentinel values when a run of blocks should be skipped.
This method won't work with the read stream API. A read stream can only
be created for a single relation and fork combination. The callback has
to be able to advance the position in the array to allow for reading
ahead additional blocks, however the callback cannot try to open another
relation or close the current relation. So, the main loop in
autoprewarm_database_main() must also advance the position in the array
of BlockInfoRecords.
To make it compatible with the read stream API, change
autoprewarm_database_main() to explicitly fast-forward in the array past
the blocks belonging to an invalid relation or fork.
This commit only implements the new control flow -- it does not use the
read stream API.
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 174 ++++++++++++++++++-------------
1 file changed, 101 insertions(+), 73 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 172654fee25..0722d7bf457 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -426,12 +426,11 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord *blk;
dsm_segment *seg;
+ Oid database;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -446,104 +445,133 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
+
+ i = apw_state->prewarm_start_idx;
+ blk = &block_info[i];
+
+ /*
+ * apw_state->database may differ from blk->database if we are prewarming
+ * blocks from global objects.
+ */
+ database = blk->database;
/*
* Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
+ * buffers. We'll quit if we've reached records for another database,
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (i < apw_state->prewarm_stop_idx &&
+ blk->database == database &&
+ have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ RelFileNumber filenumber = blk->filenumber;
+ Oid reloid;
+ Relation rel;
CHECK_FOR_INTERRUPTS();
- /* Quit if we've reached records for another database. */
- if (old_blk != NULL && old_blk->database != blk->database)
- break;
+ StartTransactionCommand();
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
- */
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
+ if (!OidIsValid(reloid) ||
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL)
{
- relation_close(rel, AccessShareLock);
- rel = NULL;
+ /* We failed to open the relation, so there is nothing to close. */
CommitTransactionCommand();
- }
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
- {
- Oid reloid;
-
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ /*
+ * Fast-forward to the next relation. We want to skip all of the
+ * other records referencing this relation since we know we can't
+ * open it. That way, we avoid repeatedly trying and failing to
+ * open the same relation.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = &block_info[i];
+ if (blk->database != database ||
+ blk->filenumber != filenumber)
+ break;
+ }
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ /* Time to try and open our new found relation */
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ /*
+ * We have a relation; now let's loop until we find a valid fork of
+ * the relation or we run out of free buffers. Once we've read from
+ * all valid forks or run out of options, we'll close the relation and
+ * move on.
+ */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk->database == database &&
+ blk->filenumber == filenumber &&
+ have_free_buffer())
{
+ ForkNumber forknum = blk->forknum;
+ BlockNumber nblocks;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk->forknum <= InvalidForkNumber ||
+ blk->forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk->forknum))
+ {
+ /*
+ * Fast-forward to the next fork. We want to skip all of the
+ * other records referencing this fork since we already know
+ * it's not valid.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = &block_info[i];
+ if (blk->database != database ||
+ blk->filenumber != filenumber ||
+ blk->forknum != forknum)
+ break;
+ }
+
+ /* Time to check if this newfound fork is valid */
+ continue;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk->blocknum >= nblocks)
+ {
+ blk = &block_info[++i];
+ continue;
+ }
- old_blk = blk;
- }
+ /* Prewarm buffers. */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk->database == database &&
+ blk->filenumber == filenumber &&
+ blk->forknum == forknum &&
+ have_free_buffer())
+ {
+ CHECK_FOR_INTERRUPTS();
- dsm_detach(seg);
+ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
+ NULL);
+
+ blk = &block_info[++i];
+ if (!BufferIsValid(buf))
+ break;
+
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
v10-0001-Autoprewarm-global-objects-separately.patchtext/x-patch; charset=US-ASCII; name=v10-0001-Autoprewarm-global-objects-separately.patchDownload
From a945fd408a6c3e073394df5332790a6a0fcf5efe Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 2 Apr 2025 09:52:54 -0400
Subject: [PATCH v10 1/3] Autoprewarm global objects separately
Autoprewarm previously prewarmed global objects while prewarming blocks
from objects from the first valid database it encountered. This was
because you can't read in buffers without being connected to a database.
Prewarming global objects and objects from a single database in the
autoprewarm_database_main() function required a special case. Once we
convert autoprewarm to use a read stream, this special case will have to
be duplicated in multiple places.
Instead, modify apw_load_buffers() to prewarm the shared objects in one
invocation of autoprewarm_database_main() while connected to the first
valid database.
It is a bit fiddly but seems better than the alternative.
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/CAN55FZ0TBBmrJ2vtMQ9rEk-NTL2BWQzavVp%3DiRLOUskm%2BzvNNw%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 51 ++++++++++++++------------------
1 file changed, 22 insertions(+), 29 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..172654fee25 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -347,44 +347,41 @@ apw_load_buffers(void)
apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
apw_state->prewarmed_blocks = 0;
- /* Get the info position of the first block of the next database. */
+ /*
+ * Loop through the records and launch a database worker to process
+ * objects in each database. We'll stop at the boundary of each new
+ * database and prewarm those blocks before moving to the next.
+ */
while (apw_state->prewarm_start_idx < num_elements)
{
int j = apw_state->prewarm_start_idx;
Oid current_db = blkinfo[j].database;
/*
- * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
- * not belong to this database.
+ * Advance the position to the first BlockInfoRecord that does not
+ * belong to the current database.
*/
- j++;
- while (j < num_elements)
+ for (; j < num_elements; j++)
{
- if (current_db != blkinfo[j].database)
- {
- /*
- * Combine BlockInfoRecords for global objects with those of
- * the database.
- */
- if (current_db != InvalidOid)
- break;
- current_db = blkinfo[j].database;
- }
-
- j++;
+ if (blkinfo[j].database != current_db)
+ break;
}
/*
- * If we reach this point with current_db == InvalidOid, then only
- * BlockInfoRecords belonging to global objects exist. We can't
- * prewarm without a database connection, so just bail out.
+ * We can't prewarm without a database connection, so if all of the
+ * records belong to global objects, we have to bail out.
*/
- if (current_db == InvalidOid)
+ if (current_db == InvalidOid && blkinfo[j].database == InvalidOid)
break;
+ /* Connect to the first valid db to prewarm global objects. */
+ if (current_db == InvalidOid)
+ current_db = blkinfo[j].database;
+
/* Configure stop point and database for next per-database worker. */
apw_state->prewarm_stop_idx = j;
apw_state->database = current_db;
+
Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
/* If we've run out of free buffers, don't launch another worker. */
@@ -423,8 +420,8 @@ apw_load_buffers(void)
}
/*
- * Prewarm all blocks for one database (and possibly also global objects, if
- * those got grouped with this database).
+ * Prewarm all blocks for one database or global objects (while connected to a
+ * valid database).
*/
void
autoprewarm_database_main(Datum main_arg)
@@ -462,12 +459,8 @@ autoprewarm_database_main(Datum main_arg)
CHECK_FOR_INTERRUPTS();
- /*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
- */
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
+ /* Quit if we've reached records for another database. */
+ if (old_blk != NULL && old_blk->database != blk->database)
break;
/*
--
2.34.1
v10-0003-Use-streaming-read-I-O-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v10-0003-Use-streaming-read-I-O-in-autoprewarm.patchDownload
From 3221de7b11f0b1df281b5c9076a639f0731d83bf Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 1 Apr 2025 18:07:38 -0400
Subject: [PATCH v10 3/3] Use streaming read I/O in autoprewarm
Make a read stream for each valid fork of each valid relation
represented in the autoprewarm dump file and prewarm those blocks
through the read stream API instead of by directly invoking
ReadBufferExtended().
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 125 +++++++++++++++++++++++++------
src/tools/pgindent/typedefs.list | 1 +
2 files changed, 104 insertions(+), 22 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 0722d7bf457..5d21314120f 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -75,6 +76,28 @@ typedef struct AutoPrewarmSharedState
int prewarmed_blocks;
} AutoPrewarmSharedState;
+/*
+ * Private data passed through the read stream API for our use in the
+ * callback.
+ */
+typedef struct AutoPrewarmReadStreamData
+{
+ /* The array of records containing the blocks we should prewarm. */
+ BlockInfoRecord *block_info;
+
+ /*
+ * `pos` is the read stream callback's index into block_info. Because the
+ * read stream may read ahead, pos is likely to be ahead of the index in
+ * the main loop in autoprewarm_database_main().
+ */
+ int pos;
+ Oid database;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+} AutoPrewarmReadStreamData;
+
+
PGDLLEXPORT void autoprewarm_main(Datum main_arg);
PGDLLEXPORT void autoprewarm_database_main(Datum main_arg);
@@ -419,6 +442,55 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+/*
+ * Return the next block number of a specific relation and fork to read
+ * according to the array of BlockInfoRecord.
+ */
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ AutoPrewarmReadStreamData *p = callback_private_data;
+
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ p->pos = apw_state->prewarm_stop_idx;
+
+ if (p->pos >= apw_state->prewarm_stop_idx)
+ return InvalidBlockNumber;
+
+ if (blk.database != p->database)
+ return InvalidBlockNumber;
+
+ if (blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ p->pos++;
+
+ /*
+ * Check whether blocknum is valid and within fork file size.
+ * Fast-forward through any invalid blocks. We want `p->pos` to
+ * reflect the location of the next relation or fork before ending the
+ * stream.
+ */
+ if (blk.blocknum >= p->nblocks)
+ continue;
+
+ return blk.blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database or global objects (while connected to a
* valid database).
@@ -467,8 +539,6 @@ autoprewarm_database_main(Datum main_arg)
Oid reloid;
Relation rel;
- CHECK_FOR_INTERRUPTS();
-
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
@@ -509,6 +579,8 @@ autoprewarm_database_main(Datum main_arg)
{
ForkNumber forknum = blk->forknum;
BlockNumber nblocks;
+ struct AutoPrewarmReadStreamData p;
+ ReadStream *stream;
Buffer buf;
/*
@@ -539,32 +611,41 @@ autoprewarm_database_main(Datum main_arg)
nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
+ p = (struct AutoPrewarmReadStreamData)
{
- blk = &block_info[++i];
- continue;
- }
+ .block_info = block_info,
+ .pos = i,
+ .database = database,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
- /* Prewarm buffers. */
- while (i < apw_state->prewarm_stop_idx &&
- blk->database == database &&
- blk->filenumber == filenumber &&
- blk->forknum == forknum &&
- have_free_buffer())
+ /*
+ * Loop until we've prewarmed all the blocks from this fork. The
+ * read stream callback will check that we still have free buffers
+ * before requesting each block from the read stream API.
+ */
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
{
- CHECK_FOR_INTERRUPTS();
-
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
-
- blk = &block_info[++i];
- if (!BufferIsValid(buf))
- break;
-
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+
+ /* Advance i past all the blocks just prewarmed. */
+ i = p.pos;
+ blk = &block_info[i];
}
relation_close(rel, AccessShareLock);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8f28d8ff28e..5ac290fae78 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -175,6 +175,7 @@ AttributeOpts
AuthRequest
AuthToken
AutoPrewarmSharedState
+AutoPrewarmReadStreamData
AutoVacOpts
AutoVacuumShmemStruct
AutoVacuumWorkItem
--
2.34.1
On Wed, Apr 2, 2025 at 3:34 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
attached v10
Attached v11 has an assortment of cosmetic updates.
- Melanie
Attachments:
v11-0002-Refactor-autoprewarm_database_main-in-preparatio.patchtext/x-patch; charset=US-ASCII; name=v11-0002-Refactor-autoprewarm_database_main-in-preparatio.patchDownload
From 3b1657a0ba51299fa0384a0032759f3661a2baca Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH v11 2/3] Refactor autoprewarm_database_main() in preparation
for read stream
Autoprewarm prewarms blocks from a dump file representing the contents
of shared buffers last time it was dumped. It uses a sorted array of
BlockInfoRecords, each representing a block from one of the cluster's
databases and tables.
autoprewarm_database_main() prewarms all the blocks from a single
database. It is optimized to ensure we don't try to open the same
relation or fork over and over again if it has been dropped or is
invalid. The main loop handled this by carefully setting various local
variables to sentinel values when a run of blocks should be skipped.
This method won't work with the read stream API. A read stream can only
be created for a single relation and fork combination. The callback has
to be able to advance the position in the array to allow for reading
ahead additional blocks; however, the callback cannot try to open another
relation or close the current relation. So, the main loop in
autoprewarm_database_main() must also advance the position in the array
of BlockInfoRecords.
To make it compatible with the read stream API, change
autoprewarm_database_main() to explicitly fast-forward in the array past
the blocks belonging to an invalid relation or fork.
This commit only implements the new control flow -- it does not use the
read stream API.
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 174 ++++++++++++++++++-------------
1 file changed, 101 insertions(+), 73 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 172654fee25..b02859d8ec3 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -426,12 +426,11 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord blk;
dsm_segment *seg;
+ Oid database;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -446,104 +445,133 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
+
+ i = apw_state->prewarm_start_idx;
+ blk = block_info[i];
+
+ /*
+ * apw_state->database may differ from blk.database if we are prewarming
+ * blocks from global objects.
+ */
+ database = blk.database;
/*
* Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
+ * buffers. We'll quit if we've reached records for another database.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.database == database &&
+ have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ RelFileNumber filenumber = blk.filenumber;
+ Oid reloid;
+ Relation rel;
CHECK_FOR_INTERRUPTS();
- /* Quit if we've reached records for another database. */
- if (old_blk != NULL && old_blk->database != blk->database)
- break;
+ StartTransactionCommand();
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
- */
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ reloid = RelidByRelfilenumber(blk.tablespace, blk.filenumber);
+ if (!OidIsValid(reloid) ||
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL)
{
- relation_close(rel, AccessShareLock);
- rel = NULL;
+ /* We failed to open the relation, so there is nothing to close. */
CommitTransactionCommand();
- }
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
- {
- Oid reloid;
-
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ /*
+ * Fast-forward to the next relation. We want to skip all of the
+ * other records referencing this relation since we know we can't
+ * open it. That way, we avoid repeatedly trying and failing to
+ * open the same relation.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if (blk.database != database ||
+ blk.filenumber != filenumber)
+ break;
+ }
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ /* Time to try and open our newfound relation */
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ /*
+ * We have a relation; now let's loop until we find a valid fork of
+ * the relation or we run out of free buffers. Once we've read from
+ * all valid forks or run out of options, we'll close the relation and
+ * move on.
+ */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.database == database &&
+ blk.filenumber == filenumber &&
+ have_free_buffer())
{
+ ForkNumber forknum = blk.forknum;
+ BlockNumber nblocks;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk.forknum <= InvalidForkNumber ||
+ blk.forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk.forknum))
+ {
+ /*
+ * Fast-forward to the next fork. We want to skip all of the
+ * other records referencing this fork since we already know
+ * it's not valid.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if (blk.database != database ||
+ blk.filenumber != filenumber ||
+ blk.forknum != forknum)
+ break;
+ }
+
+ /* Time to check if this newfound fork is valid */
+ continue;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk.blocknum >= nblocks)
+ {
+ blk = block_info[++i];
+ continue;
+ }
- old_blk = blk;
- }
+ /* Prewarm buffers. */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.database == database &&
+ blk.filenumber == filenumber &&
+ blk.forknum == forknum &&
+ have_free_buffer())
+ {
+ CHECK_FOR_INTERRUPTS();
- dsm_detach(seg);
+ buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
+ NULL);
+
+ blk = block_info[++i];
+ if (!BufferIsValid(buf))
+ break;
+
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
v11-0001-Autoprewarm-global-objects-separately.patchtext/x-patch; charset=US-ASCII; name=v11-0001-Autoprewarm-global-objects-separately.patchDownload
From dbfb2fe6b9f319efd951da93ea9c1f350bac8815 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 2 Apr 2025 09:52:54 -0400
Subject: [PATCH v11 1/3] Autoprewarm global objects separately
Autoprewarm previously prewarmed global objects while prewarming blocks
from objects from the first valid database it encountered. This was
because you can't read in buffers without being connected to a database.
Prewarming global objects and objects from a single database in the
autoprewarm_database_main() function required a special case. Once we
convert autoprewarm to use a read stream, this special case will have to
be duplicated in multiple places.
Instead, modify apw_load_buffers() to prewarm the shared objects in one
invocation of autoprewarm_database_main() while connected to the first
valid database.
It is a bit fiddly but seems better than the alternative.
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/CAN55FZ0TBBmrJ2vtMQ9rEk-NTL2BWQzavVp%3DiRLOUskm%2BzvNNw%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 51 ++++++++++++++------------------
1 file changed, 22 insertions(+), 29 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..172654fee25 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -347,44 +347,41 @@ apw_load_buffers(void)
apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
apw_state->prewarmed_blocks = 0;
- /* Get the info position of the first block of the next database. */
+ /*
+ * Loop through the records and launch a database worker to process
+ * objects in each database. We'll stop at the boundary of each new
+ * database and prewarm those blocks before moving to the next.
+ */
while (apw_state->prewarm_start_idx < num_elements)
{
int j = apw_state->prewarm_start_idx;
Oid current_db = blkinfo[j].database;
/*
- * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
- * not belong to this database.
+ * Advance the position to the first BlockInfoRecord that does not
+ * belong to the current database.
*/
- j++;
- while (j < num_elements)
+ for (; j < num_elements; j++)
{
- if (current_db != blkinfo[j].database)
- {
- /*
- * Combine BlockInfoRecords for global objects with those of
- * the database.
- */
- if (current_db != InvalidOid)
- break;
- current_db = blkinfo[j].database;
- }
-
- j++;
+ if (blkinfo[j].database != current_db)
+ break;
}
/*
- * If we reach this point with current_db == InvalidOid, then only
- * BlockInfoRecords belonging to global objects exist. We can't
- * prewarm without a database connection, so just bail out.
+ * We can't prewarm without a database connection, so if all of the
+ * records belong to global objects, we have to bail out.
*/
- if (current_db == InvalidOid)
+ if (current_db == InvalidOid && blkinfo[j].database == InvalidOid)
break;
+ /* Connect to the first valid db to prewarm global objects. */
+ if (current_db == InvalidOid)
+ current_db = blkinfo[j].database;
+
/* Configure stop point and database for next per-database worker. */
apw_state->prewarm_stop_idx = j;
apw_state->database = current_db;
+
Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
/* If we've run out of free buffers, don't launch another worker. */
@@ -423,8 +420,8 @@ apw_load_buffers(void)
}
/*
- * Prewarm all blocks for one database (and possibly also global objects, if
- * those got grouped with this database).
+ * Prewarm all blocks for one database or global objects (while connected to a
+ * valid database).
*/
void
autoprewarm_database_main(Datum main_arg)
@@ -462,12 +459,8 @@ autoprewarm_database_main(Datum main_arg)
CHECK_FOR_INTERRUPTS();
- /*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
- */
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
+ /* Quit if we've reached records for another database. */
+ if (old_blk != NULL && old_blk->database != blk->database)
break;
/*
--
2.34.1
v11-0003-Use-streaming-read-I-O-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v11-0003-Use-streaming-read-I-O-in-autoprewarm.patchDownload
From bb812c2d92fe120171d20a284ce21546c5836f29 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 1 Apr 2025 18:07:38 -0400
Subject: [PATCH v11 3/3] Use streaming read I/O in autoprewarm
Make a read stream for each valid fork of each valid relation
represented in the autoprewarm dump file and prewarm those blocks
through the read stream API instead of by directly invoking
ReadBufferExtended().
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 125 +++++++++++++++++++++++++------
src/tools/pgindent/typedefs.list | 1 +
2 files changed, 104 insertions(+), 22 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index b02859d8ec3..4b35a19eec1 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -75,6 +76,28 @@ typedef struct AutoPrewarmSharedState
int prewarmed_blocks;
} AutoPrewarmSharedState;
+/*
+ * Private data passed through the read stream API for our use in the
+ * callback.
+ */
+typedef struct AutoPrewarmReadStreamData
+{
+ /* The array of records containing the blocks we should prewarm. */
+ BlockInfoRecord *block_info;
+
+ /*
+ * `pos` is the read stream callback's index into block_info. Because the
+ * read stream may read ahead, pos is likely to be ahead of the index in
+ * the main loop in autoprewarm_database_main().
+ */
+ int pos;
+ Oid database;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+} AutoPrewarmReadStreamData;
+
+
PGDLLEXPORT void autoprewarm_main(Datum main_arg);
PGDLLEXPORT void autoprewarm_database_main(Datum main_arg);
@@ -419,6 +442,55 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+/*
+ * Return the next block number of a specific relation and fork to read
+ * according to the array of BlockInfoRecord.
+ */
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ AutoPrewarmReadStreamData *p = callback_private_data;
+
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ p->pos = apw_state->prewarm_stop_idx;
+
+ if (p->pos >= apw_state->prewarm_stop_idx)
+ return InvalidBlockNumber;
+
+ if (blk.database != p->database)
+ return InvalidBlockNumber;
+
+ if (blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ p->pos++;
+
+ /*
+ * Check whether blocknum is valid and within fork file size.
+ * Fast-forward through any invalid blocks. We want `p->pos` to
+ * reflect the location of the next relation or fork before ending the
+ * stream.
+ */
+ if (blk.blocknum >= p->nblocks)
+ continue;
+
+ return blk.blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database or global objects (while connected to a
* valid database).
@@ -467,8 +539,6 @@ autoprewarm_database_main(Datum main_arg)
Oid reloid;
Relation rel;
- CHECK_FOR_INTERRUPTS();
-
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk.tablespace, blk.filenumber);
@@ -509,6 +579,8 @@ autoprewarm_database_main(Datum main_arg)
{
ForkNumber forknum = blk.forknum;
BlockNumber nblocks;
+ struct AutoPrewarmReadStreamData p;
+ ReadStream *stream;
Buffer buf;
/*
@@ -539,32 +611,41 @@ autoprewarm_database_main(Datum main_arg)
nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Check whether blocknum is valid and within fork file size. */
- if (blk.blocknum >= nblocks)
+ p = (struct AutoPrewarmReadStreamData)
{
- blk = block_info[++i];
- continue;
- }
+ .block_info = block_info,
+ .pos = i,
+ .database = database,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
- /* Prewarm buffers. */
- while (i < apw_state->prewarm_stop_idx &&
- blk.database == database &&
- blk.filenumber == filenumber &&
- blk.forknum == forknum &&
- have_free_buffer())
+ /*
+ * Loop until we've prewarmed all the blocks from this fork. The
+ * read stream callback will check that we still have free buffers
+ * before requesting each block from the read stream API.
+ */
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
{
- CHECK_FOR_INTERRUPTS();
-
- buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
- NULL);
-
- blk = block_info[++i];
- if (!BufferIsValid(buf))
- break;
-
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+
+ /* Advance i past all the blocks just prewarmed. */
+ i = p.pos;
+ blk = block_info[i];
}
relation_close(rel, AccessShareLock);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8f28d8ff28e..5ac290fae78 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -175,6 +175,7 @@ AttributeOpts
AuthRequest
AuthToken
AutoPrewarmSharedState
+AutoPrewarmReadStreamData
AutoVacOpts
AutoVacuumShmemStruct
AutoVacuumWorkItem
--
2.34.1
On Wed, Apr 2, 2025 at 8:25 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
On Wed, Apr 2, 2025 at 3:34 PM Melanie Plageman
<melanieplageman@gmail.com> wrote:
attached v10
Attached v11 has an assortment of cosmetic updates.
Attached v12 fixes a bug Bilal found off-list in 0002 related to
handling invalid blocks.
- Melanie
Attachments:
v12-0003-Use-streaming-read-I-O-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v12-0003-Use-streaming-read-I-O-in-autoprewarm.patchDownload
From 7fde68340b0a0b80e4e6d2de5c1771a4cd5c7bfe Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 1 Apr 2025 18:07:38 -0400
Subject: [PATCH v12 3/3] Use streaming read I/O in autoprewarm
Make a read stream for each valid fork of each valid relation
represented in the autoprewarm dump file and prewarm those blocks
through the read stream API instead of by directly invoking
ReadBufferExtended().
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 127 +++++++++++++++++++++++++------
src/tools/pgindent/typedefs.list | 1 +
2 files changed, 105 insertions(+), 23 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index fe79c8c71bf..4b35a19eec1 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -75,6 +76,28 @@ typedef struct AutoPrewarmSharedState
int prewarmed_blocks;
} AutoPrewarmSharedState;
+/*
+ * Private data passed through the read stream API for our use in the
+ * callback.
+ */
+typedef struct AutoPrewarmReadStreamData
+{
+ /* The array of records containing the blocks we should prewarm. */
+ BlockInfoRecord *block_info;
+
+ /*
+ * `pos` is the read stream callback's index into block_info. Because the
+ * read stream may read ahead, pos is likely to be ahead of the index in
+ * the main loop in autoprewarm_database_main().
+ */
+ int pos;
+ Oid database;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+} AutoPrewarmReadStreamData;
+
+
PGDLLEXPORT void autoprewarm_main(Datum main_arg);
PGDLLEXPORT void autoprewarm_database_main(Datum main_arg);
@@ -419,6 +442,55 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+/*
+ * Return the next block number of a specific relation and fork to read
+ * according to the array of BlockInfoRecord.
+ */
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ AutoPrewarmReadStreamData *p = callback_private_data;
+
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ p->pos = apw_state->prewarm_stop_idx;
+
+ if (p->pos >= apw_state->prewarm_stop_idx)
+ return InvalidBlockNumber;
+
+ if (blk.database != p->database)
+ return InvalidBlockNumber;
+
+ if (blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ p->pos++;
+
+ /*
+ * Check whether blocknum is valid and within fork file size.
+ * Fast-forward through any invalid blocks. We want `p->pos` to
+ * reflect the location of the next relation or fork before ending the
+ * stream.
+ */
+ if (blk.blocknum >= p->nblocks)
+ continue;
+
+ return blk.blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database or global objects (while connected to a
* valid database).
@@ -467,8 +539,6 @@ autoprewarm_database_main(Datum main_arg)
Oid reloid;
Relation rel;
- CHECK_FOR_INTERRUPTS();
-
StartTransactionCommand();
reloid = RelidByRelfilenumber(blk.tablespace, blk.filenumber);
@@ -509,6 +579,8 @@ autoprewarm_database_main(Datum main_arg)
{
ForkNumber forknum = blk.forknum;
BlockNumber nblocks;
+ struct AutoPrewarmReadStreamData p;
+ ReadStream *stream;
Buffer buf;
/*
@@ -539,32 +611,41 @@ autoprewarm_database_main(Datum main_arg)
nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Prewarm buffers. */
- while (i < apw_state->prewarm_stop_idx &&
- blk.database == database &&
- blk.filenumber == filenumber &&
- blk.forknum == forknum &&
- have_free_buffer())
+ p = (struct AutoPrewarmReadStreamData)
{
- CHECK_FOR_INTERRUPTS();
-
- /* Check whether blocknum is valid and within fork file size. */
- if (blk.blocknum >= nblocks)
- {
- blk = block_info[++i];
- continue;
- }
-
- buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
- NULL);
-
- blk = block_info[++i];
- if (!BufferIsValid(buf))
- break;
+ .block_info = block_info,
+ .pos = i,
+ .database = database,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
+ /*
+ * Loop until we've prewarmed all the blocks from this fork. The
+ * read stream callback will check that we still have free buffers
+ * before requesting each block from the read stream API.
+ */
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
+ {
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+
+ /* Advance i past all the blocks just prewarmed. */
+ i = p.pos;
+ blk = block_info[i];
}
relation_close(rel, AccessShareLock);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8f28d8ff28e..5ac290fae78 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -175,6 +175,7 @@ AttributeOpts
AuthRequest
AuthToken
AutoPrewarmSharedState
+AutoPrewarmReadStreamData
AutoVacOpts
AutoVacuumShmemStruct
AutoVacuumWorkItem
--
2.34.1
v12-0001-Autoprewarm-global-objects-separately.patch (text/x-patch; charset=US-ASCII; name=v12-0001-Autoprewarm-global-objects-separately.patch) [Download]
From dbfb2fe6b9f319efd951da93ea9c1f350bac8815 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 2 Apr 2025 09:52:54 -0400
Subject: [PATCH v12 1/3] Autoprewarm global objects separately
Autoprewarm previously prewarmed global objects while prewarming blocks
from objects from the first valid database it encountered. This was
because you can't read in buffers without being connected to a database.
Prewarming global objects and objects from a single database in the
autoprewarm_database_main() function required a special case. Once we
convert autoprewarm to use a read stream, this special case will have to
be duplicated in multiple places.
Instead, modify apw_load_buffers() to prewarm the shared objects in one
invocation of autoprewarm_database_main() while connected to the first
valid database.
It is a bit fiddly but seems better than the alternative.
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/CAN55FZ0TBBmrJ2vtMQ9rEk-NTL2BWQzavVp%3DiRLOUskm%2BzvNNw%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 51 ++++++++++++++------------------
1 file changed, 22 insertions(+), 29 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..172654fee25 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -347,44 +347,41 @@ apw_load_buffers(void)
apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
apw_state->prewarmed_blocks = 0;
- /* Get the info position of the first block of the next database. */
+ /*
+ * Loop through the records and launch a database worker to process
+ * objects in each database. We'll stop at the boundary of each new
+ * database and prewarm those blocks before moving to the next.
+ */
while (apw_state->prewarm_start_idx < num_elements)
{
int j = apw_state->prewarm_start_idx;
Oid current_db = blkinfo[j].database;
/*
- * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
- * not belong to this database.
+ * Advance the position to the first BlockInfoRecord that does not
+ * belong to the current database.
*/
- j++;
- while (j < num_elements)
+ for (; j < num_elements; j++)
{
- if (current_db != blkinfo[j].database)
- {
- /*
- * Combine BlockInfoRecords for global objects with those of
- * the database.
- */
- if (current_db != InvalidOid)
- break;
- current_db = blkinfo[j].database;
- }
-
- j++;
+ if (blkinfo[j].database != current_db)
+ break;
}
/*
- * If we reach this point with current_db == InvalidOid, then only
- * BlockInfoRecords belonging to global objects exist. We can't
- * prewarm without a database connection, so just bail out.
+ * We can't prewarm without a database connection, so if all of the
+ * records belong to global objects, we have to bail out.
*/
- if (current_db == InvalidOid)
+ if (current_db == InvalidOid && blkinfo[j].database == InvalidOid)
break;
+ /* Connect to the first valid db to prewarm global objects. */
+ if (current_db == InvalidOid)
+ current_db = blkinfo[j].database;
+
/* Configure stop point and database for next per-database worker. */
apw_state->prewarm_stop_idx = j;
apw_state->database = current_db;
+
Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
/* If we've run out of free buffers, don't launch another worker. */
@@ -423,8 +420,8 @@ apw_load_buffers(void)
}
/*
- * Prewarm all blocks for one database (and possibly also global objects, if
- * those got grouped with this database).
+ * Prewarm all blocks for one database or global objects (while connected to a
+ * valid database).
*/
void
autoprewarm_database_main(Datum main_arg)
@@ -462,12 +459,8 @@ autoprewarm_database_main(Datum main_arg)
CHECK_FOR_INTERRUPTS();
- /*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
- */
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
+ /* Quit if we've reached records for another database. */
+ if (old_blk != NULL && old_blk->database != blk->database)
break;
/*
--
2.34.1
v12-0002-Refactor-autoprewarm_database_main-in-preparatio.patch (text/x-patch; charset=US-ASCII; name=v12-0002-Refactor-autoprewarm_database_main-in-preparatio.patch) [Download]
From 322e4eb8f54d3f62bdac9a5ca5a6473aa4a53531 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH v12 2/3] Refactor autoprewarm_database_main() in preparation
for read stream
Autoprewarm prewarms blocks from a dump file representing the contents
of shared buffers last time it was dumped. It uses a sorted array of
BlockInfoRecords, each representing a block from one of the cluster's
databases and tables.
autoprewarm_database_main() prewarms all the blocks from a single
database. It is optimized to ensure we don't try to open the same
relation or fork over and over again if it has been dropped or is
invalid. The main loop handled this by carefully setting various local
variables to sentinel values when a run of blocks should be skipped.
This method won't work with the read stream API. A read stream can only
be created for a single relation and fork combination. The callback has
to be able to advance the position in the array to allow for reading
ahead additional blocks, however the callback cannot try to open another
relation or close the current relation. So, the main loop in
autoprewarm_database_main() must also advance the position in the array
of BlockInfoRecords.
To make it compatible with the read stream API, change
autoprewarm_database_main() to explicitly fast-forward in the array past
the blocks belonging to an invalid relation or fork.
This commit only implements the new control flow -- it does not use the
read stream API.
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 174 ++++++++++++++++++-------------
1 file changed, 101 insertions(+), 73 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 172654fee25..fe79c8c71bf 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -426,12 +426,11 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord blk;
dsm_segment *seg;
+ Oid database;
/* Establish signal handlers; once that's done, unblock signals. */
pqsignal(SIGTERM, die);
@@ -446,104 +445,133 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
+
+ i = apw_state->prewarm_start_idx;
+ blk = block_info[i];
+
+ /*
+ * apw_state->database may differ from blk.database if we are prewarming
+ * blocks from global objects.
+ */
+ database = blk.database;
/*
* Loop until we run out of blocks to prewarm or until we run out of free
- * buffers.
+ * buffers. We'll quit if we've reached records for another database,
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.database == database &&
+ have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ RelFileNumber filenumber = blk.filenumber;
+ Oid reloid;
+ Relation rel;
CHECK_FOR_INTERRUPTS();
- /* Quit if we've reached records for another database. */
- if (old_blk != NULL && old_blk->database != blk->database)
- break;
+ StartTransactionCommand();
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
- */
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
- rel != NULL)
+ reloid = RelidByRelfilenumber(blk.tablespace, blk.filenumber);
+ if (!OidIsValid(reloid) ||
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL)
{
- relation_close(rel, AccessShareLock);
- rel = NULL;
+ /* We failed to open the relation, so there is nothing to close. */
CommitTransactionCommand();
- }
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
- {
- Oid reloid;
-
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ /*
+ * Fast-forward to the next relation. We want to skip all of the
+ * other records referencing this relation since we know we can't
+ * open it. That way, we avoid repeatedly trying and failing to
+ * open the same relation.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if (blk.database != database ||
+ blk.filenumber != filenumber)
+ break;
+ }
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ /* Time to try and open our new found relation */
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ /*
+ * We have a relation; now let's loop until we find a valid fork of
+ * the relation or we run out of free buffers. Once we've read from
+ * all valid forks or run out of options, we'll close the relation and
+ * move on.
+ */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.database == database &&
+ blk.filenumber == filenumber &&
+ have_free_buffer())
{
+ ForkNumber forknum = blk.forknum;
+ BlockNumber nblocks;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk.forknum <= InvalidForkNumber ||
+ blk.forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk.forknum))
+ {
+ /*
+ * Fast-forward to the next fork. We want to skip all of the
+ * other records referencing this fork since we already know
+ * it's not valid.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if (blk.database != database ||
+ blk.filenumber != filenumber ||
+ blk.forknum != forknum)
+ break;
+ }
+
+ /* Time to check if this newfound fork is valid */
+ continue;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ /* Prewarm buffers. */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.database == database &&
+ blk.filenumber == filenumber &&
+ blk.forknum == forknum &&
+ have_free_buffer())
+ {
+ CHECK_FOR_INTERRUPTS();
- old_blk = blk;
- }
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk.blocknum >= nblocks)
+ {
+ blk = block_info[++i];
+ continue;
+ }
- dsm_detach(seg);
+ buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
+ NULL);
+
+ blk = block_info[++i];
+ if (!BufferIsValid(buf))
+ break;
+
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
On 03/04/2025 17:31, Melanie Plageman wrote:
Attached v12 fixes a bug Bilal found off-list in 0002 related to
handling invalid blocks.
I had a quick look at this. Looks good overall, some small remarks:
v12-0001-Autoprewarm-global-objects-separately.patch
Instead, modify apw_load_buffers() to prewarm the shared objects in one
invocation of autoprewarm_database_main() while connected to the first
valid database.
So it effectively treats "global objects" as one extra database,
launching a separate worker process to handle global objects. It took me
a while to understand that. From the commit message, I understood that
it still does that within the first worker process invocation, but no. A
comment somewhere would be good.
One extra worker process invocation is obviously not an improvement
performance-wise, but seems acceptable.
v12-0002-Refactor-autoprewarm_database_main-in-preparatio.patch
Yes, I agree this makes the logic clearer.
v12-0003-Use-streaming-read-I-O-in-autoprewarm.patch
I wonder if the have_free_buffer() calls work correctly with read
streams? Or will you "overshoot", prewarming a few more pages after the
buffer cache is already full? I guess that depends on when exactly the
read stream code allocates the buffer.
While reviewing this, I noticed a pre-existing bug: The code ignores
'tablespace' when deciding if it's reached the end of the current
relation. I believe it's possible to have two different relations with
the same relnumber, in different tablespaces.
--
Heikki Linnakangas
Neon (https://neon.tech)
On Thu, Apr 3, 2025 at 11:17 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:
I had a quick look at this. Looks good overall, some small remarks:
Thanks for taking a look!
v12-0001-Autoprewarm-global-objects-separately.patch
Instead, modify apw_load_buffers() to prewarm the shared objects in one
invocation of autoprewarm_database_main() while connected to the first
valid database.
So it effectively treats "global objects" as one extra database,
launching a separate worker process to handle global objects. It took me
a while to understand that. From the commit message, I understood that
it still does that within the first worker process invocation, but no. A
comment somewhere would be good.
Yea, I could have been more explicit about that.
Actually, I was chatting about this with Andres off-list and he was
like, why do you need to check the database at all? Won't
prewarm_stop_idx already have that built in? And I think he's right.
In attached v13, I've added a separate patch (0002) which turns this
check into an assert. And I removed the check from all of the other
loops in the later patches.
v12-0003-Use-streaming-read-I-O-in-autoprewarm.patch
I wonder if the have_free_buffer() calls work correctly with read
streams? Or will you "overshoot", prewarming a few more pages after the
buffer cache is already full? I guess that depends on when exactly the
read stream code allocates the buffer.
It does have some overshoot -- but a max of io_combine_limit blocks
will be evicted. The read stream code builds up an IO of up to
io_combine_limit blocks before calling StartReadBuffer(). So you could
be in a situation where you weren't quite out of buffers on the
freelist while you are building up the IO and then when you go to pin
those buffers, there aren't enough on the freelist. But I think that's
okay.
While reviewing this, I noticed a pre-existing bug: The code ignores
'tablespace' when deciding if it's reached the end of the current
relation. I believe it's possible to have two different relations with
the same relnumber, in different tablespaces.
Good catch. I've included a fix for this in the attached set (0001)
- Melanie
Attachments:
v13-0001-Fix-autoprewarm-neglect-of-tablespaces.patch (text/x-patch; charset=US-ASCII; name=v13-0001-Fix-autoprewarm-neglect-of-tablespaces.patch) [Download]
From 9d5e4bf6a05d0a3f2838dd9efa8715b05be77423 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 3 Apr 2025 12:47:19 -0400
Subject: [PATCH v13 1/4] Fix autoprewarm neglect of tablespaces
While prewarming blocks from a dump file, autoprewarm_database_main()
mistakenly ignored tablespace when detecting the beginning of the next
relation to prewarm. Because RelFileNumbers are only unique within a
tablespace, autoprewarm could miss prewarming blocks from a
relation with the same RelFileNumber in a different tablespace.
Though this situation is likely rare in practice, it's best to make the
code correct. Do so by explicitly checking the tablespace, in addition to
the RelFileNumber, when detecting a new relation.
Reported-by: Heikki Linnakangas <hlinnaka@iki.fi>
Discussion: https://postgr.es/m/97c36982-603b-494a-95f4-aaf2a12ac27e%40iki.fi
---
contrib/pg_prewarm/autoprewarm.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..760b1548eff 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -472,10 +472,15 @@ autoprewarm_database_main(Datum main_arg)
/*
* As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
+ * relation. RelFileNumbers are only guaranteed to be unique within a
+ * tablespace, so check that too.
+ *
+ * Note that rel will be NULL if try_relation_open failed previously;
+ * in that case, there is nothing to close.
*/
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
+ if (old_blk != NULL &&
+ (old_blk->tablespace != blk->tablespace ||
+ old_blk->filenumber != blk->filenumber) &&
rel != NULL)
{
relation_close(rel, AccessShareLock);
@@ -487,7 +492,9 @@ autoprewarm_database_main(Datum main_arg)
* Try to open each new relation, but only once, when we first
* encounter it. If it's been dropped, skip the associated blocks.
*/
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
+ if (old_blk == NULL ||
+ old_blk->tablespace != blk->tablespace ||
+ old_blk->filenumber != blk->filenumber)
{
Oid reloid;
@@ -508,6 +515,7 @@ autoprewarm_database_main(Datum main_arg)
/* Once per fork, check for fork existence and size. */
if (old_blk == NULL ||
+ old_blk->tablespace != blk->tablespace ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
{
--
2.34.1
v13-0002-Remove-superfluous-autoprewarm-check.patch (text/x-patch; charset=US-ASCII; name=v13-0002-Remove-superfluous-autoprewarm-check.patch) [Download]
From 8879473d355dba5c9397c398240db2e1efb81a4c Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 3 Apr 2025 14:54:09 -0400
Subject: [PATCH v13 2/4] Remove superfluous autoprewarm check
autoprewarm_database_main() prewarms blocks from the same database. It
is passed an array of sorted BlockInfoRecords and a start and stop index
into the array. The range represented should include only blocks
belonging to global objects or blocks from a single database. Remove an
unnecessary check that the current block is from the same database and
add an assert to ensure this invariant remains.
---
contrib/pg_prewarm/autoprewarm.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 760b1548eff..5f6dca57cdd 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -463,12 +463,10 @@ autoprewarm_database_main(Datum main_arg)
CHECK_FOR_INTERRUPTS();
/*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
+ * All blocks between prewarm_start_idx and prewarm_stop_idx should
+ * belong either to global objects or the same database.
*/
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ Assert(blk->database == apw_state->database || blk->database == 0);
/*
* As soon as we encounter a block of a new relation, close the old
--
2.34.1
v13-0004-Use-streaming-read-I-O-in-autoprewarm.patch (text/x-patch; charset=US-ASCII; name=v13-0004-Use-streaming-read-I-O-in-autoprewarm.patch) [Download]
From 54c9d4db9e3c3bddcb02cb515a1dd79ba4a83cf2 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 1 Apr 2025 18:07:38 -0400
Subject: [PATCH v13 4/4] Use streaming read I/O in autoprewarm
Make a read stream for each valid fork of each valid relation
represented in the autoprewarm dump file and prewarm those blocks
through the read stream API instead of by directly invoking
ReadBuffer().
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 127 +++++++++++++++++++++++++------
src/tools/pgindent/typedefs.list | 1 +
2 files changed, 105 insertions(+), 23 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 0d59fd62e93..37014e648f4 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -75,6 +76,28 @@ typedef struct AutoPrewarmSharedState
int prewarmed_blocks;
} AutoPrewarmSharedState;
+/*
+ * Private data passed through the read stream API for our use in the
+ * callback.
+ */
+typedef struct AutoPrewarmReadStreamData
+{
+ /* The array of records containing the blocks we should prewarm. */
+ BlockInfoRecord *block_info;
+
+ /*
+ * `pos` is the read stream callback's index into block_info. Because the
+ * read stream may read ahead, pos is likely to be ahead of the index in
+ * the main loop in autoprewarm_database_main().
+ */
+ int pos;
+ Oid tablespace;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+} AutoPrewarmReadStreamData;
+
+
PGDLLEXPORT void autoprewarm_main(Datum main_arg);
PGDLLEXPORT void autoprewarm_database_main(Datum main_arg);
@@ -422,6 +445,55 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+/*
+ * Return the next block number of a specific relation and fork to read
+ * according to the array of BlockInfoRecord.
+ */
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ AutoPrewarmReadStreamData *p = callback_private_data;
+
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (!have_free_buffer())
+ {
+ p->pos = apw_state->prewarm_stop_idx;
+ return InvalidBlockNumber;
+ }
+
+ if (blk.tablespace != p->tablespace)
+ return InvalidBlockNumber;
+
+ if (blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ p->pos++;
+
+ /*
+ * Check whether blocknum is valid and within fork file size.
+ * Fast-forward through any invalid blocks. We want `p->pos` to
+ * reflect the location of the next relation or fork before ending the
+ * stream.
+ */
+ if (blk.blocknum >= p->nblocks)
+ continue;
+
+ return blk.blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -462,8 +534,6 @@ autoprewarm_database_main(Datum main_arg)
Oid reloid;
Relation rel;
- CHECK_FOR_INTERRUPTS();
-
/*
* All blocks between prewarm_start_idx and prewarm_stop_idx should
* belong either to global objects or the same database.
@@ -510,6 +580,8 @@ autoprewarm_database_main(Datum main_arg)
{
ForkNumber forknum = blk.forknum;
BlockNumber nblocks;
+ struct AutoPrewarmReadStreamData p;
+ ReadStream *stream;
Buffer buf;
/*
@@ -540,32 +612,41 @@ autoprewarm_database_main(Datum main_arg)
nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Prewarm buffers. */
- while (i < apw_state->prewarm_stop_idx &&
- blk.tablespace == tablespace &&
- blk.filenumber == filenumber &&
- blk.forknum == forknum &&
- have_free_buffer())
+ p = (struct AutoPrewarmReadStreamData)
{
- CHECK_FOR_INTERRUPTS();
-
- /* Check whether blocknum is valid and within fork file size. */
- if (blk.blocknum >= nblocks)
- {
- blk = block_info[++i];
- continue;
- }
-
- buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
- NULL);
-
- blk = block_info[++i];
- if (!BufferIsValid(buf))
- break;
+ .block_info = block_info,
+ .pos = i,
+ .tablespace = tablespace,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
+ /*
+ * Loop until we've prewarmed all the blocks from this fork. The
+ * read stream callback will check that we still have free buffers
+ * before requesting each block from the read stream API.
+ */
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
+ {
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+ read_stream_end(stream);
+
+ /* Advance i past all the blocks just prewarmed. */
+ i = p.pos;
+ blk = block_info[i];
}
relation_close(rel, AccessShareLock);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8f28d8ff28e..5ac290fae78 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -175,6 +175,7 @@ AttributeOpts
AuthRequest
AuthToken
AutoPrewarmSharedState
+AutoPrewarmReadStreamData
AutoVacOpts
AutoVacuumShmemStruct
AutoVacuumWorkItem
--
2.34.1
v13-0003-Refactor-autoprewarm_database_main-in-preparatio.patch (text/x-patch; charset=US-ASCII; name=v13-0003-Refactor-autoprewarm_database_main-in-preparatio.patch) [Download]
From a86d64437239b9453143c85a895c7f19b4ab3cb1 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH v13 3/4] Refactor autoprewarm_database_main() in preparation
for read stream
Autoprewarm prewarms blocks from a dump file representing the contents
of shared buffers last time it was dumped. It uses a sorted array of
BlockInfoRecords, each representing a block from one of the cluster's
databases and tables.
autoprewarm_database_main() prewarms all the blocks from a single
database. It is optimized to ensure we don't try to open the same
relation or fork over and over again if it has been dropped or is
invalid. The main loop handled this by carefully setting various local
variables to sentinel values when a run of blocks should be skipped.
This method won't work with the read stream API. A read stream can only
be created for a single relation and fork combination. The callback has
to be able to advance the position in the array to allow for reading
ahead additional blocks, however the callback cannot try to open another
relation or close the current relation. So, the main loop in
autoprewarm_database_main() must also advance the position in the array
of BlockInfoRecords.
To make it compatible with the read stream API, change
autoprewarm_database_main() to explicitly fast-forward in the array past
the blocks belonging to an invalid relation or fork.
This commit only implements the new control flow -- it does not use the
read stream API.
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 172 +++++++++++++++++--------------
1 file changed, 94 insertions(+), 78 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 5f6dca57cdd..0d59fd62e93 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -429,11 +429,9 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord blk;
dsm_segment *seg;
/* Establish signal handlers; once that's done, unblock signals. */
@@ -449,16 +447,20 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
+
+ i = apw_state->prewarm_start_idx;
+ blk = block_info[i];
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (i < apw_state->prewarm_stop_idx && have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ Oid tablespace = blk.tablespace;
+ RelFileNumber filenumber = blk.filenumber;
+ Oid reloid;
+ Relation rel;
CHECK_FOR_INTERRUPTS();
@@ -466,97 +468,111 @@ autoprewarm_database_main(Datum main_arg)
* All blocks between prewarm_start_idx and prewarm_stop_idx should
* belong either to global objects or the same database.
*/
- Assert(blk->database == apw_state->database || blk->database == 0);
+ Assert(blk.database == apw_state->database || blk.database == 0);
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. RelFileNumbers are only guaranteed to be unique within a
- * tablespace, so check that too.
- *
- * Note that rel will be NULL if try_relation_open failed previously;
- * in that case, there is nothing to close.
- */
- if (old_blk != NULL &&
- (old_blk->tablespace != blk->tablespace ||
- old_blk->filenumber != blk->filenumber) &&
- rel != NULL)
- {
- relation_close(rel, AccessShareLock);
- rel = NULL;
- CommitTransactionCommand();
- }
+ StartTransactionCommand();
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL ||
- old_blk->tablespace != blk->tablespace ||
- old_blk->filenumber != blk->filenumber)
+ reloid = RelidByRelfilenumber(blk.tablespace, blk.filenumber);
+ if (!OidIsValid(reloid) ||
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL)
{
- Oid reloid;
+ /* We failed to open the relation, so there is nothing to close. */
+ CommitTransactionCommand();
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ /*
+ * Fast-forward to the next relation. We want to skip all of the
+ * other records referencing this relation since we know we can't
+ * open it. That way, we avoid repeatedly trying and failing to
+ * open the same relation.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if ( blk.tablespace != tablespace ||
+ blk.filenumber != filenumber)
+ break;
+ }
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ /* Time to try and open our new found relation */
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->tablespace != blk->tablespace ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ /*
+ * We have a relation; now let's loop until we find a valid fork of
+ * the relation or we run out of free buffers. Once we've read from
+ * all valid forks or run out of options, we'll close the relation and
+ * move on.
+ */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.tablespace == tablespace &&
+ blk.filenumber == filenumber &&
+ have_free_buffer())
{
+ ForkNumber forknum = blk.forknum;
+ BlockNumber nblocks;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk.forknum <= InvalidForkNumber ||
+ blk.forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk.forknum))
+ {
+ /*
+ * Fast-forward to the next fork. We want to skip all of the
+ * other records referencing this fork since we already know
+ * it's not valid.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if (blk.tablespace != tablespace ||
+ blk.filenumber != filenumber ||
+ blk.forknum != forknum)
+ break;
+ }
+
+ /* Time to check if this newfound fork is valid */
+ continue;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ /* Prewarm buffers. */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.tablespace == tablespace &&
+ blk.filenumber == filenumber &&
+ blk.forknum == forknum &&
+ have_free_buffer())
+ {
+ CHECK_FOR_INTERRUPTS();
- old_blk = blk;
- }
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk.blocknum >= nblocks)
+ {
+ blk = block_info[++i];
+ continue;
+ }
- dsm_detach(seg);
+ buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
+ NULL);
+
+ blk = block_info[++i];
+ if (!BufferIsValid(buf))
+ break;
+
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
On 3 Apr 2025, at 21:25, Melanie Plageman <melanieplageman@gmail.com> wrote:
On Thu, Apr 3, 2025 at 11:17 AM Heikki Linnakangas <hlinnaka@iki.fi> wrote:
I had a quick look at this. Looks good overall
Same here, this seemed like a good piece to bite into with my limited AIO
knowledge to learn more, and reading it over it seems like a good change.
A few small comments:
+ * `pos` is the read stream callback's index into block_info. Because the
I'm not a fan of markdown in code comments (also in a few more places).
+ /* Time to try and open our new found relation */
s/new found/newfound/
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ CHECK_FOR_INTERRUPTS();
Isn't checking inside this loop increasing the frequency of checks compared to
the current version?
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
Is there a non programmer-error case where this can happen? The Assert right
after a loop around the same function seems to imply there is a race or toctou
case which if so could use a comment.
--
Daniel Gustafsson
On Thu, Apr 3, 2025 at 4:22 PM Daniel Gustafsson <daniel@yesql.se> wrote:
I had a quick look at this. Looks good overall
Same here, this seemed like a good piece to bite into with my limited AIO
knowledge to learn more, and reading it over it seems like a good change.
Thanks for taking a look!
A few small comments:
+ * `pos` is the read stream callback's index into block_info. Because the
I'm not a fan of markdown in code comments (also in a few more places).
Removed them. I got the idea of doing this to distinguish variable
names in comments from English words. But I can see how it is kind of
distracting -- since it is not common in the codebase.
+ /* Time to try and open our new found relation */
s/new found/newfound/
Fixed
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ CHECK_FOR_INTERRUPTS();
Isn't checking inside this loop increasing the frequency of checks compared to
the current version?
It's unclear. The current version does seem to execute the main while
loop (including the CFI) once per block -- even for blocks that it
doesn't end up reading for whatever reason. Things get murkier with
the read stream code. But I put it in the callback to keep the general
idea of doing a CFI once per block. In attached v14, I moved the CFI
to the top of the callback, outside of the loop, to make that
intention more clear.
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
Is there a non programmer-error case where this can happen? The Assert right
after a loop around the same function seems to imply there is a race or toctou
case which if so could use a comment.
Yep. Good call. At some point one read stream user had this assert
because its invocation of read_stream_next_buffer() was interleaved with
other stuff, so it wasn't obvious that the stream would be exhausted
when it was time to end it. And the assert helped defend that
invariant against future innovation :) I think I've copy-pasta'd this
assert around for no good reason to other read stream users. I've
removed it in v14 and I should probably do a follow-on commit to
master to remove it from the other places it obviously doesn't belong
and is a confusing distraction for future readers.
- Melanie
Attachments:
v14-0001-Fix-autoprewarm-neglect-of-tablespaces.patchtext/x-patch; charset=US-ASCII; name=v14-0001-Fix-autoprewarm-neglect-of-tablespaces.patchDownload
From 11f706609998aebbeac06e282a6e1d25d3f38fdd Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 3 Apr 2025 12:47:19 -0400
Subject: [PATCH v14 1/4] Fix autoprewarm neglect of tablespaces
While prewarming blocks from a dump file, autoprewarm_database_main()
mistakenly ignored tablespace when detecting the beginning of the next
relation to prewarm. Because RelFileNumbers are only unique within a
tablespace, autoprewarm could miss prewarming blocks from a
relation with the same RelFileNumber in a different tablespace.
Though this situation is likely rare in practice, it's best to make the
code correct. Do so by explicitly checking the tablespace, in addition to
the RelFileNumber, when detecting a new relation.
Reported-by: Heikki Linnakangas <hlinnaka@iki.fi>
Discussion: https://postgr.es/m/97c36982-603b-494a-95f4-aaf2a12ac27e%40iki.fi
---
contrib/pg_prewarm/autoprewarm.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 73485a2323c..760b1548eff 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -472,10 +472,15 @@ autoprewarm_database_main(Datum main_arg)
/*
* As soon as we encounter a block of a new relation, close the old
- * relation. Note that rel will be NULL if try_relation_open failed
- * previously; in that case, there is nothing to close.
+ * relation. RelFileNumbers are only guaranteed to be unique within a
+ * tablespace, so check that too.
+ *
+ * Note that rel will be NULL if try_relation_open failed previously;
+ * in that case, there is nothing to close.
*/
- if (old_blk != NULL && old_blk->filenumber != blk->filenumber &&
+ if (old_blk != NULL &&
+ (old_blk->tablespace != blk->tablespace ||
+ old_blk->filenumber != blk->filenumber) &&
rel != NULL)
{
relation_close(rel, AccessShareLock);
@@ -487,7 +492,9 @@ autoprewarm_database_main(Datum main_arg)
* Try to open each new relation, but only once, when we first
* encounter it. If it's been dropped, skip the associated blocks.
*/
- if (old_blk == NULL || old_blk->filenumber != blk->filenumber)
+ if (old_blk == NULL ||
+ old_blk->tablespace != blk->tablespace ||
+ old_blk->filenumber != blk->filenumber)
{
Oid reloid;
@@ -508,6 +515,7 @@ autoprewarm_database_main(Datum main_arg)
/* Once per fork, check for fork existence and size. */
if (old_blk == NULL ||
+ old_blk->tablespace != blk->tablespace ||
old_blk->filenumber != blk->filenumber ||
old_blk->forknum != blk->forknum)
{
--
2.34.1
v14-0003-Refactor-autoprewarm_database_main-in-preparatio.patchtext/x-patch; charset=US-ASCII; name=v14-0003-Refactor-autoprewarm_database_main-in-preparatio.patchDownload
From 915fe5b0db6ea8599bc21f95980a352ae404463d Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 31 Mar 2025 22:02:25 -0400
Subject: [PATCH v14 3/4] Refactor autoprewarm_database_main() in preparation
for read stream
Autoprewarm prewarms blocks from a dump file representing the contents
of shared buffers at the time it was dumped. It uses a sorted array of
BlockInfoRecords, each representing a block from one of the cluster's
databases and tables.
autoprewarm_database_main() prewarms all the blocks from a single
database. It is optimized to ensure we don't try to open the same
relation or fork over and over again if it has been dropped or is
invalid. The main loop handled this by carefully setting various local
variables to sentinel values when a run of blocks should be skipped.
This method won't work with the read stream API. A read stream can only
be created for a single relation and fork combination. The callback has
to be able to advance the position in the array to allow for reading
ahead additional blocks, however the callback cannot try to open another
relation or close the current relation. So, the main loop in
autoprewarm_database_main() must also advance the position in the array
of BlockInfoRecords.
To make it compatible with the read stream API, change
autoprewarm_database_main() to explicitly fast-forward in the array past
the blocks belonging to an invalid relation or fork.
This commit only implements the new control flow -- it does not use the
read stream API.
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Daniel Gustafsson <daniel@yesql.se>
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 172 +++++++++++++++++--------------
1 file changed, 94 insertions(+), 78 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 5f6dca57cdd..761f6a77926 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -429,11 +429,9 @@ apw_load_buffers(void)
void
autoprewarm_database_main(Datum main_arg)
{
- int pos;
BlockInfoRecord *block_info;
- Relation rel = NULL;
- BlockNumber nblocks = 0;
- BlockInfoRecord *old_blk = NULL;
+ int i;
+ BlockInfoRecord blk;
dsm_segment *seg;
/* Establish signal handlers; once that's done, unblock signals. */
@@ -449,16 +447,20 @@ autoprewarm_database_main(Datum main_arg)
errmsg("could not map dynamic shared memory segment")));
BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
block_info = (BlockInfoRecord *) dsm_segment_address(seg);
- pos = apw_state->prewarm_start_idx;
+
+ i = apw_state->prewarm_start_idx;
+ blk = block_info[i];
/*
* Loop until we run out of blocks to prewarm or until we run out of free
* buffers.
*/
- while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ while (i < apw_state->prewarm_stop_idx && have_free_buffer())
{
- BlockInfoRecord *blk = &block_info[pos++];
- Buffer buf;
+ Oid tablespace = blk.tablespace;
+ RelFileNumber filenumber = blk.filenumber;
+ Oid reloid;
+ Relation rel;
CHECK_FOR_INTERRUPTS();
@@ -466,97 +468,111 @@ autoprewarm_database_main(Datum main_arg)
* All blocks between prewarm_start_idx and prewarm_stop_idx should
* belong either to global objects or the same database.
*/
- Assert(blk->database == apw_state->database || blk->database == 0);
+ Assert(blk.database == apw_state->database || blk.database == 0);
- /*
- * As soon as we encounter a block of a new relation, close the old
- * relation. RelFileNumbers are only guaranteed to be unique within a
- * tablespace, so check that too.
- *
- * Note that rel will be NULL if try_relation_open failed previously;
- * in that case, there is nothing to close.
- */
- if (old_blk != NULL &&
- (old_blk->tablespace != blk->tablespace ||
- old_blk->filenumber != blk->filenumber) &&
- rel != NULL)
- {
- relation_close(rel, AccessShareLock);
- rel = NULL;
- CommitTransactionCommand();
- }
+ StartTransactionCommand();
- /*
- * Try to open each new relation, but only once, when we first
- * encounter it. If it's been dropped, skip the associated blocks.
- */
- if (old_blk == NULL ||
- old_blk->tablespace != blk->tablespace ||
- old_blk->filenumber != blk->filenumber)
+ reloid = RelidByRelfilenumber(blk.tablespace, blk.filenumber);
+ if (!OidIsValid(reloid) ||
+ (rel = try_relation_open(reloid, AccessShareLock)) == NULL)
{
- Oid reloid;
+ /* We failed to open the relation, so there is nothing to close. */
+ CommitTransactionCommand();
- Assert(rel == NULL);
- StartTransactionCommand();
- reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber);
- if (OidIsValid(reloid))
- rel = try_relation_open(reloid, AccessShareLock);
+ /*
+ * Fast-forward to the next relation. We want to skip all of the
+ * other records referencing this relation since we know we can't
+ * open it. That way, we avoid repeatedly trying and failing to
+ * open the same relation.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if (blk.tablespace != tablespace ||
+ blk.filenumber != filenumber)
+ break;
+ }
- if (!rel)
- CommitTransactionCommand();
- }
- if (!rel)
- {
- old_blk = blk;
+ /* Time to try and open our newfound relation */
continue;
}
- /* Once per fork, check for fork existence and size. */
- if (old_blk == NULL ||
- old_blk->tablespace != blk->tablespace ||
- old_blk->filenumber != blk->filenumber ||
- old_blk->forknum != blk->forknum)
+ /*
+ * We have a relation; now let's loop until we find a valid fork of
+ * the relation or we run out of free buffers. Once we've read from
+ * all valid forks or run out of options, we'll close the relation and
+ * move on.
+ */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.tablespace == tablespace &&
+ blk.filenumber == filenumber &&
+ have_free_buffer())
{
+ ForkNumber forknum = blk.forknum;
+ BlockNumber nblocks;
+ Buffer buf;
+
/*
* smgrexists is not safe for illegal forknum, hence check whether
* the passed forknum is valid before using it in smgrexists.
*/
- if (blk->forknum > InvalidForkNumber &&
- blk->forknum <= MAX_FORKNUM &&
- smgrexists(RelationGetSmgr(rel), blk->forknum))
- nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
- else
- nblocks = 0;
- }
+ if (blk.forknum <= InvalidForkNumber ||
+ blk.forknum > MAX_FORKNUM ||
+ !smgrexists(RelationGetSmgr(rel), blk.forknum))
+ {
+ /*
+ * Fast-forward to the next fork. We want to skip all of the
+ * other records referencing this fork since we already know
+ * it's not valid.
+ */
+ for (; i < apw_state->prewarm_stop_idx; i++)
+ {
+ blk = block_info[i];
+ if (blk.tablespace != tablespace ||
+ blk.filenumber != filenumber ||
+ blk.forknum != forknum)
+ break;
+ }
+
+ /* Time to check if this newfound fork is valid */
+ continue;
+ }
- /* Check whether blocknum is valid and within fork file size. */
- if (blk->blocknum >= nblocks)
- {
- /* Move to next forknum. */
- old_blk = blk;
- continue;
- }
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Prewarm buffer. */
- buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
- NULL);
- if (BufferIsValid(buf))
- {
- apw_state->prewarmed_blocks++;
- ReleaseBuffer(buf);
- }
+ /* Prewarm buffers. */
+ while (i < apw_state->prewarm_stop_idx &&
+ blk.tablespace == tablespace &&
+ blk.filenumber == filenumber &&
+ blk.forknum == forknum &&
+ have_free_buffer())
+ {
+ CHECK_FOR_INTERRUPTS();
- old_blk = blk;
- }
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk.blocknum >= nblocks)
+ {
+ blk = block_info[++i];
+ continue;
+ }
- dsm_detach(seg);
+ buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
+ NULL);
+
+ blk = block_info[++i];
+ if (!BufferIsValid(buf))
+ break;
+
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+ }
- /* Release lock on previous relation. */
- if (rel)
- {
relation_close(rel, AccessShareLock);
CommitTransactionCommand();
}
+
+ dsm_detach(seg);
}
/*
--
2.34.1
v14-0004-Use-streaming-read-I-O-in-autoprewarm.patchtext/x-patch; charset=US-ASCII; name=v14-0004-Use-streaming-read-I-O-in-autoprewarm.patchDownload
From b259afd02235ac7189e64ec2d54d59c851eab921 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 1 Apr 2025 18:07:38 -0400
Subject: [PATCH v14 4/4] Use streaming read I/O in autoprewarm
Make a read stream for each valid fork of each valid relation
represented in the autoprewarm dump file and prewarm those blocks
through the read stream API instead of by directly invoking
ReadBuffer().
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Co-authored-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Daniel Gustafsson <daniel@yesql.se>
Reviewed-by: Andrey M. Borodin <x4mmm@yandex-team.ru> (earlier versions)
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com> (earlier versions)
Reviewed-by: Matheus Alcantara <mths.dev@pm.me> (earlier versions)
Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com
---
contrib/pg_prewarm/autoprewarm.c | 125 +++++++++++++++++++++++++------
src/tools/pgindent/typedefs.list | 1 +
2 files changed, 103 insertions(+), 23 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 761f6a77926..195dc4b773b 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -41,6 +41,7 @@
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/procsignal.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
@@ -75,6 +76,28 @@ typedef struct AutoPrewarmSharedState
int prewarmed_blocks;
} AutoPrewarmSharedState;
+/*
+ * Private data passed through the read stream API for our use in the
+ * callback.
+ */
+typedef struct AutoPrewarmReadStreamData
+{
+ /* The array of records containing the blocks we should prewarm. */
+ BlockInfoRecord *block_info;
+
+ /*
+ * pos is the read stream callback's index into block_info. Because the
+ * read stream may read ahead, pos is likely to be ahead of the index in
+ * the main loop in autoprewarm_database_main().
+ */
+ int pos;
+ Oid tablespace;
+ RelFileNumber filenumber;
+ ForkNumber forknum;
+ BlockNumber nblocks;
+} AutoPrewarmReadStreamData;
+
+
PGDLLEXPORT void autoprewarm_main(Datum main_arg);
PGDLLEXPORT void autoprewarm_database_main(Datum main_arg);
@@ -422,6 +445,54 @@ apw_load_buffers(void)
apw_state->prewarmed_blocks, num_elements)));
}
+/*
+ * Return the next block number of a specific relation and fork to read
+ * according to the array of BlockInfoRecord.
+ */
+static BlockNumber
+apw_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ AutoPrewarmReadStreamData *p = callback_private_data;
+
+ CHECK_FOR_INTERRUPTS();
+
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ if (!have_free_buffer())
+ {
+ p->pos = apw_state->prewarm_stop_idx;
+ return InvalidBlockNumber;
+ }
+
+ if (blk.tablespace != p->tablespace)
+ return InvalidBlockNumber;
+
+ if (blk.filenumber != p->filenumber)
+ return InvalidBlockNumber;
+
+ if (blk.forknum != p->forknum)
+ return InvalidBlockNumber;
+
+ p->pos++;
+
+ /*
+ * Check whether blocknum is valid and within fork file size.
+ * Fast-forward through any invalid blocks. We want p->pos to reflect
+ * the location of the next relation or fork before ending the stream.
+ */
+ if (blk.blocknum >= p->nblocks)
+ continue;
+
+ return blk.blocknum;
+ }
+
+ return InvalidBlockNumber;
+}
+
/*
* Prewarm all blocks for one database (and possibly also global objects, if
* those got grouped with this database).
@@ -462,8 +533,6 @@ autoprewarm_database_main(Datum main_arg)
Oid reloid;
Relation rel;
- CHECK_FOR_INTERRUPTS();
-
/*
* All blocks between prewarm_start_idx and prewarm_stop_idx should
* belong either to global objects or the same database.
@@ -510,6 +579,8 @@ autoprewarm_database_main(Datum main_arg)
{
ForkNumber forknum = blk.forknum;
BlockNumber nblocks;
+ struct AutoPrewarmReadStreamData p;
+ ReadStream *stream;
Buffer buf;
/*
@@ -540,32 +611,40 @@ autoprewarm_database_main(Datum main_arg)
nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum);
- /* Prewarm buffers. */
- while (i < apw_state->prewarm_stop_idx &&
- blk.tablespace == tablespace &&
- blk.filenumber == filenumber &&
- blk.forknum == forknum &&
- have_free_buffer())
+ p = (struct AutoPrewarmReadStreamData)
{
- CHECK_FOR_INTERRUPTS();
-
- /* Check whether blocknum is valid and within fork file size. */
- if (blk.blocknum >= nblocks)
- {
- blk = block_info[++i];
- continue;
- }
-
- buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL,
- NULL);
-
- blk = block_info[++i];
- if (!BufferIsValid(buf))
- break;
+ .block_info = block_info,
+ .pos = i,
+ .tablespace = tablespace,
+ .filenumber = filenumber,
+ .forknum = forknum,
+ .nblocks = nblocks,
+ };
+
+ stream = read_stream_begin_relation(READ_STREAM_FULL,
+ NULL,
+ rel,
+ p.forknum,
+ apw_read_stream_next_block,
+ &p,
+ 0);
+ /*
+ * Loop until we've prewarmed all the blocks from this fork. The
+ * read stream callback will check that we still have free buffers
+ * before requesting each block from the read stream API.
+ */
+ while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
+ {
apw_state->prewarmed_blocks++;
ReleaseBuffer(buf);
}
+
+ read_stream_end(stream);
+
+ /* Advance i past all the blocks just prewarmed. */
+ i = p.pos;
+ blk = block_info[i];
}
relation_close(rel, AccessShareLock);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8f28d8ff28e..5ac290fae78 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -175,6 +175,7 @@ AttributeOpts
AuthRequest
AuthToken
AutoPrewarmSharedState
+AutoPrewarmReadStreamData
AutoVacOpts
AutoVacuumShmemStruct
AutoVacuumWorkItem
--
2.34.1
v14-0002-Remove-superfluous-autoprewarm-check.patchtext/x-patch; charset=US-ASCII; name=v14-0002-Remove-superfluous-autoprewarm-check.patchDownload
From 5a72bd6d00f2068e8f03b956dcafcf2fe3be8987 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 3 Apr 2025 14:54:09 -0400
Subject: [PATCH v14 2/4] Remove superfluous autoprewarm check
autoprewarm_database_main() prewarms blocks from the same database. It
is passed an array of sorted BlockInfoRecords and a start and stop index
into the array. The range represented should include only blocks
belonging to global objects or blocks from a single database. Remove an
unnecessary check that the current block is from the same database and
add an assert to ensure this invariant remains.
---
contrib/pg_prewarm/autoprewarm.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 760b1548eff..5f6dca57cdd 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -463,12 +463,10 @@ autoprewarm_database_main(Datum main_arg)
CHECK_FOR_INTERRUPTS();
/*
- * Quit if we've reached records for another database. If previous
- * blocks are of some global objects, then continue pre-warming.
+ * All blocks between prewarm_start_idx and prewarm_stop_idx should
+ * belong either to global objects or the same database.
*/
- if (old_blk != NULL && old_blk->database != blk->database &&
- old_blk->database != 0)
- break;
+ Assert(blk->database == apw_state->database || blk->database == 0);
/*
* As soon as we encounter a block of a new relation, close the old
--
2.34.1
On 3 Apr 2025, at 22:54, Melanie Plageman <melanieplageman@gmail.com> wrote:
On Thu, Apr 3, 2025 at 4:22 PM Daniel Gustafsson <daniel@yesql.se> wrote:
+ while (p->pos < apw_state->prewarm_stop_idx)
+ {
+ BlockInfoRecord blk = p->block_info[p->pos];
+
+ CHECK_FOR_INTERRUPTS();
Isn't checking inside this loop increasing the frequency of checks compared to
the current version?
It's unclear. The current version does seem to execute the main while
loop (including the CFI) once per block -- even for blocks that it
doesn't end up reading for whatever reason. Things get murkier with
the read stream code. But I put it in the callback to keep the general
idea of doing a CFI once per block. In attached v14, I moved the CFI
to the top of the callback, outside of the loop, to make that
intention more clear.
LGTM.
+ Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
Is there a non programmer-error case where this can happen? The Assert right
after a loop around the same function seems to imply there is a race or toctou
case which if so could use a comment.
Yep. Good call. At some point one read stream user had this assert
because its invocation of read_stream_next_buffer() was interleaved with
other stuff, so it wasn't obvious that the stream would be exhausted
when it was time to end it. And the assert helped defend that
invariant against future innovation :) I think I've copy-pasta'd this
assert around for no good reason to other read stream users. I've
removed it in v14 and I should probably do a follow-on commit to
master to remove it from the other places it obviously doesn't belong
and is a confusing distraction for future readers.
Makes sense, thanks for clarifying and I agree with removing the assertion.
This patch is already marked Ready for Committer and I concur with that.
--
Daniel Gustafsson
Hi,
On Fri, 4 Apr 2025 at 10:59, Daniel Gustafsson <daniel@yesql.se> wrote:
This patch is already marked Ready for Committer and I concur with that.
Same on my end, v14 LGTM.
--
Regards,
Nazir Bilal Yavuz
Microsoft