Add os_page_num to pg_buffercache
Hi hackers,
I was doing some more tests around ba2a3c2302f (pg_buffercache_numa) and
thought that seeing how buffers are spread across multiple OS pages (if that's
the case) thanks to the os_page_num field is good information to have.
The thing that I think is annoying is that to get this information (os_page_num):
- One needs to use pg_buffercache_numa (which is more costly/slower) than pg_buffercache
- One needs a system with NUMA support enabled
So why not also add this information (os_page_num) in pg_buffercache?
- It would make this information available on all systems, not just NUMA-enabled ones
- It would help understand the memory layout implications of configuration changes
such as database block size, OS page size (huge pages for example) and see how the
buffers are spread across OS pages (if that's the case).
So, please find attached a patch to $SUBJECT then.
Remarks:
- Maybe we could create a helper function to reduce the code duplication between
pg_buffercache_pages() and pg_buffercache_numa_pages()
- I think it would have made sense to also add this information while working
on ba2a3c2302f but (unfortunately) I doubt that this patch is candidate for v18
post freeze (it looks more a feature enhancement than anything else)
- It's currently doing the changes in pg_buffercache v1.6 but will need to
create v1.7 for 19 (if the above stands true)
Looking forward to your feedback,
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v1-0001-Add-os_page_num-to-pg_buffercache.patchtext/x-diff; charset=us-asciiDownload
From 1d3516efd6ee1c18a82f56b87c5afc3d54fb62cf Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Thu, 10 Apr 2025 05:49:45 +0000
Subject: [PATCH v1] Add os_page_num to pg_buffercache
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding the same information in pg_buffercache so that one does not need NUMA
support enabled to get this information.
---
.../expected/pg_buffercache.out | 2 +-
.../pg_buffercache--1.5--1.6.sql | 7 ++
contrib/pg_buffercache/pg_buffercache_pages.c | 118 ++++++++++++++----
contrib/pg_buffercache/sql/pg_buffercache.sql | 2 +-
doc/src/sgml/pgbuffercache.sgml | 34 +++++
5 files changed, 135 insertions(+), 28 deletions(-)
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..52e38dcc027 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -1,5 +1,5 @@
CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
from pg_settings
where name = 'shared_buffers')
from pg_buffercache;
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
index 458f054a691..8aa81d15688 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -44,3 +44,10 @@ CREATE FUNCTION pg_buffercache_evict_all(
OUT buffers_skipped int4)
AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all'
LANGUAGE C PARALLEL SAFE VOLATILE;
+
+-- Upgrade view to 1.6. format
+CREATE OR REPLACE VIEW pg_buffercache AS
+ SELECT P.* FROM pg_buffercache_pages() AS P
+ (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+ relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+ pinning_backends int4, os_page_num bigint);
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index e1701bd56ef..b941aba2b7e 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -19,7 +19,7 @@
#define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8
-#define NUM_BUFFERCACHE_PAGES_ELEM 9
+#define NUM_BUFFERCACHE_PAGES_ELEM 10
#define NUM_BUFFERCACHE_SUMMARY_ELEM 5
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
#define NUM_BUFFERCACHE_EVICT_ELEM 2
@@ -54,6 +54,7 @@ typedef struct
* because of bufmgr.c's PrivateRefCount infrastructure.
*/
int32 pinning_backends;
+ int64 page_num;
} BufferCachePagesRec;
@@ -63,6 +64,9 @@ typedef struct
typedef struct
{
TupleDesc tupdesc;
+ int buffers_per_page;
+ int pages_per_buffer;
+ int os_page_size;
BufferCachePagesRec *record;
} BufferCachePagesContext;
@@ -119,8 +123,25 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
if (SRF_IS_FIRSTCALL())
{
- int i;
+ int i,
+ idx;
+ Size os_page_size;
+ char *startptr;
+ int pages_per_buffer;
+ int max_entries;
+ /*
+ * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
+ * the OS may have different memory page sizes.
+ *
+ * To correctly map between them, we need to: 1. Determine the OS memory
+ * page size 2. Calculate how many OS pages are used by all buffer blocks
+ * 3. Calculate how many OS pages are contained within each database
+ * block.
+ */
+ os_page_size = pg_get_shmem_pagesize();
+
+ /* Initialize the multi-call context, load entries about buffers */
funcctx = SRF_FIRSTCALL_INIT();
/* Switch context when allocating stuff to be used in later calls */
@@ -163,24 +184,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
INT2OID, -1, 0);
- if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+ if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1))
TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
INT4OID, -1, 0);
+ if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+ TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num",
+ INT8OID, -1, 0);
+
fctx->tupdesc = BlessTupleDesc(tupledesc);
- /* Allocate NBuffers worth of BufferCachePagesRec records. */
+ /*
+ * Each buffer needs at least one entry, but it might be offset in
+ * some way, and use one extra entry. So we allocate space for the
+ * maximum number of entries we might need, and then count the exact
+ * number as we're walking buffers. That way we can do it in one pass,
+ * without reallocating memory.
+ */
+ pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
+ max_entries = NBuffers * pages_per_buffer;
+
+ /* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCachePagesRec *)
MemoryContextAllocHuge(CurrentMemoryContext,
- sizeof(BufferCachePagesRec) * NBuffers);
-
- /* Set max calls and remember the user function context. */
- funcctx->max_calls = NBuffers;
- funcctx->user_fctx = fctx;
+ sizeof(BufferCachePagesRec) * max_entries);
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+ idx = 0;
/*
* Scan through all the buffers, saving the relevant fields in the
* fctx->record structure.
@@ -191,35 +224,65 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
*/
for (i = 0; i < NBuffers; i++)
{
+ char *buffptr = (char *) BufferGetBlock(i + 1);
BufferDesc *bufHdr;
uint32 buf_state;
+ int32 page_num;
+ char *startptr_buff,
+ *endptr_buff;
bufHdr = GetBufferDescriptor(i);
/* Lock each buffer header before inspecting. */
buf_state = LockBufHdr(bufHdr);
- fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
- fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
- fctx->record[i].reltablespace = bufHdr->tag.spcOid;
- fctx->record[i].reldatabase = bufHdr->tag.dbOid;
- fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
- fctx->record[i].blocknum = bufHdr->tag.blockNum;
- fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
- fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+ /* start of the first page of this buffer */
+ startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
- if (buf_state & BM_DIRTY)
- fctx->record[i].isdirty = true;
- else
- fctx->record[i].isdirty = false;
+ /* end of the buffer (no need to align to memory page) */
+ endptr_buff = buffptr + BLCKSZ;
- /* Note if the buffer is valid, and has storage created */
- if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
- fctx->record[i].isvalid = true;
- else
- fctx->record[i].isvalid = false;
+ Assert(startptr_buff < endptr_buff);
+
+ /* calculate ID of the first page for this buffer */
+ page_num = (startptr_buff - startptr) / os_page_size;
+
+ /* Add an entry for each OS page overlapping with this buffer. */
+ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+ {
+ fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr);
+ fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+ fctx->record[idx].reltablespace = bufHdr->tag.spcOid;
+ fctx->record[idx].reldatabase = bufHdr->tag.dbOid;
+ fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag);
+ fctx->record[idx].blocknum = bufHdr->tag.blockNum;
+ fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+ fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+ if (buf_state & BM_DIRTY)
+ fctx->record[idx].isdirty = true;
+ else
+ fctx->record[idx].isdirty = false;
+
+ /* Note if the buffer is valid, and has storage created */
+ if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+ fctx->record[idx].isvalid = true;
+ else
+ fctx->record[idx].isvalid = false;
+
+ fctx->record[idx].page_num = page_num;
+ /* advance to the next entry/page */
+ ++idx;
+ ++page_num;
+ }
UnlockBufHdr(bufHdr, buf_state);
}
+
+ Assert(idx <= max_entries);
+
+ /* Set max calls and remember the user function context. */
+ funcctx->max_calls = idx;
+ funcctx->user_fctx = fctx;
}
funcctx = SRF_PERCALL_SETUP();
@@ -252,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
nulls[7] = true;
/* unused for v1.0 callers, but the array is always long enough */
nulls[8] = true;
+ nulls[9] = true;
}
else
{
@@ -272,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
/* unused for v1.0 callers, but the array is always long enough */
values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
nulls[8] = false;
+ values[9] = Int64GetDatum(fctx->record[i].page_num);
+ nulls[9] = false;
}
/* Build and return the tuple. */
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..78ec7a88ea0 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -1,6 +1,6 @@
CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
from pg_settings
where name = 'shared_buffers')
from pg_buffercache;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 537d6014942..6183adf73db 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -205,6 +205,15 @@
Number of backends pinning this buffer
</para></entry>
</row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ number of OS memory page for this buffer
+ </para></entry>
+ </row>
</tbody>
</tgroup>
</table>
@@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
--
2.34.1
On 4/10/25 15:17, Bertrand Drouvot wrote:
Hi hackers,
I was doing some more tests around ba2a3c2302f (pg_buffercache_numa) and
thought that seeing how buffers are spread across multiple OS pages (if that's
the case) thanks to the os_page_num field is good information to have.The thing that I think is annoying is that to get this information (os_page_num):
- One needs to use pg_buffercache_numa (which is more costly/slower) than pg_buffercache
- One needs a system with NUMA support enabledSo why not also add this information (os_page_num) in pg_buffercache?
- It would make this information available on all systems, not just NUMA-enabled ones
- It would help understand the memory layout implications of configuration changes
such as database block size, OS page size (huge pages for example) and see how the
buffers are spread across OS pages (if that's the case).So, please find attached a patch to $SUBJECT then.
Remarks:
- Maybe we could create a helper function to reduce the code duplication between
pg_buffercache_pages() and pg_buffercache_numa_pages()
- I think it would have made sense to also add this information while working
on ba2a3c2302f but (unfortunately) I doubt that this patch is candidate for v18
post freeze (it looks more a feature enhancement than anything else)
- It's currently doing the changes in pg_buffercache v1.6 but will need to
create v1.7 for 19 (if the above stands true)
This seems like a good idea in principle, but at this point it has to
wait for PG19. Please add it to the July commitfest.
regards
--
Tomas Vondra
On Thu, Apr 10, 2025 at 03:35:24PM +0200, Tomas Vondra wrote:
This seems like a good idea in principle, but at this point it has to
wait for PG19. Please add it to the July commitfest.
+1. From a glance, this seems to fall in the "new feature" bucket and
should likely wait for v19.
--
nathan
Hi,
On Thu, Apr 10, 2025 at 09:58:18AM -0500, Nathan Bossart wrote:
On Thu, Apr 10, 2025 at 03:35:24PM +0200, Tomas Vondra wrote:
This seems like a good idea in principle, but at this point it has to
wait for PG19. Please add it to the July commitfest.+1. From a glance, this seems to fall in the "new feature" bucket and
should likely wait for v19.
Thank you both for providing your thoughts that confirm my initial doubt. Let's
come back to that one later then.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Hi,
On Thu, Apr 10, 2025 at 03:05:29PM +0000, Bertrand Drouvot wrote:
Hi,
On Thu, Apr 10, 2025 at 09:58:18AM -0500, Nathan Bossart wrote:
On Thu, Apr 10, 2025 at 03:35:24PM +0200, Tomas Vondra wrote:
This seems like a good idea in principle, but at this point it has to
wait for PG19. Please add it to the July commitfest.+1. From a glance, this seems to fall in the "new feature" bucket and
should likely wait for v19.Thank you both for providing your thoughts that confirm my initial doubt. Let's
come back to that one later then.
Here we are.
Please find attached a rebased version and while at it, v2 adds a new macro and
a function to avoid some code duplication between pg_buffercache_pages() and
pg_buffercache_numa_pages().
So, PFA:
0001 - Introduce GET_MAX_BUFFER_ENTRIES and get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages() and
pg_buffercache_numa_pages() makes use of them.
0002 - Add os_page_num to pg_buffercache
Making use of the new macro and function from 0001.
As it's for v19, also bumping pg_buffercache's version to 1.7.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v2-0002-Add-os_page_num-to-pg_buffercache.patchtext/x-diff; charset=us-asciiDownload
From 4c5b1aad892240e72d8e18786e7cc409fb5e8f85 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 12:03:55 +0000
Subject: [PATCH v2 2/2] Add os_page_num to pg_buffercache
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding the same information in pg_buffercache so that one does not need NUMA
support enabled to get this information.
---
contrib/pg_buffercache/Makefile | 2 +-
.../expected/pg_buffercache.out | 2 +-
contrib/pg_buffercache/meson.build | 1 +
.../pg_buffercache--1.6--1.7.sql | 11 ++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 108 ++++++++++++++----
contrib/pg_buffercache/sql/pg_buffercache.sql | 2 +-
doc/src/sgml/pgbuffercache.sgml | 34 ++++++
8 files changed, 133 insertions(+), 29 deletions(-)
create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..0e618f66aec 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,7 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..52e38dcc027 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -1,5 +1,5 @@
CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
from pg_settings
where name = 'shared_buffers')
from pg_buffercache;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..7c31141881f 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..f2ac5d79eb8
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,11 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Upgrade view to 1.7. format
+CREATE OR REPLACE VIEW pg_buffercache AS
+ SELECT P.* FROM pg_buffercache_pages() AS P
+ (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+ relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+ pinning_backends int4, os_page_num bigint);
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..f46de190975 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -19,7 +19,7 @@
#define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8
-#define NUM_BUFFERCACHE_PAGES_ELEM 9
+#define NUM_BUFFERCACHE_PAGES_ELEM 10
#define NUM_BUFFERCACHE_SUMMARY_ELEM 5
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
#define NUM_BUFFERCACHE_EVICT_ELEM 2
@@ -60,6 +60,7 @@ typedef struct
* because of bufmgr.c's PrivateRefCount infrastructure.
*/
int32 pinning_backends;
+ int64 page_num;
} BufferCachePagesRec;
@@ -69,6 +70,9 @@ typedef struct
typedef struct
{
TupleDesc tupdesc;
+ int buffers_per_page;
+ int pages_per_buffer;
+ int os_page_size;
BufferCachePagesRec *record;
} BufferCachePagesContext;
@@ -152,8 +156,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
if (SRF_IS_FIRSTCALL())
{
- int i;
+ int i,
+ idx;
+ Size os_page_size;
+ char *startptr;
+ int max_entries;
+ /*
+ * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+ * while the OS may have different memory page sizes.
+ *
+ * To correctly map between them, we need to: 1. Determine the OS
+ * memory page size 2. Calculate how many OS pages are used by all
+ * buffer blocks 3. Calculate how many OS pages are contained within
+ * each database block.
+ */
+ os_page_size = pg_get_shmem_pagesize();
+
+ /* Initialize the multi-call context, load entries about buffers */
funcctx = SRF_FIRSTCALL_INIT();
/* Switch context when allocating stuff to be used in later calls */
@@ -196,24 +216,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
INT2OID, -1, 0);
- if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+ if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1))
TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
INT4OID, -1, 0);
+ if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+ TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num",
+ INT8OID, -1, 0);
+
fctx->tupdesc = BlessTupleDesc(tupledesc);
+ /*
+ * Each buffer needs at least one entry, but it might be offset in
+ * some way, and use one extra entry. So we allocate space for the
+ * maximum number of entries we might need, and then count the exact
+ * number as we're walking buffers. That way we can do it in one pass,
+ * without reallocating memory.
+ */
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
/* Allocate NBuffers worth of BufferCachePagesRec records. */
fctx->record = (BufferCachePagesRec *)
MemoryContextAllocHuge(CurrentMemoryContext,
- sizeof(BufferCachePagesRec) * NBuffers);
-
- /* Set max calls and remember the user function context. */
- funcctx->max_calls = NBuffers;
- funcctx->user_fctx = fctx;
+ sizeof(BufferCachePagesRec) * max_entries);
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+ idx = 0;
+
/*
* Scan through all the buffers, saving the relevant fields in the
* fctx->record structure.
@@ -224,35 +256,58 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
*/
for (i = 0; i < NBuffers; i++)
{
+ char *buffptr = (char *) BufferGetBlock(i + 1);
BufferDesc *bufHdr;
uint32 buf_state;
+ int32 page_num;
+ char *startptr_buff,
+ *endptr_buff;
bufHdr = GetBufferDescriptor(i);
/* Lock each buffer header before inspecting. */
buf_state = LockBufHdr(bufHdr);
- fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
- fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
- fctx->record[i].reltablespace = bufHdr->tag.spcOid;
- fctx->record[i].reldatabase = bufHdr->tag.dbOid;
- fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
- fctx->record[i].blocknum = bufHdr->tag.blockNum;
- fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
- fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
- if (buf_state & BM_DIRTY)
- fctx->record[i].isdirty = true;
- else
- fctx->record[i].isdirty = false;
+ /* Add an entry for each OS page overlapping with this buffer. */
+ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+ {
+ fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr);
+ fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+ fctx->record[idx].reltablespace = bufHdr->tag.spcOid;
+ fctx->record[idx].reldatabase = bufHdr->tag.dbOid;
+ fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag);
+ fctx->record[idx].blocknum = bufHdr->tag.blockNum;
+ fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+ fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+ if (buf_state & BM_DIRTY)
+ fctx->record[idx].isdirty = true;
+ else
+ fctx->record[idx].isdirty = false;
+
+ /* Note if the buffer is valid, and has storage created */
+ if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+ fctx->record[idx].isvalid = true;
+ else
+ fctx->record[idx].isvalid = false;
- /* Note if the buffer is valid, and has storage created */
- if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
- fctx->record[i].isvalid = true;
- else
- fctx->record[i].isvalid = false;
+ fctx->record[idx].page_num = page_num;
+ /* advance to the next entry/page */
+ ++idx;
+ ++page_num;
+ }
UnlockBufHdr(bufHdr, buf_state);
}
+
+ Assert(idx <= max_entries);
+
+ /* Set max calls and remember the user function context. */
+ funcctx->max_calls = idx;
+ funcctx->user_fctx = fctx;
}
funcctx = SRF_PERCALL_SETUP();
@@ -285,6 +340,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
nulls[7] = true;
/* unused for v1.0 callers, but the array is always long enough */
nulls[8] = true;
+ nulls[9] = true;
}
else
{
@@ -305,6 +361,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
/* unused for v1.0 callers, but the array is always long enough */
values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
nulls[8] = false;
+ values[9] = Int64GetDatum(fctx->record[i].page_num);
+ nulls[9] = false;
}
/* Build and return the tuple. */
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..78ec7a88ea0 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -1,6 +1,6 @@
CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
from pg_settings
where name = 'shared_buffers')
from pg_buffercache;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 537d6014942..6183adf73db 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -205,6 +205,15 @@
Number of backends pinning this buffer
</para></entry>
</row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ number of OS memory page for this buffer
+ </para></entry>
+ </row>
</tbody>
</tgroup>
</table>
@@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
--
2.34.1
v2-0001-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From 2520b83d0d4ca9dae9853174ac98473a4c63eb83 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 11:38:37 +0000
Subject: [PATCH v2 1/2] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages().
Currently, this is used by pg_buffercache_numa_pages() only but will be
used by pg_buffercache_pages() in a following commit.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 50 ++++++++++++++-----
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index ae0291e6e96..8ef13d74186 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -105,6 +111,33 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -318,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -426,8 +458,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -473,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr, buf_state);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
On 7/1/25 15:45, Bertrand Drouvot wrote:
Hi,
On Thu, Apr 10, 2025 at 03:05:29PM +0000, Bertrand Drouvot wrote:
Hi,
On Thu, Apr 10, 2025 at 09:58:18AM -0500, Nathan Bossart wrote:
On Thu, Apr 10, 2025 at 03:35:24PM +0200, Tomas Vondra wrote:
This seems like a good idea in principle, but at this point it has to
wait for PG19. Please add it to the July commitfest.+1. From a glance, this seems to fall in the "new feature" bucket and
should likely wait for v19.Thank you both for providing your thoughts that confirm my initial doubt. Let's
come back to that one later then.Here we are.
Please find attached a rebased version and while at it, v2 adds a new macro and
a function to avoid some code duplication between pg_buffercache_pages() and
pg_buffercache_numa_pages().So, PFA:
0001 - Introduce GET_MAX_BUFFER_ENTRIES and get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages() and
pg_buffercache_numa_pages() makes use of them.0002 - Add os_page_num to pg_buffercache
Making use of the new macro and function from 0001.
As it's for v19, also bumping pg_buffercache's version to 1.7.
Thanks for the updated patch!
I took a quick look on this, and I doubt we want to change the schema of
pg_buffercache like this. Adding columns is fine, but it seems rather
wrong to change the cardinality. The view is meant to be 1:1 mapping for
buffers, but now suddenly it's 1:1 with memory pages. Or rather (buffer,
page), to be precise.
I think this will break a lot of monitoring queries, and possibly in a
very subtle way - especially on systems with huge pages, where most
buffers will have one row, but then a buffer that happens to be split on
two pages will have two rows. That seems not great.
Just look at the changes needed in regression tests, where the first
test now needs to be
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
from pg_settings
where name = 'shared_buffers')
which seems like a much weaker check.
IMHO it'd be better to have a new view for this info, something like
pg_buffercache_pages, or something like that.
But I'm also starting to question if the patch really is that useful.
Sure, people may not have NUMA support enabled (e.g. on non-linux
platforms), and even if they do the _numa view is quite expensive.
But I don't recall ever asking how the buffers map to memory pages. At
least not before/without the NUMA stuff.
regards
--
Tomas Vondra
Hi,
On Tue, Jul 01, 2025 at 04:31:01PM +0200, Tomas Vondra wrote:
On 7/1/25 15:45, Bertrand Drouvot wrote:
I took a quick look on this,
Thanks for looking at it!
and I doubt we want to change the schema of
pg_buffercache like this. Adding columns is fine, but it seems rather
wrong to change the cardinality. The view is meant to be 1:1 mapping for
buffers, but now suddenly it's 1:1 with memory pages. Or rather (buffer,
page), to be precise.I think this will break a lot of monitoring queries, and possibly in a
very subtle way - especially on systems with huge pages, where most
buffers will have one row, but then a buffer that happens to be split on
two pages will have two rows. That seems not great.IMHO it'd be better to have a new view for this info, something like
pg_buffercache_pages, or something like that.
That's a good point, fully agree!
But I'm also starting to question if the patch really is that useful.
Sure, people may not have NUMA support enabled (e.g. on non-linux
platforms), and even if they do the _numa view is quite expensive.
Yeah, it's not for day to day activities, more for configuration testing and
also for development activity/testing.
For example, If I set BLCKSZ to 8KB and enable huge pages (2MB), then I may
expect to see buffers not spread across pages.
But what I can see is:
SELECT
pages_per_buffer,
COUNT(*) as buffer_count
FROM (
SELECT bufferid, COUNT(*) as pages_per_buffer
FROM pg_buffercache
GROUP BY bufferid
) subq
GROUP BY pages_per_buffer
ORDER BY pages_per_buffer;
pages_per_buffer | buffer_count
------------------+--------------
1 | 261120
2 | 1024
This is due to the shared buffers being aligned to PG_IO_ALIGN_SIZE.
If I change it to:
BufferManagerShmemInit(void)
/* Align buffer pool on IO page size boundary. */
BufferBlocks = (char *)
- TYPEALIGN(PG_IO_ALIGN_SIZE,
+ TYPEALIGN(2 * 1024 * 1024,
ShmemInitStruct("Buffer Blocks",
- NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+ NBuffers * (Size) BLCKSZ + (2 * 1024 * 1024),
&foundBufs));
Then I get:
pages_per_buffer | buffer_count
------------------+--------------
1 | 262144
(1 row)
So we've been able to see that some buffers were spread across pages due to
shared buffer alignment on PG_IO_ALIGN_SIZE. And that if we change the alignment
to be set to 2MB then I don't see any buffers spread across pages anymore.
I think that it helps "visualize" some configuration or code changes.
What are your thoughts?
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On 7/1/25 18:34, Bertrand Drouvot wrote:
Hi,
On Tue, Jul 01, 2025 at 04:31:01PM +0200, Tomas Vondra wrote:
On 7/1/25 15:45, Bertrand Drouvot wrote:
I took a quick look on this,
Thanks for looking at it!
and I doubt we want to change the schema of
pg_buffercache like this. Adding columns is fine, but it seems rather
wrong to change the cardinality. The view is meant to be 1:1 mapping for
buffers, but now suddenly it's 1:1 with memory pages. Or rather (buffer,
page), to be precise.I think this will break a lot of monitoring queries, and possibly in a
very subtle way - especially on systems with huge pages, where most
buffers will have one row, but then a buffer that happens to be split on
two pages will have two rows. That seems not great.IMHO it'd be better to have a new view for this info, something like
pg_buffercache_pages, or something like that.That's a good point, fully agree!
But I'm also starting to question if the patch really is that useful.
Sure, people may not have NUMA support enabled (e.g. on non-linux
platforms), and even if they do the _numa view is quite expensive.Yeah, it's not for day to day activities, more for configuration testing and
also for development activity/testing.For example, If I set BLCKSZ to 8KB and enable huge pages (2MB), then I may
expect to see buffers not spread across pages.But what I can see is:
SELECT
pages_per_buffer,
COUNT(*) as buffer_count
FROM (
SELECT bufferid, COUNT(*) as pages_per_buffer
FROM pg_buffercache
GROUP BY bufferid
) subq
GROUP BY pages_per_buffer
ORDER BY pages_per_buffer;pages_per_buffer | buffer_count
------------------+--------------
1 | 261120
2 | 1024This is due to the shared buffers being aligned to PG_IO_ALIGN_SIZE.
If I change it to:
BufferManagerShmemInit(void)
/* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) - TYPEALIGN(PG_IO_ALIGN_SIZE, + TYPEALIGN(2 * 1024 * 1024, ShmemInitStruct("Buffer Blocks", - NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + NBuffers * (Size) BLCKSZ + (2 * 1024 * 1024), &foundBufs));Then I get:
pages_per_buffer | buffer_count
------------------+--------------
1 | 262144
(1 row)So we've been able to see that some buffers were spread across pages due to
shared buffer alignment on PG_IO_ALIGN_SIZE. And that if we change the alignment
to be set to 2MB then I don't see any buffers spread across pages anymore.I think that it helps "visualize" some configuration or code changes.
What are your thoughts?
But isn't the _numa view good enough for this? Sure, you need NUMA
support for it, and it may take a fair amount of time, but how often you
need to do such queries? I don't plan to block improving this use case,
but I'm not sure it's worth the effort.
cheers
--
Tomas Vondra
Hi,
On Tue, Jul 01, 2025 at 06:45:37PM +0200, Tomas Vondra wrote:
On 7/1/25 18:34, Bertrand Drouvot wrote:
But isn't the _numa view good enough for this? Sure, you need NUMA
support for it, and it may take a fair amount of time, but how often you
need to do such queries?
Not that often, but my reasoning was more like:
why people managing engines and/or developing on platform that does not support
libnuma would not deserve access to this information?
I don't plan to block improving this use case,
No worries at all, I do appreciate that you're looking at it and provide feedback
whatever the outcome would be.
but I'm not sure it's worth the effort.
I think that the hard work has already been done while creating
pg_buffercache_numa_pages().
Now it's just a matter of extracting the necessary pieces from pg_buffercache_numa_pages()
so that:
* the new view could make use of it
* the maintenance burden should be low (thanks to code dedeuplication)
* people that don't have access to a platform that supports libnuma can have
access to this information
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On 7/1/25 19:20, Bertrand Drouvot wrote:
Hi,
On Tue, Jul 01, 2025 at 06:45:37PM +0200, Tomas Vondra wrote:
On 7/1/25 18:34, Bertrand Drouvot wrote:
But isn't the _numa view good enough for this? Sure, you need NUMA
support for it, and it may take a fair amount of time, but how often you
need to do such queries?Not that often, but my reasoning was more like:
why people managing engines and/or developing on platform that does not support
libnuma would not deserve access to this information?
True. I always forget we only support libnuma on linux for now.
I don't plan to block improving this use case,
No worries at all, I do appreciate that you're looking at it and provide feedback
whatever the outcome would be.but I'm not sure it's worth the effort.
I think that the hard work has already been done while creating
pg_buffercache_numa_pages().Now it's just a matter of extracting the necessary pieces from pg_buffercache_numa_pages()
so that:* the new view could make use of it
* the maintenance burden should be low (thanks to code dedeuplication)
* people that don't have access to a platform that supports libnuma can have
access to this information
+1
regards
--
Tomas Vondra
Hi,
On Tue, Jul 01, 2025 at 07:46:30PM +0200, Tomas Vondra wrote:
On 7/1/25 19:20, Bertrand Drouvot wrote:
Now it's just a matter of extracting the necessary pieces from pg_buffercache_numa_pages()
so that:* the new view could make use of it
* the maintenance burden should be low (thanks to code dedeuplication)
* people that don't have access to a platform that supports libnuma can have
access to this information+1
PFA attached v3 adding a new view and function to pg_buffercache (both named
pg_buffercache_os_pages). I think the names are fine but not 100% convinced
though.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v3-0001-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From 04d437d7a774d92cdfd1dac2e14d4afc5adafcf6 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 11:38:37 +0000
Subject: [PATCH v3 1/2] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages().
Currently, this is used by pg_buffercache_numa_pages() only but will be
used by a new function in a following commit.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 50 ++++++++++++++-----
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index ae0291e6e96..8ef13d74186 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -105,6 +111,33 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -318,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -426,8 +458,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -473,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr, buf_state);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
v3-0002-Add-pg_buffercache_os_pages-function-and-view.patchtext/x-diff; charset=us-asciiDownload
From 4f3d831c9dd9804123a14567dcaac36dec9ec10f Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Wed, 2 Jul 2025 04:33:03 +0000
Subject: [PATCH v3 2/2] Add pg_buffercache_os_pages function and view
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding those new function and view so that one does not need NUMA support
enabled to get this information.
---
contrib/pg_buffercache/Makefile | 4 +-
.../expected/pg_buffercache_os_pages.out | 25 +++
contrib/pg_buffercache/meson.build | 2 +
.../pg_buffercache--1.6--1.7.sql | 21 +++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 177 ++++++++++++++++++
.../sql/pg_buffercache_os_pages.sql | 16 ++
doc/src/sgml/pgbuffercache.sgml | 112 ++++++++++-
src/tools/pgindent/typedefs.list | 2 +
9 files changed, 357 insertions(+), 4 deletions(-)
create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..a452b28e6d9 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,10 +9,10 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
-REGRESS = pg_buffercache pg_buffercache_numa
+REGRESS = pg_buffercache pg_buffercache_numa pg_buffercache_os_pages
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
new file mode 100644
index 00000000000..2d3d3185885
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
@@ -0,0 +1,25 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ERROR: permission denied for view pg_buffercache_os_pages
+RESET role;
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
+RESET role;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..e2acd10c266 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
@@ -36,6 +37,7 @@ tests += {
'sql': [
'pg_buffercache',
'pg_buffercache_numa',
+ 'pg_buffercache_os_pages',
],
},
}
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..28caf24688f
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,21 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function.
+CREATE FUNCTION pg_buffercache_os_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+ SELECT P.* FROM pg_buffercache_os_pages() AS P
+ (bufferid integer, os_page_num bigint);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..df7287e1283 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -27,6 +27,7 @@
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 2
/*
* Get the maximum buffer cache entries needed.
@@ -94,12 +95,30 @@ typedef struct
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+ uint32 bufferid;
+ int64 page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+ TupleDesc tupdesc;
+ BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
/*
* Function returning data from the shared buffer cache - buffer number,
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -139,6 +158,164 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
*endptr_buff = end_ptr;
}
+/*
+ * Inquire about OS pages mappings for shared buffers.
+ *
+ * Returns each OS memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Datum result;
+ MemoryContext oldcontext;
+ BufferCacheOsPagesContext *fctx; /* User function context. */
+ TupleDesc tupledesc;
+ TupleDesc expected_tupledesc;
+ HeapTuple tuple;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ int i,
+ idx;
+ Size os_page_size;
+ char *startptr;
+ int max_entries;
+
+ /*
+ * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+ * while the OS may have different memory page sizes.
+ *
+ * To correctly map between them, we need to: 1. Determine the OS
+ * memory page size 2. Calculate how many OS pages are used by all
+ * buffer blocks 3. Calculate how many OS pages are contained within
+ * each database block.
+ */
+ os_page_size = pg_get_shmem_pagesize();
+
+ /* Initialize the multi-call context, load entries about buffers */
+ funcctx = SRF_FIRSTCALL_INIT();
+
+ /* Switch context when allocating stuff to be used in later calls */
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ /* Create a user function context for cross-call persistence */
+ fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+ elog(ERROR, "incorrect number of output arguments");
+
+ /* Construct a tuple descriptor for the result rows. */
+ tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+ TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+ INT4OID, -1, 0);
+
+ TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+ INT8OID, -1, 0);
+
+ fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+ /*
+ * Each buffer needs at least one entry, but it might be offset in
+ * some way, and use one extra entry. So we allocate space for the
+ * maximum number of entries we might need, and then count the exact
+ * number as we're walking buffers. That way we can do it in one pass,
+ * without reallocating memory.
+ */
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
+ /* Allocate NBuffers worth of BufferCacheOsPagesRec records. */
+ fctx->record = (BufferCacheOsPagesRec *)
+ MemoryContextAllocHuge(CurrentMemoryContext,
+ sizeof(BufferCacheOsPagesRec) * max_entries);
+
+ /* Return to original context when allocating transient memory */
+ MemoryContextSwitchTo(oldcontext);
+
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+ idx = 0;
+
+ /*
+ * Scan through all the buffers, saving the relevant fields in the
+ * fctx->record structure.
+ *
+ * We don't hold the partition locks, so we don't get a consistent
+ * snapshot across all buffers, but we do grab the buffer header
+ * locks, so the information of each buffer is self-consistent.
+ */
+ for (i = 0; i < NBuffers; i++)
+ {
+ char *buffptr = (char *) BufferGetBlock(i + 1);
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+ uint32 bufferid;
+ int32 page_num;
+ char *startptr_buff,
+ *endptr_buff;
+
+ bufHdr = GetBufferDescriptor(i);
+ /* Lock each buffer header before inspecting. */
+ buf_state = LockBufHdr(bufHdr);
+ bufferid = BufferDescriptorGetBuffer(bufHdr);
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
+
+ /* Add an entry for each OS page overlapping with this buffer. */
+ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+ {
+ fctx->record[idx].bufferid = bufferid;
+ fctx->record[idx].page_num = page_num;
+ /* advance to the next entry/page */
+ ++idx;
+ ++page_num;
+ }
+ }
+
+ Assert(idx <= max_entries);
+
+ /* Set max calls and remember the user function context. */
+ funcctx->max_calls = idx;
+ funcctx->user_fctx = fctx;
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ /* Get the saved state */
+ fctx = funcctx->user_fctx;
+
+ if (funcctx->call_cntr < funcctx->max_calls)
+ {
+ uint32 i = funcctx->call_cntr;
+ Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+ bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+ values[0] = Int32GetDatum(fctx->record[i].bufferid);
+ nulls[0] = false;
+ values[1] = Int64GetDatum(fctx->record[i].page_num);
+ nulls[1] = false;
+
+ /* Build and return the tuple. */
+ tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
new file mode 100644
index 00000000000..618b96e51b8
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
@@ -0,0 +1,16 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 537d6014942..b9f2bc3dd87 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -37,7 +37,9 @@
<para>
This module provides the <function>pg_buffercache_pages()</function>
- function (wrapped in the <structname>pg_buffercache</structname> view),
+ function (wrapped in the <structname>pg_buffercache</structname> view), the
+ <function>pg_buffercache_os_pages()</function> function (wrapped in the
+ <structname>pg_buffercache_os_pages</structname> view), the
<function>pg_buffercache_numa_pages()</function> function (wrapped in the
<structname>pg_buffercache_numa</structname> view), the
<function>pg_buffercache_summary()</function> function, the
@@ -54,6 +56,13 @@
convenient use.
</para>
+ <para>
+ The <function>pg_buffercache_os_pages()</function> function provides OS
+ pages mappings for shared buffer entries. The
+ <structname>pg_buffercache_os_pages</structname> view wraps the function for
+ convenient use.
+ </para>
+
<para>
The <function>pg_buffercache_numa_pages()</function> provides
<acronym>NUMA</acronym> node mappings for shared buffer entries. This
@@ -234,6 +243,52 @@
</para>
</sect2>
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+ <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+ <para>
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+ </para>
+
+ <table id="pgbuffercache-os-pages-columns">
+ <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>bufferid</structfield> <type>integer</type>
+ </para>
+ <para>
+ ID, in the range 1..<varname>shared_buffers</varname>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ number of OS memory page for this buffer
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect2>
+
<sect2 id="pgbuffercache-pg-buffercache-numa">
<title>The <structname>pg_buffercache_numa</structname> View</title>
@@ -550,6 +605,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+ FROM (
+ SELECT bufferid, COUNT(*) as pages_per_buffer
+ FROM pg_buffercache_os_pages
+ GROUP BY bufferid
+ )
+ GROUP BY pages_per_buffer
+ ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+ 1 | 261120
+ 2 | 1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public | delete_test_table | 3
+ public | gin_test_idx | 2
+ pg_catalog | pg_depend | 2
+ public | quad_poly_tbl | 2
+ pg_catalog | pg_depend_reference_index | 1
+ pg_catalog | pg_index_indexrelid_index | 1
+ pg_catalog | pg_constraint_contypid_index | 1
+ pg_catalog | pg_statistic | 1
+ pg_catalog | pg_depend_depender_index | 1
+ pg_catalog | pg_operator | 1
+(10 rows)
+
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 220e5a4f6b3..44c937fd08d 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -341,6 +341,8 @@ BufferAccessStrategy
BufferAccessStrategyType
BufferCacheNumaContext
BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
BufferCachePagesContext
BufferCachePagesRec
BufferDesc
--
2.34.1
Hi,
On Wed, Jul 02, 2025 at 06:39:25AM +0000, Bertrand Drouvot wrote:
Hi,
On Tue, Jul 01, 2025 at 07:46:30PM +0200, Tomas Vondra wrote:
On 7/1/25 19:20, Bertrand Drouvot wrote:
Now it's just a matter of extracting the necessary pieces from pg_buffercache_numa_pages()
so that:* the new view could make use of it
* the maintenance burden should be low (thanks to code dedeuplication)
* people that don't have access to a platform that supports libnuma can have
access to this information+1
PFA attached v3 adding a new view and function to pg_buffercache (both named
pg_buffercache_os_pages). I think the names are fine but not 100% convinced
though.
Rebased due to 8eede2c7200.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v4-0001-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From a733e520a4c7f47f9e2921044e936ac18ec457bf Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 11:38:37 +0000
Subject: [PATCH v4 1/2] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages().
Currently, this is used by pg_buffercache_numa_pages() only but will be
used by a new function in a following commit.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 50 ++++++++++++++-----
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index ae0291e6e96..8ef13d74186 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -105,6 +111,33 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -318,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -426,8 +458,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -473,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr, buf_state);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
v4-0002-Add-pg_buffercache_os_pages-function-and-view.patchtext/x-diff; charset=us-asciiDownload
From c762845c00049b418a326cc35c3fe4b51016333d Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Wed, 2 Jul 2025 04:33:03 +0000
Subject: [PATCH v4 2/2] Add pg_buffercache_os_pages function and view
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding those new function and view so that one does not need NUMA support
enabled to get this information.
---
contrib/pg_buffercache/Makefile | 4 +-
.../expected/pg_buffercache_os_pages.out | 25 +++
contrib/pg_buffercache/meson.build | 2 +
.../pg_buffercache--1.6--1.7.sql | 21 +++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 177 ++++++++++++++++++
.../sql/pg_buffercache_os_pages.sql | 16 ++
doc/src/sgml/pgbuffercache.sgml | 110 +++++++++++
src/tools/pgindent/typedefs.list | 2 +
9 files changed, 356 insertions(+), 3 deletions(-)
create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..a452b28e6d9 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,10 +9,10 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
-REGRESS = pg_buffercache pg_buffercache_numa
+REGRESS = pg_buffercache pg_buffercache_numa pg_buffercache_os_pages
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
new file mode 100644
index 00000000000..2d3d3185885
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
@@ -0,0 +1,25 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ERROR: permission denied for view pg_buffercache_os_pages
+RESET role;
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
+RESET role;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..e2acd10c266 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
@@ -36,6 +37,7 @@ tests += {
'sql': [
'pg_buffercache',
'pg_buffercache_numa',
+ 'pg_buffercache_os_pages',
],
},
}
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..28caf24688f
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,21 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function.
+CREATE FUNCTION pg_buffercache_os_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+ SELECT P.* FROM pg_buffercache_os_pages() AS P
+ (bufferid integer, os_page_num bigint);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..df7287e1283 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -27,6 +27,7 @@
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 2
/*
* Get the maximum buffer cache entries needed.
@@ -94,12 +95,30 @@ typedef struct
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+ uint32 bufferid;
+ int64 page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+ TupleDesc tupdesc;
+ BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
/*
* Function returning data from the shared buffer cache - buffer number,
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -139,6 +158,164 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
*endptr_buff = end_ptr;
}
+/*
+ * Inquire about OS pages mappings for shared buffers.
+ *
+ * Returns each OS memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Datum result;
+ MemoryContext oldcontext;
+ BufferCacheOsPagesContext *fctx; /* User function context. */
+ TupleDesc tupledesc;
+ TupleDesc expected_tupledesc;
+ HeapTuple tuple;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ int i,
+ idx;
+ Size os_page_size;
+ char *startptr;
+ int max_entries;
+
+ /*
+ * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+ * while the OS may have different memory page sizes.
+ *
+ * To correctly map between them, we need to: 1. Determine the OS
+ * memory page size 2. Calculate how many OS pages are used by all
+ * buffer blocks 3. Calculate how many OS pages are contained within
+ * each database block.
+ */
+ os_page_size = pg_get_shmem_pagesize();
+
+ /* Initialize the multi-call context, load entries about buffers */
+ funcctx = SRF_FIRSTCALL_INIT();
+
+ /* Switch context when allocating stuff to be used in later calls */
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ /* Create a user function context for cross-call persistence */
+ fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+ elog(ERROR, "incorrect number of output arguments");
+
+ /* Construct a tuple descriptor for the result rows. */
+ tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+ TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+ INT4OID, -1, 0);
+
+ TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+ INT8OID, -1, 0);
+
+ fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+ /*
+ * Each buffer needs at least one entry, but it might be offset in
+ * some way, and use one extra entry. So we allocate space for the
+ * maximum number of entries we might need, and then count the exact
+ * number as we're walking buffers. That way we can do it in one pass,
+ * without reallocating memory.
+ */
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
+ /* Allocate NBuffers worth of BufferCacheOsPagesRec records. */
+ fctx->record = (BufferCacheOsPagesRec *)
+ MemoryContextAllocHuge(CurrentMemoryContext,
+ sizeof(BufferCacheOsPagesRec) * max_entries);
+
+ /* Return to original context when allocating transient memory */
+ MemoryContextSwitchTo(oldcontext);
+
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+ idx = 0;
+
+ /*
+ * Scan through all the buffers, saving the relevant fields in the
+ * fctx->record structure.
+ *
+ * We don't hold the partition locks, so we don't get a consistent
+ * snapshot across all buffers, but we do grab the buffer header
+ * locks, so the information of each buffer is self-consistent.
+ */
+ for (i = 0; i < NBuffers; i++)
+ {
+ char *buffptr = (char *) BufferGetBlock(i + 1);
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+ uint32 bufferid;
+ int32 page_num;
+ char *startptr_buff,
+ *endptr_buff;
+
+ bufHdr = GetBufferDescriptor(i);
+ /* Lock each buffer header before inspecting. */
+ buf_state = LockBufHdr(bufHdr);
+ bufferid = BufferDescriptorGetBuffer(bufHdr);
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
+
+ /* Add an entry for each OS page overlapping with this buffer. */
+ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+ {
+ fctx->record[idx].bufferid = bufferid;
+ fctx->record[idx].page_num = page_num;
+ /* advance to the next entry/page */
+ ++idx;
+ ++page_num;
+ }
+ }
+
+ Assert(idx <= max_entries);
+
+ /* Set max calls and remember the user function context. */
+ funcctx->max_calls = idx;
+ funcctx->user_fctx = fctx;
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ /* Get the saved state */
+ fctx = funcctx->user_fctx;
+
+ if (funcctx->call_cntr < funcctx->max_calls)
+ {
+ uint32 i = funcctx->call_cntr;
+ Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+ bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+ values[0] = Int32GetDatum(fctx->record[i].bufferid);
+ nulls[0] = false;
+ values[1] = Int64GetDatum(fctx->record[i].page_num);
+ nulls[1] = false;
+
+ /* Build and return the tuple. */
+ tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
new file mode 100644
index 00000000000..618b96e51b8
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
@@ -0,0 +1,16 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 546ace8369e..d9a729f0c09 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -38,6 +38,8 @@
<para>
This module provides the <function>pg_buffercache_pages()</function>
function (wrapped in the <structname>pg_buffercache</structname> view), the
+ <function>pg_buffercache_os_pages()</function> function (wrapped in the
+ <structname>pg_buffercache_os_pages</structname> view), the
<function>pg_buffercache_numa_pages()</function> function (wrapped in the
<structname>pg_buffercache_numa</structname> view), the
<function>pg_buffercache_summary()</function> function, the
@@ -54,6 +56,13 @@
convenient use.
</para>
+ <para>
+ The <function>pg_buffercache_os_pages()</function> function provides OS
+ pages mappings for shared buffer entries. The
+ <structname>pg_buffercache_os_pages</structname> view wraps the function for
+ convenient use.
+ </para>
+
<para>
The <function>pg_buffercache_numa_pages()</function> function provides
<acronym>NUMA</acronym> node mappings for shared buffer entries. This
@@ -234,6 +243,52 @@
</para>
</sect2>
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+ <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+ <para>
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+ </para>
+
+ <table id="pgbuffercache-os-pages-columns">
+ <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>bufferid</structfield> <type>integer</type>
+ </para>
+ <para>
+ ID, in the range 1..<varname>shared_buffers</varname>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ number of OS memory page for this buffer
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect2>
+
<sect2 id="pgbuffercache-pg-buffercache-numa">
<title>The <structname>pg_buffercache_numa</structname> View</title>
@@ -550,6 +605,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+ FROM (
+ SELECT bufferid, COUNT(*) as pages_per_buffer
+ FROM pg_buffercache_os_pages
+ GROUP BY bufferid
+ )
+ GROUP BY pages_per_buffer
+ ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+ 1 | 261120
+ 2 | 1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public | delete_test_table | 3
+ public | gin_test_idx | 2
+ pg_catalog | pg_depend | 2
+ public | quad_poly_tbl | 2
+ pg_catalog | pg_depend_reference_index | 1
+ pg_catalog | pg_index_indexrelid_index | 1
+ pg_catalog | pg_constraint_contypid_index | 1
+ pg_catalog | pg_statistic | 1
+ pg_catalog | pg_depend_depender_index | 1
+ pg_catalog | pg_operator | 1
+(10 rows)
+
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 220e5a4f6b3..44c937fd08d 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -341,6 +341,8 @@ BufferAccessStrategy
BufferAccessStrategyType
BufferCacheNumaContext
BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
BufferCachePagesContext
BufferCachePagesRec
BufferDesc
--
2.34.1
The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: tested, passed
Documentation: tested, passed
Hi Bertrand,
Just tried out your patch, nice work, thought to leave a review as well.
Patch applied successfully on top of commit a27893df4 in master.
Ran the tests in pg_buffercache and they pass including the new ones.
Running "pagesize" on my laptop returns 16384.
test=# SELECT current_setting('block_size');
current_setting
-----------------
8192
(1 row)
Given the above, the results are as expected:
test=# select * from pg_buffercache_os_pages;
bufferid | os_page_num
----------+-------------
1 | 0
2 | 0
3 | 1
4 | 1
5 | 2
6 | 2
I have noticed that pg_buffercache_os_pages would be the 3rd function
which follows the same high-level structure (others being pg_buffercache_pages
and pg_buffercache_numa_pages). I am wondering if this would be let's say
"strike three" - time to consider extracting out a high-level "skeleton" function,
with a couple of slots which would then be provided by the 3 variants.
Kind regards,
Mircea
The new status of this patch is: Waiting on Author
Hi,
On Tue, Jul 08, 2025 at 02:47:34PM +0000, Mircea Cadariu wrote:
The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: tested, passed
Documentation: tested, passedHi Bertrand,
Just tried out your patch, nice work, thought to leave a review as well.
Thanks for looking at it!
Patch applied successfully on top of commit a27893df4 in master.
Ran the tests in pg_buffercache and they pass including the new ones.Running "pagesize" on my laptop returns 16384.
test=# SELECT current_setting('block_size');
current_setting
-----------------
8192
(1 row)Given the above, the results are as expected:
test=# select * from pg_buffercache_os_pages;
bufferid | os_page_num
----------+-------------
1 | 0
2 | 0
3 | 1
4 | 1
5 | 2
6 | 2
Cool.
I have noticed that pg_buffercache_os_pages would be the 3rd function
which follows the same high-level structure (others being pg_buffercache_pages
and pg_buffercache_numa_pages). I am wondering if this would be let's say
"strike three" - time to consider extracting out a high-level "skeleton" function,
with a couple of slots which would then be provided by the 3 variants.
Yeah, I tried to avoid code duplication for the "os pages" related stuff in
v1. I can check if more can be done (outside of the "os pages" related stuff).
Might be done in a dedicated patch though, I mean I don't think that should be
a blocker for this one.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Hi,
Thanks for the prompt reply!
On 09/07/2025 08:37, Bertrand Drouvot wrote:
Yeah, I tried to avoid code duplication for the "os pages" related stuff in
v1. I can check if more can be done (outside of the "os pages" related stuff).Might be done in a dedicated patch though, I mean I don't think that should be
a blocker for this one.
Agreed, if there's any low-hanging fruit to address now that this file
is cracked open, then great. Otherwise, makes sense to leave it for a
separate dedicated patch.
If you don't mind I have some further questions on the patch, see below.
+ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type");
Is this needed in the new pg_buffercache_os_pages function? I noticed
this check also in the "original" pg_buffercache_pages. There's a
comment there indicating that (if I understand correctly) its purpose is
to handle upgrades from version 1.0, mentioning a field unrelated to
this patch.
If it's needed, shall we consider adding a similar comment as
in pg_buffercache_pages?
+ /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, + * while the OS may have different memory page sizes. + * + * To correctly map between them, we need to: 1. Determine the OS + * memory page size 2. Calculate how many OS pages are used by all + * buffer blocks 3. Calculate how many OS pages are contained within + * each database block. + */
For step number 3 - should it be the other way around: database blocks
are contained within OS pages?
Kind regards,
Mircea Cadariu
Hi,
On Wed, Jul 09, 2025 at 10:51:16AM +0100, Mircea Cadariu wrote:
If you don't mind I have some further questions on the patch, see below.
Thanks for the feedback/questions!
+ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type");Is this needed in the new pg_buffercache_os_pages function?
Strictly speaking it is not, we could CreateTemplateTupleDesc(NUM_BUFFERCACHE_OS_PAGES_ELEM)
instead of CreateTemplateTupleDesc(expected_tupledesc->natts). OTOH, it's used
in multiple places in this extension so I think it's ok to keep it that way
for consistency.
I noticed this
check also in the "original" pg_buffercache_pages. There's a comment there
indicating that (if I understand correctly) its purpose is to handle
upgrades from version 1.0, mentioning a field unrelated to this patch.If it's needed, shall we consider adding a similar comment as
in�pg_buffercache_pages?
We don't need the same kind of comment in pg_buffercache_os_pages() because
it's new in 1.7 (so the patch can not "break" a pre-1.7 version of this function
/view).
In the pg_buffercache_pages case, the story is different, it's used to deal
with the pinning_backends fields that has been introduced in 1.1 (see
pg_buffercache--1.0--1.1.sql).
+ /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, + * while the OS may have different memory page sizes. + * + * To correctly map between them, we need to: 1. Determine the OS + * memory page size 2. Calculate how many OS pages are used by all + * buffer blocks 3. Calculate how many OS pages are contained within + * each database block. + */For step number 3 - should it be the other way around: database blocks are
contained within OS pages?
This comment comes from pg_get_shmem_allocations_numa() and I agree that it
could be misleading: it all depends what the OS and block sizes actually are:
fixed in v5 attached where the wording is almost the same as in
pg_buffercache_numa_pages().
Also I think that it is not correct in pg_get_shmem_allocations_numa(), I think
that it should be something like proposed in [1]/messages/by-id/aH4DDhdiG9Gi0rG7@ip-10-97-1-34.eu-west-3.compute.internal.
[1]: /messages/by-id/aH4DDhdiG9Gi0rG7@ip-10-97-1-34.eu-west-3.compute.internal
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v5-0001-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From 86e01ffff5bca0565fca3e2adf87d8cd9dd7e218 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 11:38:37 +0000
Subject: [PATCH v5 1/2] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages().
Currently, this is used by pg_buffercache_numa_pages() only but will be
used by a new function in a following commit.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 50 ++++++++++++++-----
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index ae0291e6e96..8ef13d74186 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -105,6 +111,33 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -318,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -426,8 +458,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -473,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr, buf_state);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
v5-0002-Add-pg_buffercache_os_pages-function-and-view.patchtext/x-diff; charset=us-asciiDownload
From 2caf769b2202335602e7d2a03b2409787e392e77 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Wed, 2 Jul 2025 04:33:03 +0000
Subject: [PATCH v5 2/2] Add pg_buffercache_os_pages function and view
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding those new function and view so that one does not need NUMA support
enabled to get this information.
---
contrib/pg_buffercache/Makefile | 4 +-
.../expected/pg_buffercache_os_pages.out | 25 +++
contrib/pg_buffercache/meson.build | 2 +
.../pg_buffercache--1.6--1.7.sql | 21 ++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 181 ++++++++++++++++++
.../sql/pg_buffercache_os_pages.sql | 16 ++
doc/src/sgml/pgbuffercache.sgml | 110 +++++++++++
src/tools/pgindent/typedefs.list | 2 +
9 files changed, 360 insertions(+), 3 deletions(-)
create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..a452b28e6d9 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,10 +9,10 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
-REGRESS = pg_buffercache pg_buffercache_numa
+REGRESS = pg_buffercache pg_buffercache_numa pg_buffercache_os_pages
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
new file mode 100644
index 00000000000..2d3d3185885
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_os_pages.out
@@ -0,0 +1,25 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ERROR: permission denied for view pg_buffercache_os_pages
+RESET role;
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
+RESET role;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..e2acd10c266 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
@@ -36,6 +37,7 @@ tests += {
'sql': [
'pg_buffercache',
'pg_buffercache_numa',
+ 'pg_buffercache_os_pages',
],
},
}
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..28caf24688f
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,21 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function.
+CREATE FUNCTION pg_buffercache_os_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+ SELECT P.* FROM pg_buffercache_os_pages() AS P
+ (bufferid integer, os_page_num bigint);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..946b6ff4077 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -27,6 +27,7 @@
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 2
/*
* Get the maximum buffer cache entries needed.
@@ -94,12 +95,30 @@ typedef struct
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+ uint32 bufferid;
+ int64 page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+ TupleDesc tupdesc;
+ BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
/*
* Function returning data from the shared buffer cache - buffer number,
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -139,6 +158,168 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
*endptr_buff = end_ptr;
}
+/*
+ * Inquire about OS pages mappings for shared buffers.
+ *
+ * Returns each OS memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Datum result;
+ MemoryContext oldcontext;
+ BufferCacheOsPagesContext *fctx; /* User function context. */
+ TupleDesc tupledesc;
+ TupleDesc expected_tupledesc;
+ HeapTuple tuple;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ int i,
+ idx;
+ Size os_page_size;
+ char *startptr;
+ int max_entries;
+
+ /*
+ * The database block size and OS memory page size are unlikely to be
+ * the same. The block size is 1-32KB, the memory page size depends on
+ * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
+ * there are also features like THP etc. Moreover, we don't quite know
+ * how the pages and buffers "align" in memory - the buffers may be
+ * shifted in some way, using more memory pages than necessary.
+ *
+ * So we need to be careful about mapping buffers to memory pages. We
+ * calculate the maximum number of pages a buffer might use, so that
+ * we allocate enough space for the entries. And then we count the
+ * actual number of entries as we scan the buffers.
+ */
+ os_page_size = pg_get_shmem_pagesize();
+
+ /* Initialize the multi-call context, load entries about buffers */
+ funcctx = SRF_FIRSTCALL_INIT();
+
+ /* Switch context when allocating stuff to be used in later calls */
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ /* Create a user function context for cross-call persistence */
+ fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+ elog(ERROR, "incorrect number of output arguments");
+
+ /* Construct a tuple descriptor for the result rows. */
+ tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+ TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+ INT4OID, -1, 0);
+
+ TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+ INT8OID, -1, 0);
+
+ fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+ /*
+ * Each buffer needs at least one entry, but it might be offset in
+ * some way, and use one extra entry. So we allocate space for the
+ * maximum number of entries we might need, and then count the exact
+ * number as we're walking buffers. That way we can do it in one pass,
+ * without reallocating memory.
+ */
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
+ /* Allocate NBuffers worth of BufferCacheOsPagesRec records. */
+ fctx->record = (BufferCacheOsPagesRec *)
+ MemoryContextAllocHuge(CurrentMemoryContext,
+ sizeof(BufferCacheOsPagesRec) * max_entries);
+
+ /* Return to original context when allocating transient memory */
+ MemoryContextSwitchTo(oldcontext);
+
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+ idx = 0;
+
+ /*
+ * Scan through all the buffers, saving the relevant fields in the
+ * fctx->record structure.
+ *
+ * We don't hold the partition locks, so we don't get a consistent
+ * snapshot across all buffers, but we do grab the buffer header
+ * locks, so the information of each buffer is self-consistent.
+ */
+ for (i = 0; i < NBuffers; i++)
+ {
+ char *buffptr = (char *) BufferGetBlock(i + 1);
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+ uint32 bufferid;
+ int32 page_num;
+ char *startptr_buff,
+ *endptr_buff;
+
+ bufHdr = GetBufferDescriptor(i);
+ /* Lock each buffer header before inspecting. */
+ buf_state = LockBufHdr(bufHdr);
+ bufferid = BufferDescriptorGetBuffer(bufHdr);
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
+
+ /* Add an entry for each OS page overlapping with this buffer. */
+ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+ {
+ fctx->record[idx].bufferid = bufferid;
+ fctx->record[idx].page_num = page_num;
+ /* advance to the next entry/page */
+ ++idx;
+ ++page_num;
+ }
+ }
+
+ Assert(idx <= max_entries);
+
+ /* Set max calls and remember the user function context. */
+ funcctx->max_calls = idx;
+ funcctx->user_fctx = fctx;
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ /* Get the saved state */
+ fctx = funcctx->user_fctx;
+
+ if (funcctx->call_cntr < funcctx->max_calls)
+ {
+ uint32 i = funcctx->call_cntr;
+ Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+ bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+ values[0] = Int32GetDatum(fctx->record[i].bufferid);
+ nulls[0] = false;
+ values[1] = Int64GetDatum(fctx->record[i].page_num);
+ nulls[1] = false;
+
+ /* Build and return the tuple. */
+ tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
new file mode 100644
index 00000000000..618b96e51b8
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_os_pages.sql
@@ -0,0 +1,16 @@
+-- We expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 546ace8369e..d9a729f0c09 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -38,6 +38,8 @@
<para>
This module provides the <function>pg_buffercache_pages()</function>
function (wrapped in the <structname>pg_buffercache</structname> view), the
+ <function>pg_buffercache_os_pages()</function> function (wrapped in the
+ <structname>pg_buffercache_os_pages</structname> view), the
<function>pg_buffercache_numa_pages()</function> function (wrapped in the
<structname>pg_buffercache_numa</structname> view), the
<function>pg_buffercache_summary()</function> function, the
@@ -54,6 +56,13 @@
convenient use.
</para>
+ <para>
+ The <function>pg_buffercache_os_pages()</function> function provides OS
+ pages mappings for shared buffer entries. The
+ <structname>pg_buffercache_os_pages</structname> view wraps the function for
+ convenient use.
+ </para>
+
<para>
The <function>pg_buffercache_numa_pages()</function> function provides
<acronym>NUMA</acronym> node mappings for shared buffer entries. This
@@ -234,6 +243,52 @@
</para>
</sect2>
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+ <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+ <para>
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+ </para>
+
+ <table id="pgbuffercache-os-pages-columns">
+ <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>bufferid</structfield> <type>integer</type>
+ </para>
+ <para>
+ ID, in the range 1..<varname>shared_buffers</varname>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ number of OS memory page for this buffer
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect2>
+
<sect2 id="pgbuffercache-pg-buffercache-numa">
<title>The <structname>pg_buffercache_numa</structname> View</title>
@@ -550,6 +605,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+ FROM (
+ SELECT bufferid, COUNT(*) as pages_per_buffer
+ FROM pg_buffercache_os_pages
+ GROUP BY bufferid
+ )
+ GROUP BY pages_per_buffer
+ ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+ 1 | 261120
+ 2 | 1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public | delete_test_table | 3
+ public | gin_test_idx | 2
+ pg_catalog | pg_depend | 2
+ public | quad_poly_tbl | 2
+ pg_catalog | pg_depend_reference_index | 1
+ pg_catalog | pg_index_indexrelid_index | 1
+ pg_catalog | pg_constraint_contypid_index | 1
+ pg_catalog | pg_statistic | 1
+ pg_catalog | pg_depend_depender_index | 1
+ pg_catalog | pg_operator | 1
+(10 rows)
+
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ff050e93a50..31ff664a03c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -339,6 +339,8 @@ BufferAccessStrategy
BufferAccessStrategyType
BufferCacheNumaContext
BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
BufferCachePagesContext
BufferCachePagesRec
BufferDesc
--
2.34.1
Hi,
Thanks for the update! I tried v5 and it returns the expected results on
my laptop, same as before.
Just two further remarks for your consideration.
+ <para> + number of OS memory page for this buffer + </para></entry>
Let's capitalize the first letter here.
+-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_os_pages; +RESET role; + +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_os_pages; +RESET role;
In the existing pg_buffercache.sql there are sections similar to the
above (SET ROLE pg_database_owner/pg_monitor ... RESET role), with a
couple of different SELECT statements within. Should we rather add the
above new SELECTs there, instead of in the new pg_buffercache_os_pages.sql?
Kind regards,
Mircea Cadariu
Hi,
On Thu, Jul 24, 2025 at 10:30:06PM +0800, Mircea Cadariu wrote:
I tried v5 and it returns the expected results on my
laptop, same as before.
Thanks for the review and testing.
Just two further remarks for your consideration.
+ <para> + number of OS memory page for this buffer + </para></entry>Let's capitalize the first letter here.
It's copy/pasted from pg_buffercache_numa, but I agree that both (the one
in pg_buffercache_numa and the new one) should be capitalized (for consistency
with the other views).
Done in the attached.
+-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_os_pages; +RESET role; + +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_os_pages; +RESET role;In the existing pg_buffercache.sql there are sections similar to the above
(SET ROLE pg_database_owner/pg_monitor ... RESET role), with a couple of
different SELECT statements within. Should we rather add the above new
SELECTs there, instead of in the new pg_buffercache_os_pages.sql?
Yeah, that probably makes more sense, done in the attached.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v6-0001-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From f54304ff1967748319d61145c160df8eaa362d24 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 11:38:37 +0000
Subject: [PATCH v6 1/2] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages().
Currently, this is used by pg_buffercache_numa_pages() only but will be
used by a new function in a following commit.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 50 ++++++++++++++-----
1 file changed, 37 insertions(+), 13 deletions(-)
100.0% contrib/pg_buffercache/
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index ae0291e6e96..8ef13d74186 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -105,6 +111,33 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -318,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -426,8 +458,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -473,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr, buf_state);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
v6-0002-Add-pg_buffercache_os_pages-function-and-view.patchtext/x-diff; charset=us-asciiDownload
From 3e79a2dfab3d578f3d0b16ea743e6a1bb501561a Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Wed, 2 Jul 2025 04:33:03 +0000
Subject: [PATCH v6 2/2] Add pg_buffercache_os_pages function and view
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding those new function and view so that one does not need NUMA support
enabled to get this information.
In passing, let's capitalyze "number" in the pg_buffercache_numa documentation
definition.
---
contrib/pg_buffercache/Makefile | 2 +-
.../expected/pg_buffercache.out | 18 ++
contrib/pg_buffercache/meson.build | 1 +
.../pg_buffercache--1.6--1.7.sql | 21 ++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 181 ++++++++++++++++++
contrib/pg_buffercache/sql/pg_buffercache.sql | 8 +
doc/src/sgml/pgbuffercache.sgml | 112 ++++++++++-
src/tools/pgindent/typedefs.list | 2 +
9 files changed, 344 insertions(+), 3 deletions(-)
3.7% contrib/pg_buffercache/expected/
58.3% contrib/pg_buffercache/
37.5% doc/src/sgml/
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..0e618f66aec 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,7 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..26c2d5f5710 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -8,6 +8,16 @@ from pg_buffercache;
t
(1 row)
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -28,6 +38,8 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
ERROR: permission denied for view pg_buffercache
+SELECT * FROM pg_buffercache_os_pages;
+ERROR: permission denied for view pg_buffercache_os_pages
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
ERROR: permission denied for function pg_buffercache_pages
SELECT * FROM pg_buffercache_summary();
@@ -43,6 +55,12 @@ SELECT count(*) > 0 FROM pg_buffercache;
t
(1 row)
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
?column?
----------
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..7c31141881f 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..28caf24688f
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,21 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function.
+CREATE FUNCTION pg_buffercache_os_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+ SELECT P.* FROM pg_buffercache_os_pages() AS P
+ (bufferid integer, os_page_num bigint);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 8ef13d74186..946b6ff4077 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -27,6 +27,7 @@
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 2
/*
* Get the maximum buffer cache entries needed.
@@ -94,12 +95,30 @@ typedef struct
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+ uint32 bufferid;
+ int64 page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+ TupleDesc tupdesc;
+ BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
/*
* Function returning data from the shared buffer cache - buffer number,
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -139,6 +158,168 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
*endptr_buff = end_ptr;
}
+/*
+ * Inquire about OS pages mappings for shared buffers.
+ *
+ * Returns each OS memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Datum result;
+ MemoryContext oldcontext;
+ BufferCacheOsPagesContext *fctx; /* User function context. */
+ TupleDesc tupledesc;
+ TupleDesc expected_tupledesc;
+ HeapTuple tuple;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ int i,
+ idx;
+ Size os_page_size;
+ char *startptr;
+ int max_entries;
+
+ /*
+ * The database block size and OS memory page size are unlikely to be
+ * the same. The block size is 1-32KB, the memory page size depends on
+ * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
+ * there are also features like THP etc. Moreover, we don't quite know
+ * how the pages and buffers "align" in memory - the buffers may be
+ * shifted in some way, using more memory pages than necessary.
+ *
+ * So we need to be careful about mapping buffers to memory pages. We
+ * calculate the maximum number of pages a buffer might use, so that
+ * we allocate enough space for the entries. And then we count the
+ * actual number of entries as we scan the buffers.
+ */
+ os_page_size = pg_get_shmem_pagesize();
+
+ /* Initialize the multi-call context, load entries about buffers */
+ funcctx = SRF_FIRSTCALL_INIT();
+
+ /* Switch context when allocating stuff to be used in later calls */
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ /* Create a user function context for cross-call persistence */
+ fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+ elog(ERROR, "incorrect number of output arguments");
+
+ /* Construct a tuple descriptor for the result rows. */
+ tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+ TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+ INT4OID, -1, 0);
+
+ TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+ INT8OID, -1, 0);
+
+ fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+ /*
+ * Each buffer needs at least one entry, but it might be offset in
+ * some way, and use one extra entry. So we allocate space for the
+ * maximum number of entries we might need, and then count the exact
+ * number as we're walking buffers. That way we can do it in one pass,
+ * without reallocating memory.
+ */
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
+ /* Allocate NBuffers worth of BufferCacheOsPagesRec records. */
+ fctx->record = (BufferCacheOsPagesRec *)
+ MemoryContextAllocHuge(CurrentMemoryContext,
+ sizeof(BufferCacheOsPagesRec) * max_entries);
+
+ /* Return to original context when allocating transient memory */
+ MemoryContextSwitchTo(oldcontext);
+
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+ idx = 0;
+
+ /*
+ * Scan through all the buffers, saving the relevant fields in the
+ * fctx->record structure.
+ *
+ * We don't hold the partition locks, so we don't get a consistent
+ * snapshot across all buffers, but we do grab the buffer header
+ * locks, so the information of each buffer is self-consistent.
+ */
+ for (i = 0; i < NBuffers; i++)
+ {
+ char *buffptr = (char *) BufferGetBlock(i + 1);
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+ uint32 bufferid;
+ int32 page_num;
+ char *startptr_buff,
+ *endptr_buff;
+
+ bufHdr = GetBufferDescriptor(i);
+ /* Lock each buffer header before inspecting. */
+ buf_state = LockBufHdr(bufHdr);
+ bufferid = BufferDescriptorGetBuffer(bufHdr);
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
+
+ /* Add an entry for each OS page overlapping with this buffer. */
+ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+ {
+ fctx->record[idx].bufferid = bufferid;
+ fctx->record[idx].page_num = page_num;
+ /* advance to the next entry/page */
+ ++idx;
+ ++page_num;
+ }
+ }
+
+ Assert(idx <= max_entries);
+
+ /* Set max calls and remember the user function context. */
+ funcctx->max_calls = idx;
+ funcctx->user_fctx = fctx;
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ /* Get the saved state */
+ fctx = funcctx->user_fctx;
+
+ if (funcctx->call_cntr < funcctx->max_calls)
+ {
+ uint32 i = funcctx->call_cntr;
+ Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+ bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+ values[0] = Int32GetDatum(fctx->record[i].bufferid);
+ nulls[0] = false;
+ values[1] = Int64GetDatum(fctx->record[i].page_num);
+ nulls[1] = false;
+
+ /* Build and return the tuple. */
+ tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..3c70ee9ef4a 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -5,6 +5,12 @@ select count(*) = (select setting::bigint
where name = 'shared_buffers')
from pg_buffercache;
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -16,6 +22,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
-- having to create a dedicated user, use the pg_database_owner pseudo-role.
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
+SELECT * FROM pg_buffercache_os_pages;
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
SELECT * FROM pg_buffercache_summary();
SELECT * FROM pg_buffercache_usage_counts();
@@ -24,6 +31,7 @@ RESET role;
-- Check that pg_monitor is allowed to query view / function
SET ROLE pg_monitor;
SELECT count(*) > 0 FROM pg_buffercache;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index eeb85a0e049..72f179286d6 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -46,6 +46,8 @@
<para>
This module provides the <function>pg_buffercache_pages()</function>
function (wrapped in the <structname>pg_buffercache</structname> view), the
+ <function>pg_buffercache_os_pages()</function> function (wrapped in the
+ <structname>pg_buffercache_os_pages</structname> view), the
<function>pg_buffercache_numa_pages()</function> function (wrapped in the
<structname>pg_buffercache_numa</structname> view), the
<function>pg_buffercache_summary()</function> function, the
@@ -62,6 +64,13 @@
convenient use.
</para>
+ <para>
+ The <function>pg_buffercache_os_pages()</function> function provides OS
+ pages mappings for shared buffer entries. The
+ <structname>pg_buffercache_os_pages</structname> view wraps the function for
+ convenient use.
+ </para>
+
<para>
The <function>pg_buffercache_numa_pages()</function> function provides
<acronym>NUMA</acronym> node mappings for shared buffer entries. This
@@ -242,6 +251,52 @@
</para>
</sect2>
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+ <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+ <para>
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+ </para>
+
+ <table id="pgbuffercache-os-pages-columns">
+ <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>bufferid</structfield> <type>integer</type>
+ </para>
+ <para>
+ ID, in the range 1..<varname>shared_buffers</varname>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ Number of OS memory page for this buffer
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect2>
+
<sect2 id="pgbuffercache-pg-buffercache-numa">
<title>The <structname>pg_buffercache_numa</structname> View</title>
@@ -278,7 +333,7 @@
<structfield>os_page_num</structfield> <type>bigint</type>
</para>
<para>
- number of OS memory page for this buffer
+ Number of OS memory page for this buffer
</para></entry>
</row>
@@ -558,6 +613,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+ FROM (
+ SELECT bufferid, COUNT(*) as pages_per_buffer
+ FROM pg_buffercache_os_pages
+ GROUP BY bufferid
+ )
+ GROUP BY pages_per_buffer
+ ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+ 1 | 261120
+ 2 | 1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public | delete_test_table | 3
+ public | gin_test_idx | 2
+ pg_catalog | pg_depend | 2
+ public | quad_poly_tbl | 2
+ pg_catalog | pg_depend_reference_index | 1
+ pg_catalog | pg_index_indexrelid_index | 1
+ pg_catalog | pg_constraint_contypid_index | 1
+ pg_catalog | pg_statistic | 1
+ pg_catalog | pg_depend_depender_index | 1
+ pg_catalog | pg_operator | 1
+(10 rows)
+
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 4353befab99..da444c156e6 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -339,6 +339,8 @@ BufferAccessStrategy
BufferAccessStrategyType
BufferCacheNumaContext
BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
BufferCachePagesContext
BufferCachePagesRec
BufferDesc
--
2.34.1
Hi,
Thanks! As the most recent changes are only docs and tests, I did not
try out v6 anymore, but just checked the CI result; all green.
I've set the status to Ready for Committer.
Kind regards,
Mircea Cadariu
Hi,
A small addendum might make sense for this patch, given a recent change
to master.
A CHECK_FOR_INTERRUPTS() call was added in several pg_buffercache
functions in commit eab9e4e.
See also the corresponding discussion [1]/messages/by-id/CAHg+QDcejeLx7WunFT3DX6XKh1KshvGKa8F5au8xVhqVvvQPRw@mail.gmail.com.
Shall we add it to the function introduced in this patch as well?
Kind regards,
Mircea Cadariu
[1]: /messages/by-id/CAHg+QDcejeLx7WunFT3DX6XKh1KshvGKa8F5au8xVhqVvvQPRw@mail.gmail.com
/messages/by-id/CAHg+QDcejeLx7WunFT3DX6XKh1KshvGKa8F5au8xVhqVvvQPRw@mail.gmail.com
Hi,
On Thu, Aug 21, 2025 at 12:08:37PM +0100, Mircea Cadariu wrote:
Hi,
A small addendum might make sense for this patch, given a recent change to
master.A CHECK_FOR_INTERRUPTS() call was added in several pg_buffercache functions
in commit eab9e4e.See also the corresponding discussion [1].
Shall we add it to the function introduced in this patch as well?
Yeah, I think so. Thanks for the ping, done in attached.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v7-0001-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From 4bff3d2a898fc84276e18360a209fde69ad860ad Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Tue, 1 Jul 2025 11:38:37 +0000
Subject: [PATCH v7 1/2] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages().
Currently, this is used by pg_buffercache_numa_pages() only but will be
used by a new function in a following commit.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 50 ++++++++++++++-----
1 file changed, 37 insertions(+), 13 deletions(-)
100.0% contrib/pg_buffercache/
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 3df04c98959..2963e7dfb3c 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -105,6 +111,33 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -320,7 +353,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -428,8 +460,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -475,16 +506,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr, buf_state);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
v7-0002-Add-pg_buffercache_os_pages-function-and-view.patchtext/x-diff; charset=us-asciiDownload
From 7b7b7f37d4e179681ab6157186c86adaf4d6592c Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Wed, 2 Jul 2025 04:33:03 +0000
Subject: [PATCH v7 2/2] Add pg_buffercache_os_pages function and view
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding those new function and view so that one does not need NUMA support
enabled to get this information.
In passing, let's capitalyze "number" in the pg_buffercache_numa documentation
definition.
---
contrib/pg_buffercache/Makefile | 2 +-
.../expected/pg_buffercache.out | 18 ++
contrib/pg_buffercache/meson.build | 1 +
.../pg_buffercache--1.6--1.7.sql | 21 ++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 183 ++++++++++++++++++
contrib/pg_buffercache/sql/pg_buffercache.sql | 8 +
doc/src/sgml/pgbuffercache.sgml | 112 ++++++++++-
src/tools/pgindent/typedefs.list | 2 +
9 files changed, 346 insertions(+), 3 deletions(-)
3.7% contrib/pg_buffercache/expected/
58.3% contrib/pg_buffercache/
37.4% doc/src/sgml/
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..0e618f66aec 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,7 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..26c2d5f5710 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -8,6 +8,16 @@ from pg_buffercache;
t
(1 row)
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -28,6 +38,8 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
ERROR: permission denied for view pg_buffercache
+SELECT * FROM pg_buffercache_os_pages;
+ERROR: permission denied for view pg_buffercache_os_pages
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
ERROR: permission denied for function pg_buffercache_pages
SELECT * FROM pg_buffercache_summary();
@@ -43,6 +55,12 @@ SELECT count(*) > 0 FROM pg_buffercache;
t
(1 row)
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
?column?
----------
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..7c31141881f 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..28caf24688f
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,21 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function.
+CREATE FUNCTION pg_buffercache_os_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+ SELECT P.* FROM pg_buffercache_os_pages() AS P
+ (bufferid integer, os_page_num bigint);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 2963e7dfb3c..df57cc26d15 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -27,6 +27,7 @@
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 2
/*
* Get the maximum buffer cache entries needed.
@@ -94,12 +95,30 @@ typedef struct
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+ uint32 bufferid;
+ int64 page_num;
+} BufferCacheOsPagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+ TupleDesc tupdesc;
+ BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
/*
* Function returning data from the shared buffer cache - buffer number,
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -139,6 +158,170 @@ get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
*endptr_buff = end_ptr;
}
+/*
+ * Inquire about OS pages mappings for shared buffers.
+ *
+ * Returns each OS memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Datum result;
+ MemoryContext oldcontext;
+ BufferCacheOsPagesContext *fctx; /* User function context. */
+ TupleDesc tupledesc;
+ TupleDesc expected_tupledesc;
+ HeapTuple tuple;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ int i,
+ idx;
+ Size os_page_size;
+ char *startptr;
+ int max_entries;
+
+ /*
+ * The database block size and OS memory page size are unlikely to be
+ * the same. The block size is 1-32KB, the memory page size depends on
+ * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
+ * there are also features like THP etc. Moreover, we don't quite know
+ * how the pages and buffers "align" in memory - the buffers may be
+ * shifted in some way, using more memory pages than necessary.
+ *
+ * So we need to be careful about mapping buffers to memory pages. We
+ * calculate the maximum number of pages a buffer might use, so that
+ * we allocate enough space for the entries. And then we count the
+ * actual number of entries as we scan the buffers.
+ */
+ os_page_size = pg_get_shmem_pagesize();
+
+ /* Initialize the multi-call context, load entries about buffers */
+ funcctx = SRF_FIRSTCALL_INIT();
+
+ /* Switch context when allocating stuff to be used in later calls */
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ /* Create a user function context for cross-call persistence */
+ fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
+
+ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
+ elog(ERROR, "incorrect number of output arguments");
+
+ /* Construct a tuple descriptor for the result rows. */
+ tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+ TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+ INT4OID, -1, 0);
+
+ TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
+ INT8OID, -1, 0);
+
+ fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+ /*
+ * Each buffer needs at least one entry, but it might be offset in
+ * some way, and use one extra entry. So we allocate space for the
+ * maximum number of entries we might need, and then count the exact
+ * number as we're walking buffers. That way we can do it in one pass,
+ * without reallocating memory.
+ */
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
+
+ /* Allocate NBuffers worth of BufferCacheOsPagesRec records. */
+ fctx->record = (BufferCacheOsPagesRec *)
+ MemoryContextAllocHuge(CurrentMemoryContext,
+ sizeof(BufferCacheOsPagesRec) * max_entries);
+
+ /* Return to original context when allocating transient memory */
+ MemoryContextSwitchTo(oldcontext);
+
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+ idx = 0;
+
+ /*
+ * Scan through all the buffers, saving the relevant fields in the
+ * fctx->record structure.
+ *
+ * We don't hold the partition locks, so we don't get a consistent
+ * snapshot across all buffers, but we do grab the buffer header
+ * locks, so the information of each buffer is self-consistent.
+ */
+ for (i = 0; i < NBuffers; i++)
+ {
+ char *buffptr = (char *) BufferGetBlock(i + 1);
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+ uint32 bufferid;
+ int32 page_num;
+ char *startptr_buff,
+ *endptr_buff;
+
+ CHECK_FOR_INTERRUPTS();
+
+ bufHdr = GetBufferDescriptor(i);
+ /* Lock each buffer header before inspecting. */
+ buf_state = LockBufHdr(bufHdr);
+ bufferid = BufferDescriptorGetBuffer(bufHdr);
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
+
+ /* Add an entry for each OS page overlapping with this buffer. */
+ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+ {
+ fctx->record[idx].bufferid = bufferid;
+ fctx->record[idx].page_num = page_num;
+ /* advance to the next entry/page */
+ ++idx;
+ ++page_num;
+ }
+ }
+
+ Assert(idx <= max_entries);
+
+ /* Set max calls and remember the user function context. */
+ funcctx->max_calls = idx;
+ funcctx->user_fctx = fctx;
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ /* Get the saved state */
+ fctx = funcctx->user_fctx;
+
+ if (funcctx->call_cntr < funcctx->max_calls)
+ {
+ uint32 i = funcctx->call_cntr;
+ Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+ bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+
+ values[0] = Int32GetDatum(fctx->record[i].bufferid);
+ nulls[0] = false;
+ values[1] = Int64GetDatum(fctx->record[i].page_num);
+ nulls[1] = false;
+
+ /* Build and return the tuple. */
+ tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..3c70ee9ef4a 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -5,6 +5,12 @@ select count(*) = (select setting::bigint
where name = 'shared_buffers')
from pg_buffercache;
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -16,6 +22,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
-- having to create a dedicated user, use the pg_database_owner pseudo-role.
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
+SELECT * FROM pg_buffercache_os_pages;
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
SELECT * FROM pg_buffercache_summary();
SELECT * FROM pg_buffercache_usage_counts();
@@ -24,6 +31,7 @@ RESET role;
-- Check that pg_monitor is allowed to query view / function
SET ROLE pg_monitor;
SELECT count(*) > 0 FROM pg_buffercache;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index eeb85a0e049..72f179286d6 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -46,6 +46,8 @@
<para>
This module provides the <function>pg_buffercache_pages()</function>
function (wrapped in the <structname>pg_buffercache</structname> view), the
+ <function>pg_buffercache_os_pages()</function> function (wrapped in the
+ <structname>pg_buffercache_os_pages</structname> view), the
<function>pg_buffercache_numa_pages()</function> function (wrapped in the
<structname>pg_buffercache_numa</structname> view), the
<function>pg_buffercache_summary()</function> function, the
@@ -62,6 +64,13 @@
convenient use.
</para>
+ <para>
+ The <function>pg_buffercache_os_pages()</function> function provides OS
+ pages mappings for shared buffer entries. The
+ <structname>pg_buffercache_os_pages</structname> view wraps the function for
+ convenient use.
+ </para>
+
<para>
The <function>pg_buffercache_numa_pages()</function> function provides
<acronym>NUMA</acronym> node mappings for shared buffer entries. This
@@ -242,6 +251,52 @@
</para>
</sect2>
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+ <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+ <para>
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+ </para>
+
+ <table id="pgbuffercache-os-pages-columns">
+ <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>bufferid</structfield> <type>integer</type>
+ </para>
+ <para>
+ ID, in the range 1..<varname>shared_buffers</varname>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ Number of OS memory page for this buffer
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect2>
+
<sect2 id="pgbuffercache-pg-buffercache-numa">
<title>The <structname>pg_buffercache_numa</structname> View</title>
@@ -278,7 +333,7 @@
<structfield>os_page_num</structfield> <type>bigint</type>
</para>
<para>
- number of OS memory page for this buffer
+ Number of OS memory page for this buffer
</para></entry>
</row>
@@ -558,6 +613,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+ FROM (
+ SELECT bufferid, COUNT(*) as pages_per_buffer
+ FROM pg_buffercache_os_pages
+ GROUP BY bufferid
+ )
+ GROUP BY pages_per_buffer
+ ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+ 1 | 261120
+ 2 | 1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public | delete_test_table | 3
+ public | gin_test_idx | 2
+ pg_catalog | pg_depend | 2
+ public | quad_poly_tbl | 2
+ pg_catalog | pg_depend_reference_index | 1
+ pg_catalog | pg_index_indexrelid_index | 1
+ pg_catalog | pg_constraint_contypid_index | 1
+ pg_catalog | pg_statistic | 1
+ pg_catalog | pg_depend_depender_index | 1
+ pg_catalog | pg_operator | 1
+(10 rows)
+
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index a13e8162890..d6c1c823fda 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -339,6 +339,8 @@ BufferAccessStrategy
BufferAccessStrategyType
BufferCacheNumaContext
BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
BufferCachePagesContext
BufferCachePagesRec
BufferDesc
--
2.34.1
On Fri, Aug 22, 2025 at 08:48:57AM +0000, Bertrand Drouvot wrote:
Yeah, I think so. Thanks for the ping, done in attached.
The patch has been marked as ready for committer, and I see the value
in providing a view where it is possible to know to which OS page a
given shared buffer is linked to, based on the OS page size and our
shared buffer size. No problem with that.
Now, I am not really a fan of the duplication created here, where most
of the code pg_buffercache_os_pages() is a plain copy-paste of
pg_buffercache_numa_pages(), with the difference being that we want
only os_page_status via a call to pg_numa_query_pages(), something
that we can rely on when pg_numa_init() is able to work. The
copy-paste is not complete, actually, we surely care about the
Assert() done after calling pg_get_shmem_pagesize(), that acts as a
sanity check to make sure that the buffer size and the OS page size
are divisible pieces (one being divisible by the other), and there is
more that would be nice to not duplicate, like the start pointer
location, the comment on top of pg_get_shmem_pagesize(), etc.
It seems to me that it would be simpler to make the allocations and
information of os_page_ptrs and os_page_status conditional depending
on the result of pg_numa_init(), and that we could just fill numa_node
with NULLs if numa is not available in the environment where the query
is run. This includes making pg_numa_touch_mem_if_required()
conditional, of course, not called when pg_numa_init() fails. Or is
there a strong reason where it would be better to rely on an
elog(ERROR) if numa(3) fails, based on numa_available()? The purpose
of these views being monitoring, it is usually easier in my experience
to rely on NULLs rather than facing periodic errors when we don't know
something. That makes JOINs more predictible, for one.
Please note that I don't mind the extra view pg_buffercache_os_pages
that can provide some information that's transparent cross-platform,
but let's make it something that calls pg_buffercache_numa_pages()
instead.
Note 1: the patch failed to compile as we don't need a buffer state
anymore when unlocking a buffer.
Note 2: Nice catch about the description of os_page_num, applied this
one separately.
--
Michael
Hi,
On Tue, Nov 18, 2025 at 02:39:36PM +0900, Michael Paquier wrote:
On Fri, Aug 22, 2025 at 08:48:57AM +0000, Bertrand Drouvot wrote:
Yeah, I think so. Thanks for the ping, done in attached.
The patch has been marked as ready for committer, and I see the value
in providing a view where it is possible to know to which OS page a
given shared buffer is linked to, based on the OS page size and our
shared buffer size. No problem with that.
Thanks for looking at it!
Now, I am not really a fan of the duplication created here, where most
of the code pg_buffercache_os_pages() is a plain copy-paste of
pg_buffercache_numa_pages(), with the difference being that we want
only os_page_status via a call to pg_numa_query_pages(),
Agree. 0001 helps to avoid code duplication but I agree that we could do more.
It seems to me that it would be simpler to make the allocations and
information of os_page_ptrs and os_page_status conditional depending
on the result of pg_numa_init(), and that we could just fill numa_node
with NULLs if numa is not available in the environment where the query
is run. This includes making pg_numa_touch_mem_if_required()
conditional, of course, not called when pg_numa_init() fails.
That's a good idea and I think we have 2 options here.
Option 1:
We could simply create the pg_buffercache_os_pages view on top of the
pg_buffercache_numa one. The cons I can think of is that, when numa is available,
then pg_buffercache_os_pages would pay the extra cost that also make
pg_buffercache_numa slow.
Then there is no real benefits for adding a new view, we could just keep
pg_buffercache_numa and fill numa_node with NULLs if numa is not available and
document also the use case (with an example) when numa is not available.
That would achieve the main goal.
Option 2:
Still make changes in pg_buffercache_numa_pages() and fill with NULL when
numa is not available. Then create an helper to do the mapping buffers to OS
pages without any NUMA specific operations.
That way we could create a dedicated view pg_buffercache_os_pages on top of
a new function. No code duplication and the new view would not get the extra
cost if numa is available.
Or is
there a strong reason where it would be better to rely on an
elog(ERROR) if numa(3) fails, based on numa_available()? The purpose
of these views being monitoring, it is usually easier in my experience
to rely on NULLs rather than facing periodic errors when we don't know
something. That makes JOINs more predictible, for one.
Yeah I don't think that's an issue to fill with NULL instead of generating errors.
Specially as it will give added value.
Please note that I don't mind the extra view pg_buffercache_os_pages
that can provide some information that's transparent cross-platform,
but let's make it something that calls pg_buffercache_numa_pages()
instead.
So, I think we have Option 1 and Option 2. Option 1 is very simple and Option
2 would allow to get the desired information with no performance penalties when
numa is availble.
I'm tempted to vote for 1 as I'm not sure the larger code refactoring of option
2 is worth the benefits. Thoughts?
Note 2: Nice catch about the description of os_page_num, applied this
one separately.
Thanks!
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Wed, Nov 19, 2025 at 09:28:09AM +0000, Bertrand Drouvot wrote:
Option 1:
We could simply create the pg_buffercache_os_pages view on top of the
pg_buffercache_numa one. The cons I can think of is that, when numa is available,
then pg_buffercache_os_pages would pay the extra cost that also make
pg_buffercache_numa slow.Then there is no real benefits for adding a new view, we could just keep
pg_buffercache_numa and fill numa_node with NULLs if numa is not available and
document also the use case (with an example) when numa is not available.That would achieve the main goal.
Option 2:
Still make changes in pg_buffercache_numa_pages() and fill with NULL when
numa is not available. Then create an helper to do the mapping buffers to OS
pages without any NUMA specific operations.That way we could create a dedicated view pg_buffercache_os_pages on top of
a new function. No code duplication and the new view would not get the extra
cost if numa is available.
Hmm. I can think about an option 3 here: pg_buffercache outlines the
view pg_buffercache_numa as the primary choice over
pg_buffercache_numa_pages(). So I would suggest a more drastic
strategy, that should not break monitoring queries with the views
being the primary source for the results:
- Rename of pg_buffercache_numa_pages() to pg_buffercache_os_pages(),
that takes in input a boolean argument to decide if numa should be
executed or not.
- Creation of a second view for the OS pages that calls
pg_buffercache_os_pages() without the numa code activated, for the two
attributes that matter.
- Switch the existing view pg_buffercache_numa to call
pg_buffercache_os_pages() with the numa code activated. If NUMA
cannot be set up, elog(ERROR).
--
Michael
Hi,
On Wed, Nov 19, 2025 at 10:49:49PM +0900, Michael Paquier wrote:
On Wed, Nov 19, 2025 at 09:28:09AM +0000, Bertrand Drouvot wrote:
Option 1:
We could simply create the pg_buffercache_os_pages view on top of the
pg_buffercache_numa one. The cons I can think of is that, when numa is available,
then pg_buffercache_os_pages would pay the extra cost that also make
pg_buffercache_numa slow.Then there is no real benefits for adding a new view, we could just keep
pg_buffercache_numa and fill numa_node with NULLs if numa is not available and
document also the use case (with an example) when numa is not available.That would achieve the main goal.
Option 2:
Still make changes in pg_buffercache_numa_pages() and fill with NULL when
numa is not available. Then create an helper to do the mapping buffers to OS
pages without any NUMA specific operations.That way we could create a dedicated view pg_buffercache_os_pages on top of
a new function. No code duplication and the new view would not get the extra
cost if numa is available.Hmm. I can think about an option 3 here: pg_buffercache outlines the
view pg_buffercache_numa as the primary choice over
pg_buffercache_numa_pages(). So I would suggest a more drastic
strategy, that should not break monitoring queries with the views
being the primary source for the results:
- Rename of pg_buffercache_numa_pages() to pg_buffercache_os_pages(),
that takes in input a boolean argument to decide if numa should be
executed or not.
- Creation of a second view for the OS pages that calls
pg_buffercache_os_pages() without the numa code activated, for the two
attributes that matter.
- Switch the existing view pg_buffercache_numa to call
pg_buffercache_os_pages() with the numa code activated. If NUMA
cannot be set up, elog(ERROR).
Love the idea: the new view would not suffer from the numa availability overhead
and the current behavior is kept. Will look at it, thanks!
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Hi,
On Thu, Nov 20, 2025 at 04:59:07PM +0000, Bertrand Drouvot wrote:
On Wed, Nov 19, 2025 at 10:49:49PM +0900, Michael Paquier wrote:
Hmm. I can think about an option 3 here: pg_buffercache outlines the
view pg_buffercache_numa as the primary choice over
pg_buffercache_numa_pages(). So I would suggest a more drastic
strategy, that should not break monitoring queries with the views
being the primary source for the results:
- Rename of pg_buffercache_numa_pages() to pg_buffercache_os_pages(),
that takes in input a boolean argument to decide if numa should be
executed or not.
- Creation of a second view for the OS pages that calls
pg_buffercache_os_pages() without the numa code activated, for the two
attributes that matter.
- Switch the existing view pg_buffercache_numa to call
pg_buffercache_os_pages() with the numa code activated. If NUMA
cannot be set up, elog(ERROR).Love the idea: the new view would not suffer from the numa availability overhead
and the current behavior is kept. Will look at it, thanks!
Here they are:
0001:
Is nothing but the same as the one shared in [1]/messages/by-id/aSBOKX6pLJzumbmF@ip-10-97-1-34.eu-west-3.compute.internal.
0002:
Introduce GET_MAX_BUFFER_ENTRIES and get_buffer_page_boundaries
It's not really needed anymore since we'll avoid code duplication with the
new approach. That said I think they help for code readability so keeping them
(I don't have a strong opinion about it if other prefer not to add them).
0003:
Adding pg_buffercache_numa_pages_internal()
This new function makes NUMA data collection conditional.
It extracts the core current pg_buffercache_numa_pages() logic into an
internal function that accepts a boolean parameter. It's currently only called
with the boolean set to true to serve the pg_buffercache_numa view needs.
It's done that way to ease to review but could be pushed as is.
0004:
Add pg_buffercache_os_pages function and view
The patch:
- renames pg_buffercache_numa_pages_internal() to pg_buffercache_os_pages()
- keep pg_buffercache_numa_pages() as a backward compatibility wrapper
- re-create the pg_buffercache_numa view on top of pg_buffercache_os_pages using
true as argument
- adds doc
- adds test
Remark for the doc: the patch does not show the pg_buffercache_os_pages() parameter.
It just mentions that it exists. I think that's fine given that a) the same is
true for pg_buffercache_evict() and pg_buffercache_evict_relation() (maybe that
should be changed though), b) the only purpose of this function is to be linked
to the pg_buffercache_os_pages and pg_buffercache_numa views.
[1]: /messages/by-id/aSBOKX6pLJzumbmF@ip-10-97-1-34.eu-west-3.compute.internal
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v8-0001-Remove-unused-fields-from-BufferCacheNumaRec.patchtext/x-diff; charset=us-asciiDownload
From 75b4d5cd73c4dd16d762007007bf2986367af133 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 21 Nov 2025 05:53:50 +0000
Subject: [PATCH v8 1/4] Remove unused fields from BufferCacheNumaRec
These were added by ba2a3c2302f and never been used.
Also move (and re-word a bit) a comment that was not at the right place.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
100.0% contrib/pg_buffercache/
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index c29b784dfa1..1fe350783b1 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -82,9 +82,6 @@ typedef struct
typedef struct
{
TupleDesc tupdesc;
- int buffers_per_page;
- int pages_per_buffer;
- int os_page_size;
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
@@ -368,7 +365,11 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
os_page_status = palloc(sizeof(uint64) * os_page_count);
- /* Fill pointers for all the memory pages. */
+ /*
+ * Fill pointers for all the memory pages. This loop stores into
+ * os_page_ptrs[] and touches (if needed) addresses as input to one
+ * big move_pages(2) inquiry system call.
+ */
idx = 0;
for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
{
@@ -449,10 +450,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* We don't hold the partition locks, so we don't get a consistent
* snapshot across all buffers, but we do grab the buffer header
* locks, so the information of each buffer is self-consistent.
- *
- * This loop touches and stores addresses into os_page_ptrs[] as input
- * to one big move_pages(2) inquiry system call. Basically we ask for
- * all memory pages for NBuffers.
*/
startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
idx = 0;
--
2.34.1
v8-0002-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From d4d8c25ac42d7a98c3b919327ebd4b02f75fd059 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 21 Nov 2025 04:52:08 +0000
Subject: [PATCH v8 2/4] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages() and
help for code readability.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 51 ++++++++++++++-----
1 file changed, 38 insertions(+), 13 deletions(-)
100.0% contrib/pg_buffercache/
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 1fe350783b1..0e062f88a20 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -102,6 +108,34 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -317,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -429,8 +462,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -471,16 +503,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
v8-0003-Adding-pg_buffercache_numa_pages_internal.patchtext/x-diff; charset=us-asciiDownload
From 3f9a9d476b68e58f9b8e09f0de8347f07b24ccb2 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 21 Nov 2025 09:10:07 +0000
Subject: [PATCH v8 3/4] Adding pg_buffercache_numa_pages_internal()
This new function makes NUMA data collection conditional.
It extracts the core current pg_buffercache_numa_pages() logic into an
internal function that accepts a boolean parameter. It's currently only called
with the boolean set to true to serve the pg_buffercache_numa view needs.
It will also be called with false in a following commit to serve a new
pg_buffercache_os_pages view needs.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 148 +++++++++++-------
1 file changed, 89 insertions(+), 59 deletions(-)
100.0% contrib/pg_buffercache/
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 0e062f88a20..cc02f7ea9f5 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -88,6 +88,7 @@ typedef struct
typedef struct
{
TupleDesc tupdesc;
+ bool include_numa;
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
@@ -318,22 +319,26 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
}
/*
- * Inquire about NUMA memory mappings for shared buffers.
+ * Internal function to inquire about OS pages mappings for shared buffers,
+ * with optional NUMA information.
*
- * Returns NUMA node ID for each memory page used by the buffer. Buffers may
- * be smaller or larger than OS memory pages. For each buffer we return one
- * entry for each memory page used by the buffer (if the buffer is smaller,
- * it only uses a part of one memory page).
+ * When 'include_numa' is:
+ * - false: Returns buffer to OS page mappings quickly, with numa_node as NULL.
+ * - true: Initializes NUMA and returns numa_node values.
+ *
+ * Buffers may be smaller or larger than OS memory pages. For each buffer we
+ * return one entry for each memory page used by the buffer (if the buffer is
+ * smaller, it only uses a part of one memory page).
*
* We expect both sizes (for buffers and memory pages) to be a power-of-2, so
* one is always a multiple of the other.
*
- * In order to get reliable results we also need to touch memory pages, so
- * that the inquiry about NUMA memory node doesn't return -2 (which indicates
- * unmapped/unallocated pages).
+ * When 'include_numa' is true, in order to get reliable results we also need
+ * to touch memory pages, so that the inquiry about NUMA memory node doesn't
+ * return -2 (which indicates unmapped/unallocated pages).
*/
-Datum
-pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+static Datum
+pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
{
FuncCallContext *funcctx;
MemoryContext oldcontext;
@@ -348,14 +353,14 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
int i,
idx;
Size os_page_size;
- void **os_page_ptrs;
- int *os_page_status;
- uint64 os_page_count;
+ int *os_page_status = NULL;
+ uint64 os_page_count = 0;
int max_entries;
char *startptr,
*endptr;
- if (pg_numa_init() == -1)
+ /* If NUMA information is requested, initialize NUMA support. */
+ if (include_numa && pg_numa_init() == -1)
elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
/*
@@ -383,50 +388,55 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
*/
Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
- /*
- * How many addresses we are going to query? Simply get the page for
- * the first buffer, and first page after the last buffer, and count
- * the pages from that.
- */
- startptr = (char *) TYPEALIGN_DOWN(os_page_size,
- BufferGetBlock(1));
- endptr = (char *) TYPEALIGN(os_page_size,
- (char *) BufferGetBlock(NBuffers) + BLCKSZ);
- os_page_count = (endptr - startptr) / os_page_size;
-
- /* Used to determine the NUMA node for all OS pages at once */
- os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
- os_page_status = palloc(sizeof(uint64) * os_page_count);
-
- /*
- * Fill pointers for all the memory pages. This loop stores into
- * os_page_ptrs[] and touches (if needed) addresses as input to one
- * big move_pages(2) inquiry system call.
- */
- idx = 0;
- for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
+ if (include_numa)
{
- os_page_ptrs[idx++] = ptr;
+ void **os_page_ptrs = NULL;
+
+ /*
+ * How many addresses we are going to query? Simply get the page
+ * for the first buffer, and first page after the last buffer, and
+ * count the pages from that.
+ */
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size,
+ BufferGetBlock(1));
+ endptr = (char *) TYPEALIGN(os_page_size,
+ (char *) BufferGetBlock(NBuffers) + BLCKSZ);
+ os_page_count = (endptr - startptr) / os_page_size;
+
+ /* Used to determine the NUMA node for all OS pages at once */
+ os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
+ os_page_status = palloc(sizeof(uint64) * os_page_count);
+
+ /*
+ * Fill pointers for all the memory pages. This loop stores into
+ * os_page_ptrs[] and touches (if needed) addresses as input to
+ * one big move_pages(2) inquiry system call.
+ */
+ idx = 0;
+ for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
+ {
+ os_page_ptrs[idx++] = ptr;
- /* Only need to touch memory once per backend process lifetime */
- if (firstNumaTouch)
- pg_numa_touch_mem_if_required(ptr);
- }
+ /* Only need to touch memory once per backend process lifetime */
+ if (firstNumaTouch)
+ pg_numa_touch_mem_if_required(ptr);
+ }
- Assert(idx == os_page_count);
+ Assert(idx == os_page_count);
- elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
- "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
+ elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
+ "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
- /*
- * If we ever get 0xff back from kernel inquiry, then we probably have
- * bug in our buffers to OS page mapping code here.
- */
- memset(os_page_status, 0xff, sizeof(int) * os_page_count);
+ /*
+ * If we ever get 0xff back from kernel inquiry, then we probably
+ * have bug in our buffers to OS page mapping code here.
+ */
+ memset(os_page_status, 0xff, sizeof(int) * os_page_count);
- /* Query NUMA status for all the pointers */
- if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
- elog(ERROR, "failed NUMA pages inquiry: %m");
+ /* Query NUMA status for all the pointers */
+ if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
+ elog(ERROR, "failed NUMA pages inquiry: %m");
+ }
/* Initialize the multi-call context, load entries about buffers */
@@ -454,6 +464,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
INT4OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
+ fctx->include_numa = include_numa;
/*
* Each buffer needs at least one entry, but it might be offset in
@@ -472,7 +483,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
- if (firstNumaTouch)
+ if (include_numa && firstNumaTouch)
elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
/*
@@ -512,7 +523,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
fctx->record[idx].bufferid = bufferid;
fctx->record[idx].page_num = page_num;
- fctx->record[idx].numa_node = os_page_status[page_num];
+ fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
/* advance to the next entry/page */
++idx;
@@ -520,14 +531,18 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
}
}
- Assert((idx >= os_page_count) && (idx <= max_entries));
+ Assert(idx <= max_entries);
+
+ if (include_numa)
+ Assert(idx >= os_page_count);
/* Set max calls and remember the user function context. */
funcctx->max_calls = idx;
funcctx->user_fctx = fctx;
- /* Remember this backend touched the pages */
- firstNumaTouch = false;
+ /* Remember this backend touched the pages (only relevant for NUMA) */
+ if (include_numa)
+ firstNumaTouch = false;
}
funcctx = SRF_PERCALL_SETUP();
@@ -547,8 +562,16 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
values[1] = Int64GetDatum(fctx->record[i].page_num);
nulls[1] = false;
- values[2] = Int32GetDatum(fctx->record[i].numa_node);
- nulls[2] = false;
+ if (fctx->include_numa)
+ {
+ values[2] = Int32GetDatum(fctx->record[i].numa_node);
+ nulls[2] = false;
+ }
+ else
+ {
+ values[2] = (Datum) 0;
+ nulls[2] = true;
+ }
/* Build and return the tuple. */
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
@@ -560,6 +583,13 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
SRF_RETURN_DONE(funcctx);
}
+/* Entry point for extension. */
+Datum
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+{
+ return pg_buffercache_numa_pages_internal(fcinfo, true);
+}
+
Datum
pg_buffercache_summary(PG_FUNCTION_ARGS)
{
--
2.34.1
v8-0004-Add-pg_buffercache_os_pages-function-and-view.patchtext/x-diff; charset=us-asciiDownload
From b9247d0080b6e93488578a732853b9fc00df11cf Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 21 Nov 2025 05:17:20 +0000
Subject: [PATCH v8 4/4] Add pg_buffercache_os_pages function and view
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding those new function and view so that one does not need NUMA support
enabled to get this information.
To do so this commit:
- renames pg_buffercache_numa_pages_internal() to pg_buffercache_os_pages()
- keep pg_buffercache_numa_pages() as a backward compatibility wrapper
- re-create the pg_buffercache_numa view on top of pg_buffercache_os_pages using
true as argument
That way we avoid code duplication and the pg_buffercache_os_pages view
does not get performance penalties when NUMA is available.
---
contrib/pg_buffercache/Makefile | 2 +-
.../expected/pg_buffercache.out | 18 +++
contrib/pg_buffercache/meson.build | 1 +
.../pg_buffercache--1.6--1.7.sql | 34 +++++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 59 ++++++---
contrib/pg_buffercache/sql/pg_buffercache.sql | 8 ++
doc/src/sgml/pgbuffercache.sgml | 120 ++++++++++++++++--
src/tools/pgindent/typedefs.list | 2 +
9 files changed, 217 insertions(+), 29 deletions(-)
4.4% contrib/pg_buffercache/expected/
3.2% contrib/pg_buffercache/sql/
38.7% contrib/pg_buffercache/
53.0% doc/src/sgml/
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..0e618f66aec 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,7 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..26c2d5f5710 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -8,6 +8,16 @@ from pg_buffercache;
t
(1 row)
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -28,6 +38,8 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
ERROR: permission denied for view pg_buffercache
+SELECT * FROM pg_buffercache_os_pages;
+ERROR: permission denied for view pg_buffercache_os_pages
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
ERROR: permission denied for function pg_buffercache_pages
SELECT * FROM pg_buffercache_summary();
@@ -43,6 +55,12 @@ SELECT count(*) > 0 FROM pg_buffercache;
t
(1 row)
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
?column?
----------
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..7c31141881f 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..3cfe723c0c2
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,34 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function with boolean parameter
+-- This function is the core implementation for both OS pages and NUMA queries
+CREATE FUNCTION pg_buffercache_os_pages(IN include_numa boolean,
+ OUT bufferid integer,
+ OUT os_page_num bigint,
+ OUT numa_node integer)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+ SELECT bufferid, os_page_num
+ FROM pg_buffercache_os_pages(false);
+
+DROP VIEW pg_buffercache_numa;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_numa AS
+ SELECT bufferid, os_page_num, numa_node
+ FROM pg_buffercache_os_pages(true);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages(boolean) FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages(boolean) TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index cc02f7ea9f5..dbd4a601ccb 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -26,7 +26,7 @@
#define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
-#define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 3
/*
* Get the maximum buffer cache entries needed.
@@ -73,14 +73,16 @@ typedef struct
} BufferCachePagesContext;
/*
- * Record structure holding the to be exposed cache data.
+ * Record structure holding the to be exposed cache data for OS pages.
+ * This structure is used by pg_buffercache_os_pages() which takes a
+ * boolean parameter to control whether NUMA information is included.
*/
typedef struct
{
uint32 bufferid;
int64 page_num;
int32 numa_node;
-} BufferCacheNumaRec;
+} BufferCacheOsPagesRec;
/*
* Function context for data persisting over repeated calls.
@@ -89,8 +91,8 @@ typedef struct
{
TupleDesc tupdesc;
bool include_numa;
- BufferCacheNumaRec *record;
-} BufferCacheNumaContext;
+ BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
/*
@@ -98,6 +100,7 @@ typedef struct
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -319,8 +322,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
}
/*
- * Internal function to inquire about OS pages mappings for shared buffers,
- * with optional NUMA information.
+ * Inquire about OS pages mappings for shared buffers, with optional NUMA
+ * information.
*
* When 'include_numa' is:
* - false: Returns buffer to OS page mappings quickly, with numa_node as NULL.
@@ -337,12 +340,12 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
* to touch memory pages, so that the inquiry about NUMA memory node doesn't
* return -2 (which indicates unmapped/unallocated pages).
*/
-static Datum
-pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
MemoryContext oldcontext;
- BufferCacheNumaContext *fctx; /* User function context. */
+ BufferCacheOsPagesContext *fctx; /* User function context. */
TupleDesc tupledesc;
TupleDesc expected_tupledesc;
HeapTuple tuple;
@@ -350,6 +353,7 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
if (SRF_IS_FIRSTCALL())
{
+ bool include_numa;
int i,
idx;
Size os_page_size;
@@ -359,6 +363,9 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
char *startptr,
*endptr;
+ /* Get the boolean parameter that controls NUMA behavior */
+ include_numa = PG_GETARG_BOOL(0);
+
/* If NUMA information is requested, initialize NUMA support. */
if (include_numa && pg_numa_init() == -1)
elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
@@ -446,12 +453,12 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
- fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
+ fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
- if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
+ if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
@@ -475,10 +482,10 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
*/
max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
- /* Allocate entries for BufferCachePagesRec records. */
- fctx->record = (BufferCacheNumaRec *)
+ /* Allocate entries for BufferCacheOsPagesRec records. */
+ fctx->record = (BufferCacheOsPagesRec *)
MemoryContextAllocHuge(CurrentMemoryContext,
- sizeof(BufferCacheNumaRec) * max_entries);
+ sizeof(BufferCacheOsPagesRec) * max_entries);
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
@@ -553,8 +560,8 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
if (funcctx->call_cntr < funcctx->max_calls)
{
uint32 i = funcctx->call_cntr;
- Datum values[NUM_BUFFERCACHE_NUMA_ELEM];
- bool nulls[NUM_BUFFERCACHE_NUMA_ELEM];
+ Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+ bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
values[0] = Int32GetDatum(fctx->record[i].bufferid);
nulls[0] = false;
@@ -583,11 +590,25 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
SRF_RETURN_DONE(funcctx);
}
-/* Entry point for extension. */
+/* Backward compatibility wrapper. */
Datum
pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
- return pg_buffercache_numa_pages_internal(fcinfo, true);
+ LOCAL_FCINFO(newfcinfo, 1);
+ Datum result;
+
+ /* Initialize the new fcinfo structure. */
+ InitFunctionCallInfoData(*newfcinfo, fcinfo->flinfo, 1, fcinfo->fncollation,
+ NULL, NULL);
+
+ /* Set the include_numa parameter to true. */
+ newfcinfo->args[0].value = BoolGetDatum(true);
+ newfcinfo->args[0].isnull = false;
+
+ /* Call pg_buffercache_os_pages with include_numa=true */
+ result = pg_buffercache_os_pages(newfcinfo);
+
+ return result;
}
Datum
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..3c70ee9ef4a 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -5,6 +5,12 @@ select count(*) = (select setting::bigint
where name = 'shared_buffers')
from pg_buffercache;
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -16,6 +22,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
-- having to create a dedicated user, use the pg_database_owner pseudo-role.
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
+SELECT * FROM pg_buffercache_os_pages;
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
SELECT * FROM pg_buffercache_summary();
SELECT * FROM pg_buffercache_usage_counts();
@@ -24,6 +31,7 @@ RESET role;
-- Check that pg_monitor is allowed to query view / function
SET ROLE pg_monitor;
SELECT count(*) > 0 FROM pg_buffercache;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 99ad2e68785..67f96e289b2 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -46,8 +46,9 @@
<para>
This module provides the <function>pg_buffercache_pages()</function>
function (wrapped in the <structname>pg_buffercache</structname> view), the
- <function>pg_buffercache_numa_pages()</function> function (wrapped in the
- <structname>pg_buffercache_numa</structname> view), the
+ <function>pg_buffercache_os_pages()</function> function (wrapped in the
+ <structname>pg_buffercache_os_pages</structname> and
+ <structname>pg_buffercache_numa</structname> views), the
<function>pg_buffercache_summary()</function> function, the
<function>pg_buffercache_usage_counts()</function> function, the
<function>pg_buffercache_evict()</function> function, the
@@ -63,12 +64,14 @@
</para>
<para>
- The <function>pg_buffercache_numa_pages()</function> function provides
- <acronym>NUMA</acronym> node mappings for shared buffer entries. This
- information is not part of <function>pg_buffercache_pages()</function>
- itself, as it is much slower to retrieve.
- The <structname>pg_buffercache_numa</structname> view wraps the function for
- convenient use.
+ The <function>pg_buffercache_os_pages()</function> function provides OS
+ pages mappings for shared buffer entries. When its argument is <literal>true</literal>,
+ it also provides <acronym>NUMA</acronym> node mappings for shared buffer entries (
+ this information is not part of <function>pg_buffercache_pages()</function>
+ itself, as it is much slower to retrieve).
+ The <structname>pg_buffercache_os_pages</structname> and <structname>pg_buffercache_numa</structname>
+ views wrap the function for convenient use (with its argument set to <literal>false</literal>
+ and <literal>true</literal> respectively).
</para>
<para>
@@ -242,6 +245,52 @@
</para>
</sect2>
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+ <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+ <para>
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+ </para>
+
+ <table id="pgbuffercache-os-pages-columns">
+ <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>bufferid</structfield> <type>integer</type>
+ </para>
+ <para>
+ ID, in the range 1..<varname>shared_buffers</varname>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ Number of OS memory page for this buffer
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect2>
+
<sect2 id="pgbuffercache-pg-buffercache-numa">
<title>The <structname>pg_buffercache_numa</structname> View</title>
@@ -558,6 +607,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+ FROM (
+ SELECT bufferid, COUNT(*) as pages_per_buffer
+ FROM pg_buffercache_os_pages
+ GROUP BY bufferid
+ )
+ GROUP BY pages_per_buffer
+ ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+ 1 | 261120
+ 2 | 1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public | delete_test_table | 3
+ public | gin_test_idx | 2
+ pg_catalog | pg_depend | 2
+ public | quad_poly_tbl | 2
+ pg_catalog | pg_depend_reference_index | 1
+ pg_catalog | pg_index_indexrelid_index | 1
+ pg_catalog | pg_constraint_contypid_index | 1
+ pg_catalog | pg_statistic | 1
+ pg_catalog | pg_depend_depender_index | 1
+ pg_catalog | pg_operator | 1
+(10 rows)
+
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index c751c25a04d..f6a398881f6 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -340,6 +340,8 @@ BufferAccessStrategy
BufferAccessStrategyType
BufferCacheNumaContext
BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
BufferCachePagesContext
BufferCachePagesRec
BufferDesc
--
2.34.1
Hi,
On Fri, Nov 21, 2025 at 11:53:52AM +0000, Bertrand Drouvot wrote:
Hi,
On Thu, Nov 20, 2025 at 04:59:07PM +0000, Bertrand Drouvot wrote:
On Wed, Nov 19, 2025 at 10:49:49PM +0900, Michael Paquier wrote:
Hmm. I can think about an option 3 here: pg_buffercache outlines the
view pg_buffercache_numa as the primary choice over
pg_buffercache_numa_pages(). So I would suggest a more drastic
strategy, that should not break monitoring queries with the views
being the primary source for the results:
- Rename of pg_buffercache_numa_pages() to pg_buffercache_os_pages(),
that takes in input a boolean argument to decide if numa should be
executed or not.
- Creation of a second view for the OS pages that calls
pg_buffercache_os_pages() without the numa code activated, for the two
attributes that matter.
- Switch the existing view pg_buffercache_numa to call
pg_buffercache_os_pages() with the numa code activated. If NUMA
cannot be set up, elog(ERROR).Love the idea: the new view would not suffer from the numa availability overhead
and the current behavior is kept. Will look at it, thanks!Here they are:
Attached a rebase due to 7d9043aee80. Also 0003 has a minor change (as compared
to v8-0004) to avoid this error when creating the 1.6 version with the new code:
postgres=# create extension pg_buffercache version '1.6';
CREATE EXTENSION
postgres=# select count(*) from pg_buffercache_numa;
ERROR: set-valued function called in context that cannot accept a set
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Attachments:
v9-0001-Introduce-GET_MAX_BUFFER_ENTRIES-and-get_buffer_p.patchtext/x-diff; charset=us-asciiDownload
From e5edf33f0052c1576090e601b9337dea122438cd Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 21 Nov 2025 04:52:08 +0000
Subject: [PATCH v9 1/3] Introduce GET_MAX_BUFFER_ENTRIES and
get_buffer_page_boundaries
Those new macro and function are extracted from pg_buffercache_numa_pages() and
help for code readability.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 51 ++++++++++++++-----
1 file changed, 38 insertions(+), 13 deletions(-)
100.0% contrib/pg_buffercache/
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 7c9ff24fa83..5736e0726bd 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -28,6 +28,12 @@
#define NUM_BUFFERCACHE_NUMA_ELEM 3
+/*
+ * Get the maximum buffer cache entries needed.
+ */
+#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \
+ ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1))
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -102,6 +108,34 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
/* Only need to touch memory once per backend process lifetime */
static bool firstNumaTouch = true;
+/*
+ * Helper function to get buffer page boundaries.
+ *
+ * Given a buffer pointer and OS page size, calculates the start/end
+ * pointers and first page number.
+ */
+static void
+get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr,
+ char **startptr_buff, char **endptr_buff,
+ int32 *page_num)
+{
+ char *start_ptr;
+ char *end_ptr;
+
+ /* start of the first page of this buffer */
+ start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
+
+ /* end of the buffer (no need to align to memory page) */
+ end_ptr = buffptr + BLCKSZ;
+
+ Assert(start_ptr < end_ptr);
+
+ /* calculate ID of the first page for this buffer */
+ *page_num = (start_ptr - startptr) / os_page_size;
+ *startptr_buff = start_ptr;
+ *endptr_buff = end_ptr;
+}
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
@@ -317,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
void **os_page_ptrs;
int *os_page_status;
uint64 os_page_count;
- int pages_per_buffer;
int max_entries;
char *startptr,
*endptr;
@@ -430,8 +463,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
* number as we're walking buffers. That way we can do it in one pass,
* without reallocating memory.
*/
- pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
- max_entries = NBuffers * pages_per_buffer;
+ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
/* Allocate entries for BufferCachePagesRec records. */
fctx->record = (BufferCacheNumaRec *)
@@ -472,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
bufferid = BufferDescriptorGetBuffer(bufHdr);
UnlockBufHdr(bufHdr);
- /* start of the first page of this buffer */
- startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
-
- /* end of the buffer (no need to align to memory page) */
- endptr_buff = buffptr + BLCKSZ;
-
- Assert(startptr_buff < endptr_buff);
-
- /* calculate ID of the first page for this buffer */
- page_num = (startptr_buff - startptr) / os_page_size;
+ /* Get page boundaries for this buffer. */
+ get_buffer_page_boundaries(buffptr, os_page_size, startptr,
+ &startptr_buff, &endptr_buff, &page_num);
/* Add an entry for each OS page overlapping with this buffer. */
for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
--
2.34.1
v9-0002-Adding-pg_buffercache_numa_pages_internal.patchtext/x-diff; charset=us-asciiDownload
From 52ccced8b916179211ce8a80a7f63160da100ab7 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 21 Nov 2025 09:10:07 +0000
Subject: [PATCH v9 2/3] Adding pg_buffercache_numa_pages_internal()
This new function makes NUMA data collection conditional.
It extracts the core current pg_buffercache_numa_pages() logic into an
internal function that accepts a boolean parameter. It's currently only called
with the boolean set to true to serve the pg_buffercache_numa view needs.
It will also be called with false in a following commit to serve a new
pg_buffercache_os_pages view needs.
---
contrib/pg_buffercache/pg_buffercache_pages.c | 150 +++++++++++-------
1 file changed, 90 insertions(+), 60 deletions(-)
100.0% contrib/pg_buffercache/
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 5736e0726bd..c8aeb01797c 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -88,6 +88,7 @@ typedef struct
typedef struct
{
TupleDesc tupdesc;
+ bool include_numa;
BufferCacheNumaRec *record;
} BufferCacheNumaContext;
@@ -318,22 +319,26 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
}
/*
- * Inquire about NUMA memory mappings for shared buffers.
+ * Internal function to inquire about OS pages mappings for shared buffers,
+ * with optional NUMA information.
*
- * Returns NUMA node ID for each memory page used by the buffer. Buffers may
- * be smaller or larger than OS memory pages. For each buffer we return one
- * entry for each memory page used by the buffer (if the buffer is smaller,
- * it only uses a part of one memory page).
+ * When 'include_numa' is:
+ * - false: Returns buffer to OS page mappings quickly, with numa_node as NULL.
+ * - true: Initializes NUMA and returns numa_node values.
+ *
+ * Buffers may be smaller or larger than OS memory pages. For each buffer we
+ * return one entry for each memory page used by the buffer (if the buffer is
+ * smaller, it only uses a part of one memory page).
*
* We expect both sizes (for buffers and memory pages) to be a power-of-2, so
* one is always a multiple of the other.
*
- * In order to get reliable results we also need to touch memory pages, so
- * that the inquiry about NUMA memory node doesn't return -2 (which indicates
- * unmapped/unallocated pages).
+ * When 'include_numa' is true, in order to get reliable results we also need
+ * to touch memory pages, so that the inquiry about NUMA memory node doesn't
+ * return -2 (which indicates unmapped/unallocated pages).
*/
-Datum
-pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+static Datum
+pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
{
FuncCallContext *funcctx;
MemoryContext oldcontext;
@@ -348,14 +353,14 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
int i,
idx;
Size os_page_size;
- void **os_page_ptrs;
- int *os_page_status;
- uint64 os_page_count;
+ int *os_page_status = NULL;
+ uint64 os_page_count = 0;
int max_entries;
char *startptr,
*endptr;
- if (pg_numa_init() == -1)
+ /* If NUMA information is requested, initialize NUMA support. */
+ if (include_numa && pg_numa_init() == -1)
elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
/*
@@ -383,51 +388,56 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
*/
Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
- /*
- * How many addresses we are going to query? Simply get the page for
- * the first buffer, and first page after the last buffer, and count
- * the pages from that.
- */
- startptr = (char *) TYPEALIGN_DOWN(os_page_size,
- BufferGetBlock(1));
- endptr = (char *) TYPEALIGN(os_page_size,
- (char *) BufferGetBlock(NBuffers) + BLCKSZ);
- os_page_count = (endptr - startptr) / os_page_size;
-
- /* Used to determine the NUMA node for all OS pages at once */
- os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
- os_page_status = palloc(sizeof(uint64) * os_page_count);
-
- /*
- * Fill pointers for all the memory pages. This loop stores and
- * touches (if needed) addresses into os_page_ptrs[] as input to one
- * big move_pages(2) inquiry system call, as done in
- * pg_numa_query_pages().
- */
- idx = 0;
- for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
+ if (include_numa)
{
- os_page_ptrs[idx++] = ptr;
+ void **os_page_ptrs = NULL;
+
+ /*
+ * How many addresses we are going to query? Simply get the page
+ * for the first buffer, and first page after the last buffer, and
+ * count the pages from that.
+ */
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size,
+ BufferGetBlock(1));
+ endptr = (char *) TYPEALIGN(os_page_size,
+ (char *) BufferGetBlock(NBuffers) + BLCKSZ);
+ os_page_count = (endptr - startptr) / os_page_size;
+
+ /* Used to determine the NUMA node for all OS pages at once */
+ os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
+ os_page_status = palloc(sizeof(uint64) * os_page_count);
+
+ /*
+ * Fill pointers for all the memory pages. This loop stores and
+ * touches (if needed) addresses into os_page_ptrs[] as input to
+ * one big move_pages(2) inquiry system call, as done in
+ * pg_numa_query_pages().
+ */
+ idx = 0;
+ for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
+ {
+ os_page_ptrs[idx++] = ptr;
- /* Only need to touch memory once per backend process lifetime */
- if (firstNumaTouch)
- pg_numa_touch_mem_if_required(ptr);
- }
+ /* Only need to touch memory once per backend process lifetime */
+ if (firstNumaTouch)
+ pg_numa_touch_mem_if_required(ptr);
+ }
- Assert(idx == os_page_count);
+ Assert(idx == os_page_count);
- elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
- "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
+ elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
+ "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
- /*
- * If we ever get 0xff back from kernel inquiry, then we probably have
- * bug in our buffers to OS page mapping code here.
- */
- memset(os_page_status, 0xff, sizeof(int) * os_page_count);
+ /*
+ * If we ever get 0xff back from kernel inquiry, then we probably
+ * have bug in our buffers to OS page mapping code here.
+ */
+ memset(os_page_status, 0xff, sizeof(int) * os_page_count);
- /* Query NUMA status for all the pointers */
- if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
- elog(ERROR, "failed NUMA pages inquiry: %m");
+ /* Query NUMA status for all the pointers */
+ if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
+ elog(ERROR, "failed NUMA pages inquiry: %m");
+ }
/* Initialize the multi-call context, load entries about buffers */
@@ -455,6 +465,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
INT4OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
+ fctx->include_numa = include_numa;
/*
* Each buffer needs at least one entry, but it might be offset in
@@ -473,7 +484,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
- if (firstNumaTouch)
+ if (include_numa && firstNumaTouch)
elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
/*
@@ -513,7 +524,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
fctx->record[idx].bufferid = bufferid;
fctx->record[idx].page_num = page_num;
- fctx->record[idx].numa_node = os_page_status[page_num];
+ fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
/* advance to the next entry/page */
++idx;
@@ -521,14 +532,18 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
}
}
- Assert((idx >= os_page_count) && (idx <= max_entries));
+ Assert(idx <= max_entries);
+
+ if (include_numa)
+ Assert(idx >= os_page_count);
/* Set max calls and remember the user function context. */
funcctx->max_calls = idx;
funcctx->user_fctx = fctx;
- /* Remember this backend touched the pages */
- firstNumaTouch = false;
+ /* Remember this backend touched the pages (only relevant for NUMA) */
+ if (include_numa)
+ firstNumaTouch = false;
}
funcctx = SRF_PERCALL_SETUP();
@@ -548,8 +563,16 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
values[1] = Int64GetDatum(fctx->record[i].page_num);
nulls[1] = false;
- values[2] = Int32GetDatum(fctx->record[i].numa_node);
- nulls[2] = false;
+ if (fctx->include_numa)
+ {
+ values[2] = Int32GetDatum(fctx->record[i].numa_node);
+ nulls[2] = false;
+ }
+ else
+ {
+ values[2] = (Datum) 0;
+ nulls[2] = true;
+ }
/* Build and return the tuple. */
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
@@ -561,6 +584,13 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
SRF_RETURN_DONE(funcctx);
}
+/* Entry point for extension. */
+Datum
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+{
+ return pg_buffercache_numa_pages_internal(fcinfo, true);
+}
+
Datum
pg_buffercache_summary(PG_FUNCTION_ARGS)
{
--
2.34.1
v9-0003-Add-pg_buffercache_os_pages-function-and-view.patchtext/x-diff; charset=us-asciiDownload
From 8ce991d8628910eac136afb631cb82807aacbe68 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 21 Nov 2025 05:17:20 +0000
Subject: [PATCH v9 3/3] Add pg_buffercache_os_pages function and view
ba2a3c2302f added a way to check if a buffer is spread across multiple pages.
Adding those new function and view so that one does not need NUMA support
enabled to get this information.
To do so this commit:
- renames pg_buffercache_numa_pages_internal() to pg_buffercache_os_pages_internal()
- keep pg_buffercache_numa_pages() as a backward compatibility wrapper
- re-create the pg_buffercache_numa view on top of pg_buffercache_os_pages using
true as argument
That way we avoid code duplication and the pg_buffercache_os_pages view
does not get performance penalties when NUMA is available.
---
contrib/pg_buffercache/Makefile | 2 +-
.../expected/pg_buffercache.out | 18 +++
contrib/pg_buffercache/meson.build | 1 +
.../pg_buffercache--1.6--1.7.sql | 34 +++++
contrib/pg_buffercache/pg_buffercache.control | 2 +-
contrib/pg_buffercache/pg_buffercache_pages.c | 51 +++++---
contrib/pg_buffercache/sql/pg_buffercache.sql | 8 ++
doc/src/sgml/pgbuffercache.sgml | 120 ++++++++++++++++--
src/tools/pgindent/typedefs.list | 2 +
9 files changed, 210 insertions(+), 28 deletions(-)
4.5% contrib/pg_buffercache/expected/
3.2% contrib/pg_buffercache/sql/
37.5% contrib/pg_buffercache/
54.1% doc/src/sgml/
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 5f748543e2e..0e618f66aec 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,7 @@ EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
- pg_buffercache--1.5--1.6.sql
+ pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..26c2d5f5710 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -8,6 +8,16 @@ from pg_buffercache;
t
(1 row)
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -28,6 +38,8 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
ERROR: permission denied for view pg_buffercache
+SELECT * FROM pg_buffercache_os_pages;
+ERROR: permission denied for view pg_buffercache_os_pages
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
ERROR: permission denied for function pg_buffercache_pages
SELECT * FROM pg_buffercache_summary();
@@ -43,6 +55,12 @@ SELECT count(*) > 0 FROM pg_buffercache;
t
(1 row)
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
+ ?column?
+----------
+ t
+(1 row)
+
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
?column?
----------
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 7cd039a1df9..7c31141881f 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -24,6 +24,7 @@ install_data(
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
'pg_buffercache--1.5--1.6.sql',
+ 'pg_buffercache--1.6--1.7.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
new file mode 100644
index 00000000000..3cfe723c0c2
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -0,0 +1,34 @@
+/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit
+
+-- Register the new function with boolean parameter
+-- This function is the core implementation for both OS pages and NUMA queries
+CREATE FUNCTION pg_buffercache_os_pages(IN include_numa boolean,
+ OUT bufferid integer,
+ OUT os_page_num bigint,
+ OUT numa_node integer)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_os_pages AS
+ SELECT bufferid, os_page_num
+ FROM pg_buffercache_os_pages(false);
+
+DROP VIEW pg_buffercache_numa;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_numa AS
+ SELECT bufferid, os_page_num, numa_node
+ FROM pg_buffercache_os_pages(true);
+
+REVOKE ALL ON FUNCTION pg_buffercache_os_pages(boolean) FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages(boolean) TO pg_monitor;
+GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index b030ba3a6fa..11499550945 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.6'
+default_version = '1.7'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index c8aeb01797c..ae6949a3e0b 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -26,7 +26,7 @@
#define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
-#define NUM_BUFFERCACHE_NUMA_ELEM 3
+#define NUM_BUFFERCACHE_OS_PAGES_ELEM 3
/*
* Get the maximum buffer cache entries needed.
@@ -73,14 +73,16 @@ typedef struct
} BufferCachePagesContext;
/*
- * Record structure holding the to be exposed cache data.
+ * Record structure holding the to be exposed cache data for OS pages.
+ * This structure is used by pg_buffercache_os_pages() which takes a
+ * boolean parameter to control whether NUMA information is included.
*/
typedef struct
{
uint32 bufferid;
int64 page_num;
int32 numa_node;
-} BufferCacheNumaRec;
+} BufferCacheOsPagesRec;
/*
* Function context for data persisting over repeated calls.
@@ -89,8 +91,8 @@ typedef struct
{
TupleDesc tupdesc;
bool include_numa;
- BufferCacheNumaRec *record;
-} BufferCacheNumaContext;
+ BufferCacheOsPagesRec *record;
+} BufferCacheOsPagesContext;
/*
@@ -98,6 +100,7 @@ typedef struct
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -319,8 +322,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
}
/*
- * Internal function to inquire about OS pages mappings for shared buffers,
- * with optional NUMA information.
+ * Inquire about OS pages mappings for shared buffers, with optional NUMA
+ * information.
*
* When 'include_numa' is:
* - false: Returns buffer to OS page mappings quickly, with numa_node as NULL.
@@ -338,11 +341,11 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
* return -2 (which indicates unmapped/unallocated pages).
*/
static Datum
-pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
+pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
{
FuncCallContext *funcctx;
MemoryContext oldcontext;
- BufferCacheNumaContext *fctx; /* User function context. */
+ BufferCacheOsPagesContext *fctx; /* User function context. */
TupleDesc tupledesc;
TupleDesc expected_tupledesc;
HeapTuple tuple;
@@ -447,12 +450,12 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
- fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
+ fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
- if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
+ if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
@@ -476,10 +479,10 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
*/
max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size);
- /* Allocate entries for BufferCachePagesRec records. */
- fctx->record = (BufferCacheNumaRec *)
+ /* Allocate entries for BufferCacheOsPagesRec records. */
+ fctx->record = (BufferCacheOsPagesRec *)
MemoryContextAllocHuge(CurrentMemoryContext,
- sizeof(BufferCacheNumaRec) * max_entries);
+ sizeof(BufferCacheOsPagesRec) * max_entries);
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
@@ -554,8 +557,8 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
if (funcctx->call_cntr < funcctx->max_calls)
{
uint32 i = funcctx->call_cntr;
- Datum values[NUM_BUFFERCACHE_NUMA_ELEM];
- bool nulls[NUM_BUFFERCACHE_NUMA_ELEM];
+ Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
+ bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
values[0] = Int32GetDatum(fctx->record[i].bufferid);
nulls[0] = false;
@@ -584,11 +587,23 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa)
SRF_RETURN_DONE(funcctx);
}
-/* Entry point for extension. */
+Datum
+pg_buffercache_os_pages(PG_FUNCTION_ARGS)
+{
+ bool include_numa;
+
+ /* Get the boolean parameter that controls NUMA behavior */
+ include_numa = PG_GETARG_BOOL(0);
+
+ return pg_buffercache_os_pages_internal(fcinfo, include_numa);
+}
+
+/* Backward compatibility wrapper. */
Datum
pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
- return pg_buffercache_numa_pages_internal(fcinfo, true);
+ /* Call internal function with include_numa=true */
+ return pg_buffercache_os_pages_internal(fcinfo, true);
}
Datum
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..3c70ee9ef4a 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -5,6 +5,12 @@ select count(*) = (select setting::bigint
where name = 'shared_buffers')
from pg_buffercache;
+-- For pg_buffercache_os_pages, we expect at least one entry for each buffer
+select count(*) >= (select setting::bigint
+ from pg_settings
+ where name = 'shared_buffers')
+from pg_buffercache_os_pages;
+
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
buffers_pinned <= buffers_used
@@ -16,6 +22,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
-- having to create a dedicated user, use the pg_database_owner pseudo-role.
SET ROLE pg_database_owner;
SELECT * FROM pg_buffercache;
+SELECT * FROM pg_buffercache_os_pages;
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
SELECT * FROM pg_buffercache_summary();
SELECT * FROM pg_buffercache_usage_counts();
@@ -24,6 +31,7 @@ RESET role;
-- Check that pg_monitor is allowed to query view / function
SET ROLE pg_monitor;
SELECT count(*) > 0 FROM pg_buffercache;
+SELECT count(*) > 0 FROM pg_buffercache_os_pages;
SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 99ad2e68785..67f96e289b2 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -46,8 +46,9 @@
<para>
This module provides the <function>pg_buffercache_pages()</function>
function (wrapped in the <structname>pg_buffercache</structname> view), the
- <function>pg_buffercache_numa_pages()</function> function (wrapped in the
- <structname>pg_buffercache_numa</structname> view), the
+ <function>pg_buffercache_os_pages()</function> function (wrapped in the
+ <structname>pg_buffercache_os_pages</structname> and
+ <structname>pg_buffercache_numa</structname> views), the
<function>pg_buffercache_summary()</function> function, the
<function>pg_buffercache_usage_counts()</function> function, the
<function>pg_buffercache_evict()</function> function, the
@@ -63,12 +64,14 @@
</para>
<para>
- The <function>pg_buffercache_numa_pages()</function> function provides
- <acronym>NUMA</acronym> node mappings for shared buffer entries. This
- information is not part of <function>pg_buffercache_pages()</function>
- itself, as it is much slower to retrieve.
- The <structname>pg_buffercache_numa</structname> view wraps the function for
- convenient use.
+ The <function>pg_buffercache_os_pages()</function> function provides OS
+ pages mappings for shared buffer entries. When its argument is <literal>true</literal>,
+ it also provides <acronym>NUMA</acronym> node mappings for shared buffer entries (
+ this information is not part of <function>pg_buffercache_pages()</function>
+ itself, as it is much slower to retrieve).
+ The <structname>pg_buffercache_os_pages</structname> and <structname>pg_buffercache_numa</structname>
+ views wrap the function for convenient use (with its argument set to <literal>false</literal>
+ and <literal>true</literal> respectively).
</para>
<para>
@@ -242,6 +245,52 @@
</para>
</sect2>
+ <sect2 id="pgbuffercache-pg-buffercache-os-pages">
+ <title>The <structname>pg_buffercache_os_pages</structname> View</title>
+
+ <para>
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>.
+ </para>
+
+ <table id="pgbuffercache-os-pages-columns">
+ <title><structname>pg_buffercache_os_pages</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>bufferid</structfield> <type>integer</type>
+ </para>
+ <para>
+ ID, in the range 1..<varname>shared_buffers</varname>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>os_page_num</structfield> <type>bigint</type>
+ </para>
+ <para>
+ Number of OS memory page for this buffer
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ </sect2>
+
<sect2 id="pgbuffercache-pg-buffercache-numa">
<title>The <structname>pg_buffercache_numa</structname> View</title>
@@ -558,6 +607,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
public | spgist_text_tbl | 182
(10 rows)
+regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count
+ FROM (
+ SELECT bufferid, COUNT(*) as pages_per_buffer
+ FROM pg_buffercache_os_pages
+ GROUP BY bufferid
+ )
+ GROUP BY pages_per_buffer
+ ORDER BY pages_per_buffer;
+
+ pages_per_buffer | buffer_count
+------------------+--------------
+ 1 | 261120
+ 2 | 1024
+(2 rows)
+
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+ FROM pg_buffercache b JOIN pg_class c
+ ON b.relfilenode = pg_relation_filenode(c.oid) AND
+ b.reldatabase IN (0, (SELECT oid FROM pg_database
+ WHERE datname = current_database()))
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ JOIN (SELECT bufferid FROM pg_buffercache_os_pages
+ GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+ GROUP BY n.nspname, c.relname
+ ORDER BY 3 DESC
+ LIMIT 10;
+
+ nspname | relname | buffers_on_multiple_pages
+------------+------------------------------+---------------------------
+ public | delete_test_table | 3
+ public | gin_test_idx | 2
+ pg_catalog | pg_depend | 2
+ public | quad_poly_tbl | 2
+ pg_catalog | pg_depend_reference_index | 1
+ pg_catalog | pg_index_indexrelid_index | 1
+ pg_catalog | pg_constraint_contypid_index | 1
+ pg_catalog | pg_statistic | 1
+ pg_catalog | pg_depend_depender_index | 1
+ pg_catalog | pg_operator | 1
+(10 rows)
+
+
+ nspname | relname | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public | gin_test_tbl | 4
+ public | delete_test_table | 4
+ public | tenk1 | 4
+ pg_catalog | pg_attribute_relid_attnum_index | 4
+ pg_catalog | pg_class | 2
+ pg_catalog | pg_depend_depender_index | 2
+ pg_catalog | pg_attribute | 2
+ pg_catalog | pg_opfamily | 2
+ pg_catalog | pg_opclass_oid_index | 2
+ pg_catalog | pg_description | 2
+(10 rows)
regression=# SELECT * FROM pg_buffercache_summary();
buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 0d1ea4ec63d..f05872d6860 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -340,6 +340,8 @@ BufferAccessStrategy
BufferAccessStrategyType
BufferCacheNumaContext
BufferCacheNumaRec
+BufferCacheOsPagesContext
+BufferCacheOsPagesRec
BufferCachePagesContext
BufferCachePagesRec
BufferDesc
--
2.34.1
On Sun, Nov 23, 2025 at 09:15:31AM +0000, Bertrand Drouvot wrote:
Attached a rebase due to 7d9043aee80. Also 0003 has a minor change (as compared
to v8-0004) to avoid this error when creating the 1.6 version with the new code:
Yes, sorry, I forgot to mention that part. I have played with the
patch for a couple of hours, fixed a couple of issues, rewording and
tweaking things while browsing the whole (typedefs.list was incorrect,
docs were partially incorrect), and applied the result. I did not see
a point in 0001, as well, because the refactored "internal" function
we'd have just one caller for the proposed macro and function.
The original function pg_buffercache_numa_pages could be dropped when
upgrading to v1.7 now that the view pg_buffercache_numa relies on the
new SQL function pg_buffercache_os_pages(boolean), but I could not be
really excited about that.. We could add a DROP FUNCTION, of course.
By the way, thanks for the effort of splitting up things. This was
super useful for the review and when dealing with each part of the
proposed changes.
--
Michael