From 1d3516efd6ee1c18a82f56b87c5afc3d54fb62cf Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Thu, 10 Apr 2025 05:49:45 +0000
Subject: [PATCH v1] Add os_page_num to pg_buffercache

ba2a3c2302f added a way to check if a buffer is spread across multiple pages.

Adding the same information in pg_buffercache so that one does not need NUMA
support enabled to get this information.
---
 .../expected/pg_buffercache.out               |   2 +-
 .../pg_buffercache--1.5--1.6.sql              |   7 ++
 contrib/pg_buffercache/pg_buffercache_pages.c | 118 ++++++++++++++----
 contrib/pg_buffercache/sql/pg_buffercache.sql |   2 +-
 doc/src/sgml/pgbuffercache.sgml               |  34 +++++
 5 files changed, 135 insertions(+), 28 deletions(-)

diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..52e38dcc027 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -1,5 +1,5 @@
 CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
                    from pg_settings
                    where name = 'shared_buffers')
 from pg_buffercache;
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
index 458f054a691..8aa81d15688 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -44,3 +44,10 @@ CREATE FUNCTION pg_buffercache_evict_all(
     OUT buffers_skipped int4)
 AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all'
 LANGUAGE C PARALLEL SAFE VOLATILE;
+
+-- Upgrade view to 1.6. format
+CREATE OR REPLACE VIEW pg_buffercache AS
+    SELECT P.* FROM pg_buffercache_pages() AS P
+    (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+     relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+     pinning_backends int4, os_page_num bigint);
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index e1701bd56ef..b941aba2b7e 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -19,7 +19,7 @@
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM 10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 #define NUM_BUFFERCACHE_EVICT_ELEM 2
@@ -54,6 +54,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int64		page_num;
 } BufferCachePagesRec;
 
 
@@ -63,6 +64,9 @@ typedef struct
 typedef struct
 {
 	TupleDesc	tupdesc;
+	int			buffers_per_page;
+	int			pages_per_buffer;
+	int			os_page_size;
 	BufferCachePagesRec *record;
 } BufferCachePagesContext;
 
@@ -119,8 +123,25 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 
 	if (SRF_IS_FIRSTCALL())
 	{
-		int			i;
+		int			i,
+					idx;
+		Size		os_page_size;
+		char		*startptr;
+		int			pages_per_buffer;
+		int			max_entries;
 
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
+		 * the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to: 1. Determine the OS memory
+		 * page size 2. Calculate how many OS pages are used by all buffer blocks
+		 * 3. Calculate how many OS pages are contained within each database
+		 * block.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
 		funcctx = SRF_FIRSTCALL_INIT();
 
 		/* Switch context when allocating stuff to be used in later calls */
@@ -163,24 +184,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
 						   INT2OID, -1, 0);
 
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1))
 			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 							   INT4OID, -1, 0);
 
+		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+			TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num",
+							   INT8OID, -1, 0);
+
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
 
-		/* Allocate NBuffers worth of BufferCachePagesRec records. */
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one pass,
+		 * without reallocating memory.
+		 */
+		pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
+		max_entries = NBuffers * pages_per_buffer;
+
+		/* Allocate entries for BufferCachePagesRec records. */
 		fctx->record = (BufferCachePagesRec *)
 			MemoryContextAllocHuge(CurrentMemoryContext,
-								   sizeof(BufferCachePagesRec) * NBuffers);
-
-		/* Set max calls and remember the user function context. */
-		funcctx->max_calls = NBuffers;
-		funcctx->user_fctx = fctx;
+								   sizeof(BufferCachePagesRec) * max_entries);
 
 		/* Return to original context when allocating transient memory */
 		MemoryContextSwitchTo(oldcontext);
 
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
 		 * fctx->record structure.
@@ -191,35 +224,65 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < NBuffers; i++)
 		{
+			char       *buffptr = (char *) BufferGetBlock(i + 1);
 			BufferDesc *bufHdr;
 			uint32		buf_state;
+			int32		page_num;
+			char		*startptr_buff,
+						*endptr_buff;
 
 			bufHdr = GetBufferDescriptor(i);
 			/* Lock each buffer header before inspecting. */
 			buf_state = LockBufHdr(bufHdr);
 
-			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+			/* start of the first page of this buffer */
+			startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
 
-			if (buf_state & BM_DIRTY)
-				fctx->record[i].isdirty = true;
-			else
-				fctx->record[i].isdirty = false;
+			/* end of the buffer (no need to align to memory page) */
+			endptr_buff = buffptr + BLCKSZ;
 
-			/* Note if the buffer is valid, and has storage created */
-			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-				fctx->record[i].isvalid = true;
-			else
-				fctx->record[i].isvalid = false;
+			Assert(startptr_buff < endptr_buff);
+
+			/* calculate ID of the first page for this buffer */
+			page_num = (startptr_buff - startptr) / os_page_size;
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr);
+				fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+				fctx->record[idx].reltablespace = bufHdr->tag.spcOid;
+				fctx->record[idx].reldatabase = bufHdr->tag.dbOid;
+				fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag);
+				fctx->record[idx].blocknum = bufHdr->tag.blockNum;
+				fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+				fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+				if (buf_state & BM_DIRTY)
+					fctx->record[idx].isdirty = true;
+				else
+					fctx->record[idx].isdirty = false;
+
+				/* Note if the buffer is valid, and has storage created */
+				if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+					fctx->record[idx].isvalid = true;
+				else
+					fctx->record[idx].isvalid = false;
+
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
 
 			UnlockBufHdr(bufHdr, buf_state);
 		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
@@ -252,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			nulls[7] = true;
 			/* unused for v1.0 callers, but the array is always long enough */
 			nulls[8] = true;
+			nulls[9] = true;
 		}
 		else
 		{
@@ -272,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			/* unused for v1.0 callers, but the array is always long enough */
 			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
+			values[9] = Int64GetDatum(fctx->record[i].page_num);
+			nulls[9] = false;
 		}
 
 		/* Build and return the tuple. */
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..78ec7a88ea0 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -1,6 +1,6 @@
 CREATE EXTENSION pg_buffercache;
 
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
                    from pg_settings
                    where name = 'shared_buffers')
 from pg_buffercache;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 537d6014942..6183adf73db 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -205,6 +205,15 @@
        Number of backends pinning this buffer
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>os_page_num</structfield> <type>bigint</type>
+      </para>
+      <para>
+       number of OS memory page for this buffer
+      </para></entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
@@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
  public     | spgist_text_tbl        |     182
 (10 rows)
 
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+             FROM pg_buffercache b JOIN pg_class c
+             ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                b.reldatabase IN (0, (SELECT oid FROM pg_database
+                                      WHERE datname = current_database()))
+             JOIN pg_namespace n ON n.oid = c.relnamespace
+             JOIN (SELECT bufferid FROM pg_buffercache
+                   GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+             GROUP BY n.nspname, c.relname
+             ORDER BY 3 DESC
+             LIMIT 10;
+
+  nspname   |             relname             | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public     | gin_test_tbl                    |                         4
+ public     | delete_test_table               |                         4
+ public     | tenk1                           |                         4
+ pg_catalog | pg_attribute_relid_attnum_index |                         4
+ pg_catalog | pg_class                        |                         2
+ pg_catalog | pg_depend_depender_index        |                         2
+ pg_catalog | pg_attribute                    |                         2
+ pg_catalog | pg_opfamily                     |                         2
+ pg_catalog | pg_opclass_oid_index            |                         2
+ pg_catalog | pg_description                  |                         2
+(10 rows)
 
 regression=# SELECT * FROM pg_buffercache_summary();
  buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
-- 
2.34.1

