Use streaming read API in pgstattuple.

Started by Kirill Reshkeabout 1 year ago8 messages

reshkekirill@gmail.com

about 1 year ago

1 attachment(s)

Hi!

While reviewing other threads implementing stream API for various core
subsystems, I spotted that pgstattuple could also benefit from that.
So, PFA.

Notice refactoring around pgstat_hash_page and changes in pgstat_page
signature. This is needed because pgstat_hash_tuple uses
_hash_getbuf_with_strategy rather than ReadBufferExtended, which is
OK, but makes adapting the streaming API impossible. So, I changed
this place. Old codes should behave exactly the same way as new ones.
I eliminated the additional check that _hash_getbuf_with_strategy
performed, which was blkno == P_NEW. However, since the streaming API
already delivers the right buffer, I think it's okay.

This change behaves sanely on my by-hand testing.

--
Best regards,
Kirill Reshke

Attachments:

v1-0001-Use-stream-read-interface-for-pgstattuple-routine.patchapplication/octet-stream; name=v1-0001-Use-stream-read-interface-for-pgstattuple-routine.patchDownload

From 5685f97cf59311922129ba6cd124e85c624846c5 Mon Sep 17 00:00:00 2001
From: reshke kirill <reshke@double.cloud>
Date: Mon, 25 Nov 2024 18:03:44 +0000
Subject: [PATCH v1] Use stream read interface for pgstattuple routines.

Patch implements new streaming read API for pgstattuple contrib
extension.

This is not perfomance speedup patch.
This patch enables this extension to benefit from stream API in the future
without introducing any performance improvements.
---
 contrib/pgstattuple/pgstattuple.c | 55 ++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c
index 48cb8f59c4f..cea3aa83619 100644
--- a/contrib/pgstattuple/pgstattuple.c
+++ b/contrib/pgstattuple/pgstattuple.c
@@ -61,22 +61,18 @@ typedef struct pgstattuple_type
 	uint64		free_space;		/* free/reusable space in bytes */
 } pgstattuple_type;
 
-typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber,
-							 BufferAccessStrategy);
+typedef void (*pgstat_page) (pgstattuple_type *, Relation, Buffer);
 
 static Datum build_pgstattuple_type(pgstattuple_type *stat,
 									FunctionCallInfo fcinfo);
 static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo);
 static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo);
 static void pgstat_btree_page(pgstattuple_type *stat,
-							  Relation rel, BlockNumber blkno,
-							  BufferAccessStrategy bstrategy);
+							  Relation rel, Buffer buf);
 static void pgstat_hash_page(pgstattuple_type *stat,
-							 Relation rel, BlockNumber blkno,
-							 BufferAccessStrategy bstrategy);
+							 Relation rel, Buffer buf);
 static void pgstat_gist_page(pgstattuple_type *stat,
-							 Relation rel, BlockNumber blkno,
-							 BufferAccessStrategy bstrategy);
+							 Relation rel, Buffer buf);
 static Datum pgstat_index(Relation rel, BlockNumber start,
 						  pgstat_page pagefn, FunctionCallInfo fcinfo);
 static void pgstat_index_page(pgstattuple_type *stat, Page page,
@@ -405,13 +401,10 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
  * pgstat_btree_page -- check tuples in a btree page
  */
 static void
-pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				  BufferAccessStrategy bstrategy)
+pgstat_btree_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
 	LockBuffer(buf, BT_READ);
 	page = BufferGetPage(buf);
 
@@ -449,13 +442,15 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
  * pgstat_hash_page -- check tuples in a hash page
  */
 static void
-pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				 BufferAccessStrategy bstrategy)
+pgstat_hash_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
+	LockBuffer(buf, HASH_READ);
+
+	/* ref count and lock type are correct */
+
+	_hash_checkpage(rel, buf, 0);
 	page = BufferGetPage(buf);
 
 	if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData)))
@@ -491,13 +486,10 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
  * pgstat_gist_page -- check tuples in a gist page
  */
 static void
-pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				 BufferAccessStrategy bstrategy)
+pgstat_gist_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
 	LockBuffer(buf, GIST_SHARE);
 	gistcheckpage(rel, buf);
 	page = BufferGetPage(buf);
@@ -526,10 +518,22 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn,
 	BlockNumber blkno;
 	BufferAccessStrategy bstrategy;
 	pgstattuple_type stat = {0};
+	Buffer		buf;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream = NULL;
 
 	/* prepare access strategy for this index */
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
+	p.current_blocknum = start;
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	blkno = start;
 	for (;;)
 	{
@@ -546,14 +550,21 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn,
 			break;
 		}
 
-		for (; blkno < nblocks; blkno++)
+		while(BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
 		{
 			CHECK_FOR_INTERRUPTS();
 
-			pagefn(&stat, rel, blkno, bstrategy);
+			pagefn(&stat, rel, buf);
 		}
+		/*
+		 * After reaching the end we have to reset stream to use it again.
+		 * Extra restart in case of just one iteration does not cost us much.
+		 */
+		read_stream_reset(stream);
 	}
 
+	read_stream_end(stream);
+
 	relation_close(rel, AccessShareLock);
 
 	return build_pgstattuple_type(&stat, fcinfo);
-- 
2.34.1

Nazir Bilal Yavuz

byavuz81@gmail.com

about 1 year ago

In reply to: Kirill Reshke (#1)

Re: Use streaming read API in pgstattuple.

Hi,

Thank you for working on this!

On Mon, 25 Nov 2024 at 21:17, Kirill Reshke <reshkekirill@gmail.com> wrote:

While reviewing other threads implementing stream API for various core
subsystems, I spotted that pgstattuple could also benefit from that.
So, PFA.

Notice refactoring around pgstat_hash_page and changes in pgstat_page
signature. This is needed because pgstat_hash_tuple uses
_hash_getbuf_with_strategy rather than ReadBufferExtended, which is
OK, but makes adapting the streaming API impossible. So, I changed
this place. Old codes should behave exactly the same way as new ones.
I eliminated the additional check that _hash_getbuf_with_strategy
performed, which was blkno == P_NEW. However, since the streaming API
already delivers the right buffer, I think it's okay.

I agree that this looks okay.

This change behaves sanely on my by-hand testing.

I encountered some problems while playing with your patch. This query
[1]: CREATE EXTENSION pgstattuple; create table test (a int primary key, b int[]); create index test_hashidx on test using hash (b); select pgstattuple(oid) from pg_class where relname = 'test_hashidx';
failing before. I looked at the code and found that:

=== 1

blkno = start;
for (;;)
{
/* Get the current relation length */
LockRelationForExtension(rel, ExclusiveLock);
nblocks = RelationGetNumberOfBlocks(rel);
UnlockRelationForExtension(rel, ExclusiveLock);

I think you need to set p.last_exclusive to nblocks here.

=== 2

-        for (; blkno < nblocks; blkno++)
+        while(BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
         {
             CHECK_FOR_INTERRUPTS();

-            pagefn(&stat, rel, blkno, bstrategy);
+            pagefn(&stat, rel, buf);
         }

blkno doesn't get increased now and this causes an infinite loop.

===

Also, since this problem couldn't be found by the CI, you may want to
add a test for the pgstat_index function.

[1]: CREATE EXTENSION pgstattuple; create table test (a int primary key, b int[]); create index test_hashidx on test using hash (b); select pgstattuple(oid) from pg_class where relname = 'test_hashidx';
key, b int[]); create index test_hashidx on test using hash (b);
select pgstattuple(oid) from pg_class where relname = 'test_hashidx';

--
Regards,
Nazir Bilal Yavuz
Microsoft

Kirill Reshke

reshkekirill@gmail.com

about 1 year ago

In reply to: Nazir Bilal Yavuz (#2)

1 attachment(s)

Re: Use streaming read API in pgstattuple.

On Tue, 26 Nov 2024 at 15:39, Nazir Bilal Yavuz <byavuz81@gmail.com> wrote:

Hi,

Thank you for working on this!

On Mon, 25 Nov 2024 at 21:17, Kirill Reshke <reshkekirill@gmail.com> wrote:

While reviewing other threads implementing stream API for various core
subsystems, I spotted that pgstattuple could also benefit from that.
So, PFA.

Notice refactoring around pgstat_hash_page and changes in pgstat_page
signature. This is needed because pgstat_hash_tuple uses
_hash_getbuf_with_strategy rather than ReadBufferExtended, which is
OK, but makes adapting the streaming API impossible. So, I changed
this place. Old codes should behave exactly the same way as new ones.
I eliminated the additional check that _hash_getbuf_with_strategy
performed, which was blkno == P_NEW. However, since the streaming API
already delivers the right buffer, I think it's okay.

I agree that this looks okay.

This change behaves sanely on my by-hand testing.

I encountered some problems while playing with your patch. This query
[1] fails when the patch is applied although CI passes, it wasn't
failing before. I looked at the code and found that:

=== 1

blkno = start;
for (;;)
{
/* Get the current relation length */
LockRelationForExtension(rel, ExclusiveLock);
nblocks = RelationGetNumberOfBlocks(rel);
UnlockRelationForExtension(rel, ExclusiveLock);

I think you need to set p.last_exclusive to nblocks here.

=== 2
-        for (; blkno < nblocks; blkno++)
+        while(BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
{
CHECK_FOR_INTERRUPTS();
-            pagefn(&stat, rel, blkno, bstrategy);
+            pagefn(&stat, rel, buf);
}
blkno doesn't get increased now and this causes an infinite loop.

===

Also, since this problem couldn't be found by the CI, you may want to
add a test for the pgstat_index function.

[1] CREATE EXTENSION pgstattuple; create table test (a int primary
key, b int[]); create index test_hashidx on test using hash (b);
select pgstattuple(oid) from pg_class where relname = 'test_hashidx';

--
Regards,
Nazir Bilal Yavuz
Microsoft

Hello! Thank you for taking a peek. Your review comments have been
corrected. Since my changes were wrong, I honestly don't know why this
worked in version 1. By a miracle.

As for CI, i rechecked v1:

```
db2=#
select * from pgstathashindex('test_hashidx');
version | bucket_pages | overflow_pages | bitmap_pages | unused_pages
| live_items | dead_items | free_percent
---------+--------------+----------------+--------------+--------------+------------+------------+--------------
4 | 4 | 0 | 1 | 0
| 0 | 0 | 100
(1 row)

db2=#
select * from pgstattuple('test_hashidx');
ERROR: could not read blocks 6..16 in file "base/16454/16473": read
only 0 of 90112 bytes
```

In v2 this behaves correctly.

Should we add `pgstattuple(...)` after `pgstat*type*index(..)`
everywhere in pgstattuple regression tests?

P.S.
I want to mention something that I forgot in my initial email: I
reused codes from here https://commitfest.postgresql.org/50/5327/ with
some adaptation changes. So, Andrey is the author of this patch too.

--
Best regards,
Kirill Reshke

Attachments:

v2-0001-Use-stream-read-interface-for-pgstattuple-routine.patchapplication/octet-stream; name=v2-0001-Use-stream-read-interface-for-pgstattuple-routine.patchDownload

From 72dbd541e8efdbaa7340df871823da71f3c6e0fa Mon Sep 17 00:00:00 2001
From: reshke kirill <reshke@double.cloud>
Date: Mon, 25 Nov 2024 18:03:44 +0000
Subject: [PATCH v2] Use stream read interface for pgstattuple routines.

Patch implements new streaming read API for pgstattuple contrib
extension.

This is not perfomance speedup patch.
This patch enables this extension to benefit from stream API in the future
without introducing any performance improvements.
---
 contrib/pgstattuple/pgstattuple.c | 64 +++++++++++++++++++------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c
index 48cb8f59c4f..79b0157b9c0 100644
--- a/contrib/pgstattuple/pgstattuple.c
+++ b/contrib/pgstattuple/pgstattuple.c
@@ -61,22 +61,18 @@ typedef struct pgstattuple_type
 	uint64		free_space;		/* free/reusable space in bytes */
 } pgstattuple_type;
 
-typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber,
-							 BufferAccessStrategy);
+typedef void (*pgstat_page) (pgstattuple_type *, Relation, Buffer);
 
 static Datum build_pgstattuple_type(pgstattuple_type *stat,
 									FunctionCallInfo fcinfo);
 static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo);
 static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo);
 static void pgstat_btree_page(pgstattuple_type *stat,
-							  Relation rel, BlockNumber blkno,
-							  BufferAccessStrategy bstrategy);
+							  Relation rel, Buffer buf);
 static void pgstat_hash_page(pgstattuple_type *stat,
-							 Relation rel, BlockNumber blkno,
-							 BufferAccessStrategy bstrategy);
+							 Relation rel, Buffer buf);
 static void pgstat_gist_page(pgstattuple_type *stat,
-							 Relation rel, BlockNumber blkno,
-							 BufferAccessStrategy bstrategy);
+							 Relation rel, Buffer buf);
 static Datum pgstat_index(Relation rel, BlockNumber start,
 						  pgstat_page pagefn, FunctionCallInfo fcinfo);
 static void pgstat_index_page(pgstattuple_type *stat, Page page,
@@ -405,13 +401,10 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
  * pgstat_btree_page -- check tuples in a btree page
  */
 static void
-pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				  BufferAccessStrategy bstrategy)
+pgstat_btree_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
 	LockBuffer(buf, BT_READ);
 	page = BufferGetPage(buf);
 
@@ -449,13 +442,15 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
  * pgstat_hash_page -- check tuples in a hash page
  */
 static void
-pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				 BufferAccessStrategy bstrategy)
+pgstat_hash_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
+	LockBuffer(buf, HASH_READ);
+
+	/* ref count and lock type are correct */
+
+	_hash_checkpage(rel, buf, 0);
 	page = BufferGetPage(buf);
 
 	if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData)))
@@ -491,13 +486,10 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
  * pgstat_gist_page -- check tuples in a gist page
  */
 static void
-pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				 BufferAccessStrategy bstrategy)
+pgstat_gist_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
 	LockBuffer(buf, GIST_SHARE);
 	gistcheckpage(rel, buf);
 	page = BufferGetPage(buf);
@@ -523,14 +515,24 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn,
 			 FunctionCallInfo fcinfo)
 {
 	BlockNumber nblocks;
-	BlockNumber blkno;
 	BufferAccessStrategy bstrategy;
 	pgstattuple_type stat = {0};
+	Buffer		buf;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream = NULL;
 
 	/* prepare access strategy for this index */
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
-	blkno = start;
+	p.current_blocknum = start;
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (;;)
 	{
 		/* Get the current relation length */
@@ -539,21 +541,33 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn,
 		UnlockRelationForExtension(rel, ExclusiveLock);
 
 		/* Quit if we've scanned the whole relation */
-		if (blkno >= nblocks)
+		if (p.current_blocknum >= nblocks)
 		{
 			stat.table_len = (uint64) nblocks * BLCKSZ;
 
 			break;
 		}
 
-		for (; blkno < nblocks; blkno++)
+		p.last_exclusive = nblocks;
+
+		while (BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
 		{
 			CHECK_FOR_INTERRUPTS();
 
-			pagefn(&stat, rel, blkno, bstrategy);
+			pagefn(&stat, rel, buf);
 		}
+
+		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+
+		/*
+		 * After reaching the end we have to reset stream to use it again.
+		 * Extra restart in case of just one iteration does not cost us much.
+		 */
+		read_stream_reset(stream);
 	}
 
+	read_stream_end(stream);
+
 	relation_close(rel, AccessShareLock);
 
 	return build_pgstattuple_type(&stat, fcinfo);
-- 
2.34.1

Nazir Bilal Yavuz

byavuz81@gmail.com

about 1 year ago

In reply to: Kirill Reshke (#3)

Re: Use streaming read API in pgstattuple.

Hi,

Hello! Thank you for taking a peek. Your review comments have been
corrected. Since my changes were wrong, I honestly don't know why this
worked in version 1. By a miracle.

As for CI, i rechecked v1:

```
db2=#
select * from pgstathashindex('test_hashidx');
version | bucket_pages | overflow_pages | bitmap_pages | unused_pages
| live_items | dead_items | free_percent
---------+--------------+----------------+--------------+--------------+------------+------------+--------------
4 | 4 | 0 | 1 | 0
| 0 | 0 | 100
(1 row)

db2=#
select * from pgstattuple('test_hashidx');
ERROR: could not read blocks 6..16 in file "base/16454/16473": read
only 0 of 90112 bytes
```

In v2 this behaves correctly.

Yes, I confirm that it works.

Should we add `pgstattuple(...)` after `pgstat*type*index(..)`
everywhere in pgstattuple regression tests?

Not everywhere but at least one pgstattuple(...) call for each index type.

There is one behavior change, it may not be important but I think it
is worth mentioning:

-        for (; blkno < nblocks; blkno++)
+        p.last_exclusive = nblocks;
+
+        while (BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
         {
             CHECK_FOR_INTERRUPTS();

-            pagefn(&stat, rel, blkno, bstrategy);
+            pagefn(&stat, rel, buf);
         }
+
+        Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);

With this change we assume that if stream returns an invalid buffer
that means stream must be finished. We don't check if the stream
didn't finish but somehow read an invalid buffer. In the upstream
version, if we read an invalid buffer then postgres server will fail.
But in the patched version, the server will continue to run because it
will think that stream has reached to end. This could be happening for
other streaming read users; for example at least the
read_stream_next_buffer() calls in the collect_corrupt_items()
function face the same issue.

--
Regards,
Nazir Bilal Yavuz
Microsoft

Matheus Alcantara

matheusssilv97@gmail.com

about 1 year ago

In reply to: Nazir Bilal Yavuz (#4)

Re: Use streaming read API in pgstattuple.

Hi,

On 29/11/24 04:28, Nazir Bilal Yavuz wrote:

-        for (; blkno < nblocks; blkno++)
+        p.last_exclusive = nblocks;
+
+        while (BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
{
CHECK_FOR_INTERRUPTS();
-            pagefn(&stat, rel, blkno, bstrategy);
+            pagefn(&stat, rel, buf);
}
+
+        Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
With this change we assume that if stream returns an invalid buffer
that means stream must be finished. We don't check if the stream
didn't finish but somehow read an invalid buffer. In the upstream
version, if we read an invalid buffer then postgres server will fail.
But in the patched version, the server will continue to run because it
will think that stream has reached to end. This could be happening for
other streaming read users; for example at least the
read_stream_next_buffer() calls in the collect_corrupt_items()
function face the same issue.

Just for reference; On pg_prewarm() for example this loop is
implemented as:

for (block = first_block; block <= last_block; ++block)
{
Buffer buf;
...
buf = read_stream_next_buffer(stream, NULL);
...
}
Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);

Would this approach make sense on these cases? (this patch and on
collect_corrupt_items)

--
Matheus Alcantara

Nazir Bilal Yavuz

byavuz81@gmail.com

about 1 year ago

In reply to: Matheus Alcantara (#5)

Re: Use streaming read API in pgstattuple.

Hi,

On Fri, 29 Nov 2024 at 20:05, Matheus Alcantara
<matheusssilv97@gmail.com> wrote:

Hi,

On 29/11/24 04:28, Nazir Bilal Yavuz wrote:
-        for (; blkno < nblocks; blkno++)
+        p.last_exclusive = nblocks;
+
+        while (BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
{
CHECK_FOR_INTERRUPTS();
-            pagefn(&stat, rel, blkno, bstrategy);
+            pagefn(&stat, rel, buf);
}
+
+        Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
With this change we assume that if stream returns an invalid buffer
that means stream must be finished. We don't check if the stream
didn't finish but somehow read an invalid buffer. In the upstream
version, if we read an invalid buffer then postgres server will fail.
But in the patched version, the server will continue to run because it
will think that stream has reached to end. This could be happening for
other streaming read users; for example at least the
read_stream_next_buffer() calls in the collect_corrupt_items()
function face the same issue.
Just for reference; On pg_prewarm() for example this loop is
implemented as:

for (block = first_block; block <= last_block; ++block)
{
Buffer buf;
...
buf = read_stream_next_buffer(stream, NULL);
...
}
Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);

Would this approach make sense on these cases? (this patch and on
collect_corrupt_items)

Yes, that should work and I guess it would be easy to apply this
approach to this patch but it would be hard to apply this to
collect_corrupt_items().

In cases like pg_prewarm or the current scenario, we know the exact
number of blocks to read, which allows us to determine when the stream
should finish (e.g., after the loop runs blockno times) and return an
invalid buffer. But in collect_corrupt_items(), the number of loop
iterations required is not directly tied to blockno since some blocks
may be skipped. This makes it difficult to know when the stream should
terminate and return an invalid buffer.

--
Regards,
Nazir Bilal Yavuz
Microsoft

Kirill Reshke

reshkekirill@gmail.com

about 1 year ago

In reply to: Matheus Alcantara (#5)

1 attachment(s)

Re: Use streaming read API in pgstattuple.

On Fri, 29 Nov 2024 at 22:05, Matheus Alcantara
<matheusssilv97@gmail.com> wrote:

Just for reference; On pg_prewarm() for example this loop is
implemented as:

for (block = first_block; block <= last_block; ++block)
{
Buffer buf;
...
buf = read_stream_next_buffer(stream, NULL);
...
}
Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);

Would this approach make sense on these cases? (this patch and on
collect_corrupt_items)

Hi!
Thank you for your input. I applied this change for this patch.

I did also add more tests to the pgstattuple regression test, as Nazir
suggested. It turns out that pgstattuple works on GIST indexes, but
not GIN, while `pgstatginindex` exists and `pgstatgistindex` does not.
It took me a while to process the reasons. But those reasons worth
another thread & patch, so no changes here.

--
Best regards,
Kirill Reshke

Attachments:

v3-0001-Use-stream-read-interface-for-pgstattuple-routine.patchapplication/octet-stream; name=v3-0001-Use-stream-read-interface-for-pgstattuple-routine.patchDownload

From c64d25854f09301d0546748983216af6d7fb2c93 Mon Sep 17 00:00:00 2001
From: reshke kirill <reshke@double.cloud>
Date: Mon, 25 Nov 2024 18:03:44 +0000
Subject: [PATCH v3] Use stream read interface for pgstattuple routines.

Patch implements new streaming read API for pgstattuple contrib
extension.

This is not perfomance speedup patch.
This patch enables this extension to benefit from stream API in the future
without introducing any performance improvements.

Author: Andrey Borodin <amborodin@acm.org>
Author: Kirill Reshke <reshkekirill@gmail.com>
Reviewed-By: Matheus Alcantara <matheusssilv97@gmail.com>
Reviewed-By: Nazir Bilal Yavuz <byavuz81@gmail.com>
---
 contrib/pgstattuple/expected/pgstattuple.out | 21 +++++-
 contrib/pgstattuple/pgstattuple.c            | 68 ++++++++++++--------
 contrib/pgstattuple/sql/pgstattuple.sql      |  8 ++-
 3 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out
index 9176dc98b6a..79c792bd402 100644
--- a/contrib/pgstattuple/expected/pgstattuple.out
+++ b/contrib/pgstattuple/expected/pgstattuple.out
@@ -4,7 +4,7 @@ CREATE EXTENSION pgstattuple;
 -- the pgstattuple functions, but the results for empty tables and
 -- indexes should be that.
 --
-create table test (a int primary key, b int[]);
+create table test (a int primary key, b int[], c point);
 select * from pgstattuple('test');
  table_len | tuple_count | tuple_len | tuple_percent | dead_tuple_count | dead_tuple_len | dead_tuple_percent | free_space | free_percent 
 -----------+-------------+-----------+---------------+------------------+----------------+--------------------+------------+--------------
@@ -137,6 +137,7 @@ select * from pgstathashindex('test_hashidx');
        4 |            4 |              0 |            1 |            0 |          0 |          0 |          100
 (1 row)
 
+create index test_gistidx ON test USING gist(c);
 -- these should error with the wrong type
 select pgstatginindex('test_pkey');
 ERROR:  relation "test_pkey" is not a GIN index
@@ -150,6 +151,24 @@ select pgstatindex('test_hashidx');
 ERROR:  relation "test_hashidx" is not a btree index
 select pgstatginindex('test_hashidx');
 ERROR:  relation "test_hashidx" is not a GIN index
+select pgstattuple('test_pkey');
+      pgstattuple       
+------------------------
+ (8192,0,0,0,0,0,0,0,0)
+(1 row)
+
+select pgstattuple('test_gistidx');
+      pgstattuple       
+------------------------
+ (8192,0,0,0,0,0,0,0,0)
+(1 row)
+
+select pgstattuple('test_hashidx');
+           pgstattuple           
+---------------------------------
+ (49152,0,0,0,0,0,0,32608,66.34)
+(1 row)
+
 -- check that using any of these functions with unsupported relations will fail
 create table test_partitioned (a int) partition by range (a);
 create index test_partitioned_index on test_partitioned(a);
diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c
index 48cb8f59c4f..fecc1c8f031 100644
--- a/contrib/pgstattuple/pgstattuple.c
+++ b/contrib/pgstattuple/pgstattuple.c
@@ -61,22 +61,18 @@ typedef struct pgstattuple_type
 	uint64		free_space;		/* free/reusable space in bytes */
 } pgstattuple_type;
 
-typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber,
-							 BufferAccessStrategy);
+typedef void (*pgstat_page) (pgstattuple_type *, Relation, Buffer);
 
 static Datum build_pgstattuple_type(pgstattuple_type *stat,
 									FunctionCallInfo fcinfo);
 static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo);
 static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo);
 static void pgstat_btree_page(pgstattuple_type *stat,
-							  Relation rel, BlockNumber blkno,
-							  BufferAccessStrategy bstrategy);
+							  Relation rel, Buffer buf);
 static void pgstat_hash_page(pgstattuple_type *stat,
-							 Relation rel, BlockNumber blkno,
-							 BufferAccessStrategy bstrategy);
+							 Relation rel, Buffer buf);
 static void pgstat_gist_page(pgstattuple_type *stat,
-							 Relation rel, BlockNumber blkno,
-							 BufferAccessStrategy bstrategy);
+							 Relation rel, Buffer buf);
 static Datum pgstat_index(Relation rel, BlockNumber start,
 						  pgstat_page pagefn, FunctionCallInfo fcinfo);
 static void pgstat_index_page(pgstattuple_type *stat, Page page,
@@ -405,13 +401,10 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
  * pgstat_btree_page -- check tuples in a btree page
  */
 static void
-pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				  BufferAccessStrategy bstrategy)
+pgstat_btree_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
 	LockBuffer(buf, BT_READ);
 	page = BufferGetPage(buf);
 
@@ -449,13 +442,15 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
  * pgstat_hash_page -- check tuples in a hash page
  */
 static void
-pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				 BufferAccessStrategy bstrategy)
+pgstat_hash_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
+	LockBuffer(buf, HASH_READ);
+
+	/* ref count and lock type are correct */
+
+	_hash_checkpage(rel, buf, 0);
 	page = BufferGetPage(buf);
 
 	if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData)))
@@ -491,13 +486,10 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
  * pgstat_gist_page -- check tuples in a gist page
  */
 static void
-pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
-				 BufferAccessStrategy bstrategy)
+pgstat_gist_page(pgstattuple_type *stat, Relation rel, Buffer buf)
 {
-	Buffer		buf;
 	Page		page;
 
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
 	LockBuffer(buf, GIST_SHARE);
 	gistcheckpage(rel, buf);
 	page = BufferGetPage(buf);
@@ -523,14 +515,25 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn,
 			 FunctionCallInfo fcinfo)
 {
 	BlockNumber nblocks;
-	BlockNumber blkno;
 	BufferAccessStrategy bstrategy;
 	pgstattuple_type stat = {0};
+	Buffer		buf;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream = NULL;
+	int64		block;
 
 	/* prepare access strategy for this index */
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
-	blkno = start;
+	p.current_blocknum = start;
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (;;)
 	{
 		/* Get the current relation length */
@@ -539,21 +542,34 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn,
 		UnlockRelationForExtension(rel, ExclusiveLock);
 
 		/* Quit if we've scanned the whole relation */
-		if (blkno >= nblocks)
+		if (p.current_blocknum >= nblocks)
 		{
 			stat.table_len = (uint64) nblocks * BLCKSZ;
 
 			break;
 		}
 
-		for (; blkno < nblocks; blkno++)
+		p.last_exclusive = nblocks;
+
+		for (block = start; block < nblocks; ++block)
 		{
 			CHECK_FOR_INTERRUPTS();
-
-			pagefn(&stat, rel, blkno, bstrategy);
+			buf = read_stream_next_buffer(stream, NULL);
+			Assert(buf != InvalidBuffer);
+			pagefn(&stat, rel, buf);
 		}
+
+		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+
+		/*
+		 * After reaching the end we have to reset stream to use it again.
+		 * Extra restart in case of just one iteration does not cost us much.
+		 */
+		read_stream_reset(stream);
 	}
 
+	read_stream_end(stream);
+
 	relation_close(rel, AccessShareLock);
 
 	return build_pgstattuple_type(&stat, fcinfo);
diff --git a/contrib/pgstattuple/sql/pgstattuple.sql b/contrib/pgstattuple/sql/pgstattuple.sql
index 7e72c567a06..2fa16ebf05f 100644
--- a/contrib/pgstattuple/sql/pgstattuple.sql
+++ b/contrib/pgstattuple/sql/pgstattuple.sql
@@ -6,7 +6,7 @@ CREATE EXTENSION pgstattuple;
 -- indexes should be that.
 --
 
-create table test (a int primary key, b int[]);
+create table test (a int primary key, b int[], c point);
 
 select * from pgstattuple('test');
 select * from pgstattuple('test'::text);
@@ -52,6 +52,8 @@ create index test_hashidx on test using hash (b);
 
 select * from pgstathashindex('test_hashidx');
 
+create index test_gistidx ON test USING gist(c);
+
 -- these should error with the wrong type
 select pgstatginindex('test_pkey');
 select pgstathashindex('test_pkey');
@@ -62,6 +64,10 @@ select pgstathashindex('test_ginidx');
 select pgstatindex('test_hashidx');
 select pgstatginindex('test_hashidx');
 
+select pgstattuple('test_pkey');
+select pgstattuple('test_gistidx');
+select pgstattuple('test_hashidx');
+
 -- check that using any of these functions with unsupported relations will fail
 create table test_partitioned (a int) partition by range (a);
 create index test_partitioned_index on test_partitioned(a);
-- 
2.34.1

Kirill Reshke

reshkekirill@gmail.com

about 1 year ago

In reply to: Kirill Reshke (#7)

Re: Use streaming read API in pgstattuple.

Hm, CI fails[0]https://api.cirrus-ci.com/v1/artifact/task/5037800950595584/testrun/build-32/testrun/pgstattuple/regress/regression.diffs
This is most likely the reason why `select
pgstattuple('test_hashidx');` was omitted in pgstattuple tests: we are
not guaranteed about the number of bucket_pages in the empty index.
Maybe we should populate base relation and check for only non-volatile
fields like live_items and version?

[0]: https://api.cirrus-ci.com/v1/artifact/task/5037800950595584/testrun/build-32/testrun/pgstattuple/regress/regression.diffs

--
Best regards,
Kirill Reshke