From 4abac6442061744b791031a25a31cd45563cc111 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2025 11:44:35 +1300
Subject: [PATCH v4 2/3] Increase the maximum I/O combine size to 1MB.

The default value of 128kB is not changed, but the upper limit is
changed from 32 blocks to 128 blocks (1MB with 8kB blocks), assuming the
operating system's IOV_MAX doesn't limit us to a smaller size.  This is
around where some other RDBMSes seem to cap their buffer pool I/O size,
and it seems like a good idea to allow experiments with that.

The concrete change is to our definition of PG_IOV_MAX, which provides
the maximum limit for io_combine_limit and io_max_combine_limit.  It
also affects a couple of other places that work with arrays of struct
iovec or smaller objects on the stack, so we still don't want to use the
system IOV_MAX directly without a clamp: it is not under our control and
likely to be 1024.  128 seems acceptable for all our current use cases.

The last Unix on our target list known to have a low IOV_MAX was Solaris
before 11.4 SRU72 (it was 16, the minimum requirement for POSIX
conformance, but is now 1024, matching all other systems I looked at).

For Windows, we can't use real scatter/gather yet (though it's possible,
in later work), so we continue to define our own IOV_MAX value of 16 and
emulate preadv()/pwritev() with loops there.  Someone would need to
research the trade-off.

This change also makes it possible for read_stream.c's internal cap of
INT16_MAX to be hit, so adjust comments about that.  With
*_io_concurrency and io_combine_limit set to their maximum, it would want
to be able to pin 128K buffers at once (= 1GB of data), but the choice
of data type limits streams to 32K buffers.  That could be revisited in
future, but you'll probably hit other limits long before that one in
your quest to run 1,000 concurrent I/Os of size 1MB.

Suggested-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CA%2BhUKG%2B2T9p-%2BzM6Eeou-RAJjTML6eit1qn26f9twznX59qtCA%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      | 4 ++++
 src/backend/storage/aio/read_stream.c         | 7 ++++---
 src/backend/utils/misc/postgresql.conf.sample | 4 ++--
 src/include/port/pg_iovec.h                   | 8 ++++++--
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d1080dac97f..93eea7f96d2 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2638,6 +2638,8 @@ include_dir 'conf.d'
          This parameter can only be set in
          the <filename>postgresql.conf</filename> file or on the server
          command line.
+         The maximum possible size depends on the operating system and block
+         size, but is typically 1MB on Unix and 128kB on Windows.
          The default is 128kB.
         </para>
        </listitem>
@@ -2655,6 +2657,8 @@ include_dir 'conf.d'
          higher than the <varname>io_max_combine_limit</varname> parameter, the
          smaller value will silently be used instead, so both may need to be raised
          to increase the I/O size.
+         The maximum possible size depends on the operating system and block
+         size, but is typically 1MB on Unix and 128kB on Windows.
          The default is 128kB.
         </para>
        </listitem>
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index d65fa07b44c..45bdf819d57 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -515,9 +515,10 @@ read_stream_begin_impl(int flags,
 	 * finishes we don't want to have to wait for its buffers to be consumed
 	 * before starting a new one.
 	 *
-	 * Be careful not to allow int16 to overflow (even though that's not
-	 * possible with the current GUC range limits), allowing also for the
-	 * spare entry and the overflow space.
+	 * Be careful not to allow int16 to overflow.  That is possible with the
+	 * current GUC range limits, so this is an artificial limit of ~32k
+	 * buffers and we'd need to adjust the types to exceed that.  We also have
+	 * to allow for the spare entry and the overflow space.
 	 */
 	max_pinned_buffers = (max_ios + 1) * io_combine_limit;
 	max_pinned_buffers = Min(max_pinned_buffers,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index bd9a3507135..e43d803b278 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -200,9 +200,9 @@
 #backend_flush_after = 0		# measured in pages, 0 disables
 #effective_io_concurrency = 16		# 1-1000; 0 disables prefetching
 #maintenance_io_concurrency = 10	# 1-1000; 0 disables prefetching
-#io_max_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#io_max_combine_limit = 128kB		# usually 1-128 blocks (depends on OS)
 					# (change requires restart)
-#io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#io_combine_limit = 128kB		# usually 1-128 blocks (depends on OS)
 
 #io_method = sync			# sync (change requires restart)
 #io_max_concurrency = -1		# Max number of IOs that one process
diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h
index d9891d3805d..df40c7208be 100644
--- a/src/include/port/pg_iovec.h
+++ b/src/include/port/pg_iovec.h
@@ -33,8 +33,12 @@ struct iovec
 
 #endif
 
-/* Define a reasonable maximum that is safe to use on the stack. */
-#define PG_IOV_MAX Min(IOV_MAX, 32)
+/*
+ * Define a reasonable maximum that is safe to use on the stack in arrays of
+ * struct iovec and other small types.  The operating system could limit us to
+ * a number as low as 16, but most systems have 1024.
+ */
+#define PG_IOV_MAX Min(IOV_MAX, 128)
 
 /*
  * Like preadv(), but with a prefix to remind us of a side-effect: on Windows
-- 
2.39.5

