From ab110f371c3fd690b193c543da6afad96988c70a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2025 11:44:35 +1300
Subject: [PATCH v3 3/3] Increase io_combine_limit maximum to 1MB.

The default value of 128kB is not changed, but the arbitrarily chosen
upper limit is changed from 256kB to 1MB (assuming default block size,
and assuming the system IOV_MAX doesn't limit us to a smaller size).
Whether your system can really do 1MB physical transfers without
splitting them up is another matter, but having the option to experiment
is useful.  Other RDBMSes seem to have limits around that number.

The concrete change is to our definition of PG_IOV_MAX, which in turn
controls MAX_IO_COMBINE_LIMIT.  The latter limits the GUCs used for
buffer pool I/O, but the former also affects a couple of other places
that work with arrays of struct iovec or smaller objects on the stack
and always use the compile-time maximum.  Therefore we still don't want
to use IOV_MAX directly, which is not under our control and likely to be
an unnecessarily high 1024.  128 blocks seems acceptable though.

The only current Unix on our target list that is known to have a low
IOV_MAX is Solaris before 11.4 SRU72.  Our Solaris build farm animals
don't have that update, or even SRU69 for preadv/pwritev, so they are
currently testing the fallback code as used on Windows.  (illumos
already made these changes many years ago.)

For Windows, we can't use real scatter/gather yet (though that could be
added in later work), so we continue to provide an IOV_MAX value of 16
and emulate preadv()/pwritev() with loops.  It's debatable whether we
should increase that number too: Windows still benefits from I/O
combining sometimes, when buffers happen to be consecutive in memory.
Someone would need to research that.

This change also makes it theoretically possible for read_stream.c's
internal cap of INT16_MAX to be hit.  With effective_io_concurrency set
to 1000 and io_combine_limit set to 1MB, you could theoretically want to
pin 128K buffers at once (= 1GB of data), but there is a cap at ~32K
buffers that stems from the choice of data type.  That could be
revisited in future, but in practice several other limits are very
likely to kick in first.

Suggested-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CA%2BhUKG%2B2T9p-%2BzM6Eeou-RAJjTML6eit1qn26f9twznX59qtCA%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      | 4 ++++
 src/backend/storage/aio/read_stream.c         | 7 ++++---
 src/backend/utils/misc/postgresql.conf.sample | 4 ++--
 src/include/port/pg_iovec.h                   | 8 ++++++--
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 68ba7cc7980..a668ff08ab9 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2644,6 +2644,8 @@ include_dir 'conf.d'
          This parameter can only be set in
          the <filename>postgresql.conf</filename> file or on the server
          command line.
+         The maximum possible size depends on the operating system and block
+         size, but is typically 1MB on Unix and 128kB on Windows.
          The default is 128kB.
         </para>
        </listitem>
@@ -2661,6 +2663,8 @@ include_dir 'conf.d'
          higher than the <varname>max_io_combine_limit</varname> parameter, the
          smaller value will silently be used instead, so both may need to be raised
          to increase the I/O size.
+         The maximum possible size depends on the operating system and block
+         size, but is typically 1MB on Unix and 128kB on Windows.
          The default is 128kB.
         </para>
        </listitem>
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 36fb9fe152c..6939fb8ccf9 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -463,9 +463,10 @@ read_stream_begin_impl(int flags,
 	 * finishes we don't want to have to wait for its buffers to be consumed
 	 * before starting a new one.
 	 *
-	 * Be careful not to allow int16 to overflow (even though that's not
-	 * possible with the current GUC range limits), allowing also for the
-	 * spare entry and the overflow space.
+	 * Be careful not to allow int16 to overflow.  That is possible with the
+	 * current GUC range limits, so the clamp below imposes an artificial
+	 * limit of ~32k buffers; we'd need to widen the types to exceed that.
+	 * We also have to allow for the spare entry and the overflow space.
 	 */
 	max_pinned_buffers = (max_ios + 1) * io_combine_limit;
 	max_pinned_buffers = Min(max_pinned_buffers,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 1d63f9e5f36..1859c040b42 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -200,9 +200,9 @@
 #backend_flush_after = 0		# measured in pages, 0 disables
 #effective_io_concurrency = 1		# 1-1000; 0 disables prefetching
 #maintenance_io_concurrency = 10	# 1-1000; 0 disables prefetching
-#max_io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#max_io_combine_limit = 128kB		# usually 1-128 blocks (depends on OS)
 					# (change requires restart)
-#io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#io_combine_limit = 128kB		# usually 1-128 blocks (depends on OS)
 
 # - Worker Processes -
 
diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h
index d9891d3805d..df40c7208be 100644
--- a/src/include/port/pg_iovec.h
+++ b/src/include/port/pg_iovec.h
@@ -33,8 +33,12 @@ struct iovec
 
 #endif
 
-/* Define a reasonable maximum that is safe to use on the stack. */
-#define PG_IOV_MAX Min(IOV_MAX, 32)
+/*
+ * Define a reasonable maximum that is safe to use on the stack in arrays of
+ * struct iovec and other small types.  The operating system could limit us to
+ * a number as low as 16, but most systems have 1024.
+ */
+#define PG_IOV_MAX Min(IOV_MAX, 128)
 
 /*
  * Like preadv(), but with a prefix to remind us of a side-effect: on Windows
-- 
2.48.1

