From 6fc1187d92f2960379818fd0c35d0a995f9e7a67 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 12 Feb 2025 13:52:22 +1300
Subject: [PATCH] Introduce max_io_combine_limit.

The existing io_combine_limit parameter can be set by users.  The new
max_io_combine_limit parameter, set only at server startup time, applies
as a silent limit on top of that.  The effective I/O combine limit is
maintained in a variable effective_io_combine_limit, and has the
minimum of the two values.

This allows the administrator to cap the total I/O size system-wide, but
also provides a way for proposed patches to know what the maximum could
possibly be in cases where it is multiplied by other GUCs to allocate
shared memory.

The read_stream.c code is changed to respect effective_io_combine_limit.
---
 doc/src/sgml/config.sgml                      | 23 ++++++++++++++++++-
 src/backend/commands/variable.c               | 18 +++++++++++++++
 src/backend/storage/aio/read_stream.c         | 20 ++++++++--------
 src/backend/storage/buffer/bufmgr.c           |  5 +++-
 src/backend/utils/misc/guc_tables.c           | 16 ++++++++++++-
 src/backend/utils/misc/postgresql.conf.sample |  2 ++
 src/include/storage/bufmgr.h                  |  2 ++
 src/include/utils/guc_hooks.h                 |  2 ++
 8 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 3b557ecabfb..c6de8b9e236 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2605,6 +2605,24 @@ include_dir 'conf.d'
        </listitem>
       </varlistentry>
 
+      <varlistentry id="guc-max-io-combine-limit" xreflabel="max_io_combine_limit">
+       <term><varname>max_io_combine_limit</varname> (<type>integer</type>)
+       <indexterm>
+        <primary><varname>max_io_combine_limit</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Controls the largest I/O size in operations that combine I/O, and silently
+         limits the user-settable parameter <varname>io_combine_limit</varname>.
+         This parameter can only be set in
+         the <filename>postgresql.conf</filename> file or on the server
+         command line.
+         The default is 128kB.
+        </para>
+       </listitem>
+      </varlistentry>
+
       <varlistentry id="guc-io-combine-limit" xreflabel="io_combine_limit">
        <term><varname>io_combine_limit</varname> (<type>integer</type>)
        <indexterm>
@@ -2613,7 +2631,10 @@ include_dir 'conf.d'
        </term>
        <listitem>
         <para>
-         Controls the largest I/O size in operations that combine I/O.
+         Controls the largest I/O size in operations that combine I/O.  If set
+         higher than the <varname>max_io_combine_limit</varname> parameter, the
+         smaller value will silently be used instead, so both may need to be raised
+         to increase the I/O size.
          The default is 128kB.
         </para>
        </listitem>
diff --git a/src/backend/commands/variable.c b/src/backend/commands/variable.c
index 4ad6e236d69..62c88b6491a 100644
--- a/src/backend/commands/variable.c
+++ b/src/backend/commands/variable.c
@@ -1156,6 +1156,24 @@ assign_maintenance_io_concurrency(int newval, void *extra)
 #endif
 }
 
+/*
+ * GUC assign hooks that recompute effective_io_combine_limit whenever
+ * io_combine_limit and max_io_combine_limit are set.  These are needed because
+ * the GUC subsystem doesn't support dependencies between GUCs, and they may be
+ * assigned in either order.
+ */
+void
+assign_max_io_combine_limit(int newval, void *extra)
+{
+	max_io_combine_limit = newval;
+	effective_io_combine_limit = Min(max_io_combine_limit, io_combine_limit);
+}
+void
+assign_io_combine_limit(int newval, void *extra)
+{
+	io_combine_limit = newval;
+	effective_io_combine_limit = Min(max_io_combine_limit, io_combine_limit);
+}
 
 /*
  * These show hooks just exist because we want to show the values in octal.
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index e4414b2e915..b3c17ab9404 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -225,7 +225,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
 
 	/* This should only be called with a pending read. */
 	Assert(stream->pending_read_nblocks > 0);
-	Assert(stream->pending_read_nblocks <= io_combine_limit);
+	Assert(stream->pending_read_nblocks <= effective_io_combine_limit);
 
 	/* We had better not exceed the pin limit by starting this read. */
 	Assert(stream->pinned_buffers + stream->pending_read_nblocks <=
@@ -313,7 +313,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
 		int16		buffer_index;
 		void	   *per_buffer_data;
 
-		if (stream->pending_read_nblocks == io_combine_limit)
+		if (stream->pending_read_nblocks == effective_io_combine_limit)
 		{
 			read_stream_start_pending_read(stream, suppress_advice);
 			suppress_advice = false;
@@ -373,7 +373,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
 	 * signaled end-of-stream, we start the read immediately.
 	 */
 	if (stream->pending_read_nblocks > 0 &&
-		(stream->pending_read_nblocks == io_combine_limit ||
+		(stream->pending_read_nblocks == effective_io_combine_limit ||
 		 (stream->pending_read_nblocks == stream->distance &&
 		  stream->pinned_buffers == 0) ||
 		 stream->distance == 0) &&
@@ -442,9 +442,9 @@ read_stream_begin_impl(int flags,
 	 * overflow (even though that's not possible with the current GUC range
 	 * limits), allowing also for the spare entry and the overflow space.
 	 */
-	max_pinned_buffers = Max(max_ios * 4, io_combine_limit);
+	max_pinned_buffers = Max(max_ios * 4, effective_io_combine_limit);
 	max_pinned_buffers = Min(max_pinned_buffers,
-							 PG_INT16_MAX - io_combine_limit - 1);
+							 PG_INT16_MAX - effective_io_combine_limit - 1);
 
 	/* Give the strategy a chance to limit the number of buffers we pin. */
 	strategy_pin_limit = GetAccessStrategyPinLimit(strategy);
@@ -474,14 +474,14 @@ read_stream_begin_impl(int flags,
 	 * io_combine_limit - 1 elements.
 	 */
 	size = offsetof(ReadStream, buffers);
-	size += sizeof(Buffer) * (queue_size + io_combine_limit - 1);
+	size += sizeof(Buffer) * (queue_size + effective_io_combine_limit - 1);
 	size += sizeof(InProgressIO) * Max(1, max_ios);
 	size += per_buffer_data_size * queue_size;
 	size += MAXIMUM_ALIGNOF * 2;
 	stream = (ReadStream *) palloc(size);
 	memset(stream, 0, offsetof(ReadStream, buffers));
 	stream->ios = (InProgressIO *)
-		MAXALIGN(&stream->buffers[queue_size + io_combine_limit - 1]);
+		MAXALIGN(&stream->buffers[queue_size + effective_io_combine_limit - 1]);
 	if (per_buffer_data_size > 0)
 		stream->per_buffer_data = (void *)
 			MAXALIGN(&stream->ios[Max(1, max_ios)]);
@@ -522,7 +522,7 @@ read_stream_begin_impl(int flags,
 	 * doing full io_combine_limit sized reads (behavior B).
 	 */
 	if (flags & READ_STREAM_FULL)
-		stream->distance = Min(max_pinned_buffers, io_combine_limit);
+		stream->distance = Min(max_pinned_buffers, effective_io_combine_limit);
 	else
 		stream->distance = 1;
 
@@ -737,14 +737,14 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		else
 		{
 			/* No advice; move towards io_combine_limit (behavior B). */
-			if (stream->distance > io_combine_limit)
+			if (stream->distance > effective_io_combine_limit)
 			{
 				stream->distance--;
 			}
 			else
 			{
 				distance = stream->distance * 2;
-				distance = Min(distance, io_combine_limit);
+				distance = Min(distance, effective_io_combine_limit);
 				distance = Min(distance, stream->max_pinned_buffers);
 				stream->distance = distance;
 			}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index ee83669992b..03953ae48c5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -160,8 +160,11 @@ int			maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
 /*
  * Limit on how many blocks should be handled in single I/O operations.
  * StartReadBuffers() callers should respect it, as should other operations
- * that call smgr APIs directly.
+ * that call smgr APIs directly.  It is computed as the minimum of underlying
+ * GUCs io_combine_limit and max_io_combine_limit.
  */
+int			effective_io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
+int			max_io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
 int			io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
 
 /*
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 226af43fe23..8f6966b397f 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3216,6 +3216,20 @@ struct config_int ConfigureNamesInt[] =
 		NULL
 	},
 
+	{
+		{"max_io_combine_limit",
+			PGC_POSTMASTER,
+			RESOURCES_IO,
+			gettext_noop("System-wide limit applied to io_combine_limit."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&max_io_combine_limit,
+		DEFAULT_IO_COMBINE_LIMIT,
+		1, MAX_IO_COMBINE_LIMIT,
+		NULL, assign_max_io_combine_limit, NULL
+	},
+
 	{
 		{"io_combine_limit",
 			PGC_USERSET,
@@ -3227,7 +3241,7 @@ struct config_int ConfigureNamesInt[] =
 		&io_combine_limit,
 		DEFAULT_IO_COMBINE_LIMIT,
 		1, MAX_IO_COMBINE_LIMIT,
-		NULL, NULL, NULL
+		NULL, assign_io_combine_limit, NULL
 	},
 
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d472987ed46..cae4ffbdce7 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -197,6 +197,8 @@
 #backend_flush_after = 0		# measured in pages, 0 disables
 #effective_io_concurrency = 1		# 1-1000; 0 disables prefetching
 #maintenance_io_concurrency = 10	# 1-1000; 0 disables prefetching
+#max_io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+					# (change requires restart)
 #io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
 
 # - Worker Processes -
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 7c1e4316dde..8aa4a2f8fb6 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -164,6 +164,8 @@ extern PGDLLIMPORT int maintenance_io_concurrency;
 #define MAX_IO_COMBINE_LIMIT PG_IOV_MAX
 #define DEFAULT_IO_COMBINE_LIMIT Min(MAX_IO_COMBINE_LIMIT, (128 * 1024) / BLCKSZ)
 extern PGDLLIMPORT int io_combine_limit;
+extern PGDLLIMPORT int max_io_combine_limit;
+extern PGDLLIMPORT int effective_io_combine_limit;	/* min of above pair */
 
 extern PGDLLIMPORT int checkpoint_flush_after;
 extern PGDLLIMPORT int backend_flush_after;
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 87999218d68..48094641251 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -82,6 +82,8 @@ extern const char *show_log_timezone(void);
 extern bool check_maintenance_io_concurrency(int *newval, void **extra,
 											 GucSource source);
 extern void assign_maintenance_io_concurrency(int newval, void *extra);
+extern void assign_max_io_combine_limit(int newval, void *extra);
+extern void assign_io_combine_limit(int newval, void *extra);
 extern bool check_max_slot_wal_keep_size(int *newval, void **extra,
 										 GucSource source);
 extern void assign_max_wal_size(int newval, void *extra);
-- 
2.48.1

