From 08821a5db5b7e78dd31f1911253d907a391c6b69 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Sun, 15 Mar 2026 23:29:15 -0700
Subject: [PATCH vnocfbot 3/3] instrumentation: Track I/O prefetch info and
 show with EXPLAIN (IO)

This adds details about AIO / prefetch for executor nodes that use the
ReadStream API (currently SeqScan and BitmapHeapScan) to the new
IOUsage struct added to Instrumentation. The data can be viewed with
the new EXPLAIN (IO) option, which requires ANALYZE, or tracked by
other interested callers through the stack-based instrumentation
mechanism.

The ReadStream updates these statistics unconditionally, i.e. even
outside EXPLAIN ANALYZE. The amount of statistics is trivial (a
handful of integer counters), so it is not worth gating this behind
a flag.

Author: Lukas Fittl <lukas@fittl.com>
Author: Tomas Vondra <tomas@vondra.me>
Reviewed-by:
Discussion: https://www.postgresql.org/message-id/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me
---
 src/backend/commands/explain.c        | 75 +++++++++++++++++++++++++--
 src/backend/commands/explain_state.c  |  8 +++
 src/backend/executor/execMain.c       |  6 +--
 src/backend/executor/instrument.c     | 54 ++++++++++++++-----
 src/backend/storage/aio/read_stream.c | 33 +++++++++++-
 src/include/commands/explain_state.h  |  1 +
 src/include/executor/instrument.h     | 31 +++++++++++
 src/tools/pgindent/typedefs.list      |  1 +
 8 files changed, 189 insertions(+), 20 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 8a641f9d05..6eca0e2051 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -145,6 +145,7 @@ static const char *explain_get_index_name(Oid indexId);
 static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
 static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title);
 static void show_wal_usage(ExplainState *es, const WalUsage *usage);
+static void show_io_usage(ExplainState *es, const IOUsage *usage);
 static void show_memory_counters(ExplainState *es,
 								 const MemoryContextCounters *mem_counters);
 static void show_result_replacement_info(Result *result, ExplainState *es);
@@ -509,6 +510,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 		instrument_option |= INSTRUMENT_BUFFERS;
 	if (es->wal)
 		instrument_option |= INSTRUMENT_WAL;
+	if (es->io)
+		instrument_option |= INSTRUMENT_IO;
 
 	/*
 	 * We always collect timing for the entire statement, even when node-level
@@ -2281,14 +2284,16 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		}
 	}
 
-	/* Show buffer/WAL usage */
+	/* Show buffer/WAL/IO usage */
 	if (es->buffers && planstate->instrument)
 		show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL);
 	if (es->wal && planstate->instrument)
 		show_wal_usage(es, &planstate->instrument->instr.walusage);
+	if (es->io && planstate->instrument)
+		show_io_usage(es, &planstate->instrument->instr.iousage);
 
-	/* Prepare per-worker buffer/WAL usage */
-	if (es->workers_state && (es->buffers || es->wal) && es->verbose)
+	/* Prepare per-worker buffer/WAL/IO usage */
+	if (es->workers_state && (es->buffers || es->wal || es->io) && es->verbose)
 	{
 		WorkerNodeInstrumentation *w = planstate->worker_instrument;
 
@@ -2305,6 +2310,8 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				show_buffer_usage(es, &instrument->instr.bufusage, NULL);
 			if (es->wal)
 				show_wal_usage(es, &instrument->instr.walusage);
+			if (es->io)
+				show_io_usage(es, &instrument->instr.iousage);
 			ExplainCloseWorker(n, es);
 		}
 	}
@@ -4343,6 +4350,68 @@ show_wal_usage(ExplainState *es, const WalUsage *usage)
 	}
 }
 
+/*
+ * Show the IOUsage (read stream prefetch) counters; nothing if count <= 0.
+ */
+static void
+show_io_usage(ExplainState *es, const IOUsage *usage)
+{
+	/* No buffers returned: nothing to show, and count is a divisor below */
+	if (usage->count <= 0)
+		return;
+
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+	{
+		/* look-ahead distance: average per returned buffer, max, capacity */
+		ExplainIndentText(es);
+		appendStringInfo(es->str, "Prefetch: avg=%.3f max=%" PRId64 " capacity=%" PRId64,
+						 (usage->distance_sum * 1.0 / usage->count),
+						 usage->distance_max,
+						 usage->distance_capacity);
+		appendStringInfoChar(es->str, '\n');
+
+		/* I/O line is printed only if any stalls or I/Os were recorded */
+		if (usage->stall_count > 0 || usage->io_count > 0)
+		{
+			ExplainIndentText(es);
+			appendStringInfo(es->str, "I/O: stalls=%" PRId64,
+							 usage->stall_count);
+
+			if (usage->io_count > 0)
+			{
+				appendStringInfo(es->str, " size=%.3f inprogress=%.3f",
+								 (usage->io_blocks * 1.0 / usage->io_count),
+								 (usage->ios_in_progress * 1.0 / usage->io_count));
+			}
+
+			appendStringInfoChar(es->str, '\n');
+		}
+	}
+	else
+	{
+		ExplainOpenGroup("Prefetch", "I/O", true, es);
+
+		ExplainPropertyFloat("Average Distance", NULL,
+							 (usage->distance_sum * 1.0 / usage->count), 3, es);
+		ExplainPropertyInteger("Max Distance", NULL,
+							   usage->distance_max, es);
+		ExplainPropertyInteger("Capacity", NULL,
+							   usage->distance_capacity, es);
+		ExplainPropertyInteger("Stalls", NULL,
+							   usage->stall_count, es);
+
+		if (usage->io_count > 0)
+		{
+			ExplainPropertyFloat("Average IO Size", NULL,
+								 (usage->io_blocks * 1.0 / usage->io_count), 3, es);
+			ExplainPropertyFloat("Average IOs In Progress", NULL,
+								 (usage->ios_in_progress * 1.0 / usage->io_count), 3, es);
+		}
+
+		ExplainCloseGroup("Prefetch", "I/O", true, es);
+	}
+}
+
 /*
  * Show memory usage details.
  */
diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c
index 77f59b8e50..b5129f4914 100644
--- a/src/backend/commands/explain_state.c
+++ b/src/backend/commands/explain_state.c
@@ -115,6 +115,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
 		}
 		else if (strcmp(opt->defname, "memory") == 0)
 			es->memory = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "io") == 0)
+			es->io = defGetBoolean(opt);
 		else if (strcmp(opt->defname, "serialize") == 0)
 		{
 			if (opt->arg)
@@ -185,6 +187,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("EXPLAIN option %s requires ANALYZE", "TIMING")));
 
+	/* check that IO is used with EXPLAIN ANALYZE */
+	if (es->io && !es->analyze)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("EXPLAIN option %s requires ANALYZE", "IO")));
+
 	/* check that serialize is used with EXPLAIN ANALYZE */
 	if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze)
 		ereport(ERROR,
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 82253317e9..de0795c944 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -334,7 +334,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 	 * Start up required top-level instrumentation stack for WAL/buffer
 	 * tracking
 	 */
-	if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)))
+	if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL | INSTRUMENT_IO)))
 		queryDesc->totaltime = InstrQueryAlloc(estate->es_instrument);
 
 	if (queryDesc->totaltime)
@@ -347,7 +347,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
 		 * after the first call to InstrQueryStart has pushed the parent
 		 * entry.
 		 */
-		if ((estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) &&
+		if ((estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL | INSTRUMENT_IO)) &&
 			!queryDesc->already_executed)
 			ExecRememberNodeInstrumentation(queryDesc->planstate,
 											queryDesc->totaltime);
@@ -1535,7 +1535,7 @@ ExecFinalizeTriggerInstrumentation(EState *estate)
 	{
 		TriggerInstrumentation *ti = rInfo->ri_TrigInstrument;
 
-		if (ti && (ti->instr.need_bufusage || ti->instr.need_walusage))
+		if (ti && (ti->instr.need_bufusage || ti->instr.need_walusage || ti->instr.need_iousage))
 			InstrAccum(instr_stack.current, &ti->instr);
 	}
 }
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 61297c5653..c8f182e7c5 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -46,6 +46,7 @@ InstrInitOptions(Instrumentation *instr, int instrument_options)
 {
 	instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
 	instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
+	instr->need_iousage = (instrument_options & INSTRUMENT_IO) != 0;
 	instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
 }
 
@@ -76,7 +77,7 @@ InstrStart(Instrumentation *instr)
 	if (instr->need_timer)
 		InstrStartTimer(instr);
 
-	if (instr->need_bufusage || instr->need_walusage)
+	if (instr->need_bufusage || instr->need_walusage || instr->need_iousage)
 		InstrPushStack(instr);
 }
 
@@ -86,7 +87,7 @@ InstrStop(Instrumentation *instr)
 	if (instr->need_timer)
 		InstrStopTimer(instr);
 
-	if (instr->need_bufusage || instr->need_walusage)
+	if (instr->need_bufusage || instr->need_walusage || instr->need_iousage)
 		InstrPopStack(instr);
 }
 
@@ -197,7 +198,9 @@ InstrQueryAlloc(int instrument_options)
 	 * survives transaction abort — ResourceOwner release needs to access
 	 * it.
 	 */
-	if ((instrument_options & INSTRUMENT_BUFFERS) != 0 || (instrument_options & INSTRUMENT_WAL) != 0)
+	if ((instrument_options & INSTRUMENT_BUFFERS) != 0 ||
+		(instrument_options & INSTRUMENT_WAL) != 0 ||
+		(instrument_options & INSTRUMENT_IO) != 0)
 		instr = MemoryContextAllocZero(TopMemoryContext, sizeof(QueryInstrumentation));
 	else
 		instr = palloc0(sizeof(QueryInstrumentation));
@@ -213,7 +216,7 @@ InstrQueryStart(QueryInstrumentation *qinstr)
 {
 	InstrStart(&qinstr->instr);
 
-	if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage)
+	if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage || qinstr->instr.need_iousage)
 	{
 		Assert(CurrentResourceOwner != NULL);
 		qinstr->owner = CurrentResourceOwner;
@@ -228,7 +231,7 @@ InstrQueryStop(QueryInstrumentation *qinstr)
 {
 	InstrStop(&qinstr->instr);
 
-	if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage)
+	if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage || qinstr->instr.need_iousage)
 	{
 		Assert(qinstr->owner != NULL);
 		ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr);
@@ -243,7 +246,7 @@ InstrQueryStopFinalize(QueryInstrumentation *qinstr)
 
 	InstrStopFinalize(&qinstr->instr);
 
-	if (!qinstr->instr.need_bufusage && !qinstr->instr.need_walusage)
+	if (!qinstr->instr.need_bufusage && !qinstr->instr.need_walusage && !qinstr->instr.need_iousage)
 		return qinstr;
 
 	Assert(qinstr->owner != NULL);
@@ -270,7 +273,7 @@ InstrQueryStopFinalize(QueryInstrumentation *qinstr)
 void
 InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *child)
 {
-	if (child->instr.need_bufusage || child->instr.need_walusage)
+	if (child->instr.need_bufusage || child->instr.need_walusage || child->instr.need_iousage)
 		dlist_push_head(&parent->unfinalized_children, &child->unfinalized_node);
 }
 
@@ -278,7 +281,7 @@ InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *child)
 QueryInstrumentation *
 InstrStartParallelQuery(void)
 {
-	QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+	QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL | INSTRUMENT_IO);
 
 	InstrQueryStart(qinstr);
 	return qinstr;
@@ -291,6 +294,7 @@ InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst)
 	qinstr = InstrQueryStopFinalize(qinstr);
 	memcpy(&dst->bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage));
 	memcpy(&dst->walusage, &qinstr->instr.walusage, sizeof(WalUsage));
+	memcpy(&dst->iousage, &qinstr->instr.iousage, sizeof(IOUsage));
 }
 
 /*
@@ -310,6 +314,7 @@ InstrAccumParallelQuery(Instrumentation *instr)
 {
 	BufferUsageAdd(&instr_stack.current->bufusage, &instr->bufusage);
 	WalUsageAdd(&instr_stack.current->walusage, &instr->walusage);
+	IOUsageAdd(&instr_stack.current->iousage, &instr->iousage);
 
 	WalUsageAdd(&pgWalUsage, &instr->walusage);
 }
@@ -329,7 +334,9 @@ InstrAllocNode(int instrument_options, bool async_mode)
 	 * utility commands that restart transactions, which would require a
 	 * context that survives longer (EXPLAIN ANALYZE is fine).
 	 */
-	if ((instrument_options & INSTRUMENT_BUFFERS) != 0 || (instrument_options & INSTRUMENT_WAL) != 0)
+	if ((instrument_options & INSTRUMENT_BUFFERS) != 0 ||
+		(instrument_options & INSTRUMENT_WAL) != 0 ||
+		(instrument_options & INSTRUMENT_IO) != 0)
 		instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation));
 	else
 		instr = palloc(sizeof(NodeInstrumentation));
@@ -392,7 +399,7 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples)
 		InstrStopNodeTimer(instr);
 
 	/* Only pop the stack, accumulation runs in InstrFinalizeNode */
-	if (instr->instr.need_bufusage || instr->instr.need_walusage)
+	if (instr->instr.need_bufusage || instr->instr.need_walusage || instr->instr.need_iousage)
 		InstrPopStack(&instr->instr);
 
 	instr->running = true;
@@ -407,7 +414,7 @@ InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent)
 	NodeInstrumentation *dst;
 
 	/* If we didn't use stack based instrumentation, nothing to be done */
-	if (!instr->instr.need_bufusage && !instr->instr.need_walusage)
+	if (!instr->instr.need_bufusage && !instr->instr.need_walusage && !instr->instr.need_iousage)
 		return instr;
 
 	/* Copy into per-query memory context */
@@ -418,7 +425,7 @@ InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent)
 	InstrAccum(parent, &dst->instr);
 
 	/* Unregister from query's unfinalized list before freeing */
-	if (instr->instr.need_bufusage || instr->instr.need_walusage)
+	if (instr->instr.need_bufusage || instr->instr.need_walusage || instr->instr.need_iousage)
 		dlist_delete(&instr->unfinalized_node);
 
 	pfree(instr);
@@ -489,6 +496,9 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add)
 
 	if (dst->instr.need_walusage)
 		WalUsageAdd(&dst->instr.walusage, &add->instr.walusage);
+
+	if (dst->instr.need_iousage)
+		IOUsageAdd(&dst->instr.iousage, &add->instr.iousage);
 }
 
 /*
@@ -598,7 +608,8 @@ InstrNodeSetupExecProcNode(NodeInstrumentation *instr)
 {
 	bool		need_timer = instr->instr.need_timer;
 	bool		need_buf = (instr->instr.need_bufusage ||
-							instr->instr.need_walusage);
+							instr->instr.need_walusage ||
+							instr->instr.need_iousage);
 
 	if (need_timer && need_buf)
 		return ExecProcNodeInstrFull;
@@ -649,6 +660,7 @@ InstrAccum(Instrumentation *dst, Instrumentation *add)
 
 	BufferUsageAdd(&dst->bufusage, &add->bufusage);
 	WalUsageAdd(&dst->walusage, &add->walusage);
+	IOUsageAdd(&dst->iousage, &add->iousage);
 }
 
 /* dst += add */
@@ -684,6 +696,22 @@ WalUsageAdd(WalUsage *dst, const WalUsage *add)
 	dst->wal_buffers_full += add->wal_buffers_full;
 }
 
+/* dst += add (distance_max and distance_capacity keep the larger value) */
+void
+IOUsageAdd(IOUsage *dst, const IOUsage *add)
+{
+	dst->count += add->count;
+	dst->distance_sum += add->distance_sum;
+	if (add->distance_max > dst->distance_max)
+		dst->distance_max = add->distance_max;
+	if (add->distance_capacity > dst->distance_capacity)
+		dst->distance_capacity = add->distance_capacity;
+	dst->stall_count += add->stall_count;
+	dst->io_count += add->io_count;
+	dst->io_blocks += add->io_blocks;
+	dst->ios_in_progress += add->ios_in_progress;
+}
+
 void
 WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
 {
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index cd54c1a74a..6d2285beb1 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -71,6 +71,7 @@
  */
 #include "postgres.h"
 
+#include "executor/instrument.h"
 #include "miscadmin.h"
 #include "storage/aio.h"
 #include "storage/fd.h"
@@ -172,6 +173,19 @@ block_range_read_stream_cb(ReadStream *stream,
 	return InvalidBlockNumber;
 }
 
+/*
+ * Account for one buffer being returned to the consumer: bump the count and
+ * record pinned_buffers (sum and max) and max_pinned_buffers (capacity).
+ */
+static inline void
+read_stream_instr_update(ReadStream *stream)
+{
+	INSTR_IOUSAGE_INCR(count);
+	INSTR_IOUSAGE_ADD(distance_sum, stream->pinned_buffers);
+	INSTR_IOUSAGE_MAX(distance_max, stream->pinned_buffers);
+	INSTR_IOUSAGE_MAX(distance_capacity, stream->max_pinned_buffers);
+}
+
 /*
  * Ask the callback which block it would like us to read next, with a one block
  * buffer in front to allow read_stream_unget_block() to work.
@@ -380,6 +394,11 @@ read_stream_start_pending_read(ReadStream *stream)
 		Assert(stream->ios_in_progress < stream->max_ios);
 		stream->ios_in_progress++;
 		stream->seq_blocknum = stream->pending_read_blocknum + nblocks;
+
+		/* Update I/O stats */
+		INSTR_IOUSAGE_INCR(io_count);
+		INSTR_IOUSAGE_ADD(io_blocks, nblocks);
+		INSTR_IOUSAGE_ADD(ios_in_progress, stream->ios_in_progress);
 	}
 
 	/*
@@ -851,6 +870,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 										flags)))
 			{
 				/* Fast return. */
+				read_stream_instr_update(stream);
 				return buffer;
 			}
 
@@ -860,6 +880,9 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 			stream->ios_in_progress = 1;
 			stream->ios[0].buffer_index = oldest_buffer_index;
 			stream->seq_blocknum = next_blocknum + 1;
+
+			/* Since we executed IO synchronously, count it as a stall */
+			INSTR_IOUSAGE_INCR(stall_count);
 		}
 		else
 		{
@@ -871,6 +894,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		}
 
 		stream->fast_path = false;
+		read_stream_instr_update(stream);
 		return buffer;
 	}
 #endif
@@ -916,12 +940,17 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 	{
 		int16		io_index = stream->oldest_io_index;
 		int32		distance;	/* wider temporary value, clamped below */
+		bool		needed_wait;
 
 		/* Sanity check that we still agree on the buffers. */
 		Assert(stream->ios[io_index].op.buffers ==
 			   &stream->buffers[oldest_buffer_index]);
 
-		WaitReadBuffers(&stream->ios[io_index].op);
+		needed_wait = WaitReadBuffers(&stream->ios[io_index].op);
+
+		/* Count it as a stall if we needed to wait for I/O */
+		if (needed_wait)
+			INSTR_IOUSAGE_INCR(stall_count);
 
 		Assert(stream->ios_in_progress > 0);
 		stream->ios_in_progress--;
@@ -981,6 +1010,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 	}
 #endif
 
+	read_stream_instr_update(stream);
+
 	/* Pin transferred to caller. */
 	Assert(stream->pinned_buffers > 0);
 	stream->pinned_buffers--;
diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h
index 0b695f7d81..801e422437 100644
--- a/src/include/commands/explain_state.h
+++ b/src/include/commands/explain_state.h
@@ -54,6 +54,7 @@ typedef struct ExplainState
 	bool		summary;		/* print total planning and execution timing */
 	bool		memory;			/* print planner's memory usage information */
 	bool		settings;		/* print modified settings */
+	bool		io;				/* print info about IO (prefetch, ...) */
 	bool		generic;		/* generate a generic plan */
 	ExplainSerializeOption serialize;	/* serialize the query's output? */
 	ExplainFormat format;		/* output format */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index e4fc9e7870..0b7a84fc55 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -58,6 +58,22 @@ typedef struct WalUsage
 	int64		wal_buffers_full;	/* # of times the WAL buffers became full */
 } WalUsage;
 
+/*
+ * IOUsage tracks read stream (prefetch) activity per executor node, shown by
+ * EXPLAIN (ANALYZE, IO).  Sums are divided into averages at display time.
+ */
+typedef struct IOUsage
+{
+	int64		count;			/* # of buffers returned */
+	int64		distance_sum;	/* sum of pinned buffers at each return */
+	int64		distance_max;	/* largest pinned-buffer count observed */
+	int64		distance_capacity;	/* largest max_pinned_buffers seen */
+	int64		stall_count;	/* # of I/O stalls */
+	int64		io_count;		/* # of I/O operations started */
+	int64		io_blocks;		/* total blocks across I/Os */
+	int64		ios_in_progress;	/* sum of in-progress I/Os per start */
+} IOUsage;
+
 /* Flag bits included in InstrAlloc's instrument_options bitmask */
 typedef enum InstrumentOption
 {
@@ -65,6 +81,7 @@ typedef enum InstrumentOption
 	INSTRUMENT_BUFFERS = 1 << 1,	/* needs buffer usage */
 	INSTRUMENT_ROWS = 1 << 2,	/* needs row count */
 	INSTRUMENT_WAL = 1 << 3,	/* needs WAL usage */
+	INSTRUMENT_IO = 1 << 4,		/* needs I/O prefetch usage */
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;
 
@@ -92,12 +109,14 @@ typedef struct Instrumentation
 	bool		need_timer;		/* true if we need timer data */
 	bool		need_bufusage;	/* true if we need buffer usage data */
 	bool		need_walusage;	/* true if we need WAL usage data */
+	bool		need_iousage;	/* true if we need I/O prefetch data */
 	/* Internal state keeping: */
 	instr_time	starttime;		/* start time of last InstrStart */
 	/* Accumulated statistics: */
 	instr_time	total;			/* total runtime */
 	BufferUsage bufusage;		/* total buffer usage */
 	WalUsage	walusage;		/* total WAL usage */
+	IOUsage		iousage;		/* total I/O prefetch usage */
 } Instrumentation;
 
 /*
@@ -301,6 +320,7 @@ extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
 extern void WalUsageAdd(WalUsage *dst, const WalUsage *add);
 extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
 							  const WalUsage *sub);
+extern void IOUsageAdd(IOUsage *dst, const IOUsage *add);
 
 #define INSTR_BUFUSAGE_INCR(fld) do { \
 		instr_stack.current->bufusage.fld++; \
@@ -324,4 +344,15 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
 		instr_stack.current->walusage.fld += val; \
 	} while(0)
 
+#define INSTR_IOUSAGE_INCR(fld) do { \
+		instr_stack.current->iousage.fld++; \
+	} while(0)
+#define INSTR_IOUSAGE_ADD(fld,val) do { \
+		instr_stack.current->iousage.fld += val; \
+	} while(0)
+#define INSTR_IOUSAGE_MAX(fld,val) do { /* NB: val is evaluated twice */ \
+		if ((val) > instr_stack.current->iousage.fld) \
+			instr_stack.current->iousage.fld = (val); \
+	} while(0)
+
 #endif							/* INSTRUMENT_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 1c9be944c5..985662a252 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1336,6 +1336,7 @@ InvalMessageArray
 InvalidationInfo
 InvalidationMsgsGroup
 IoMethodOps
+IOUsage
 IpcMemoryId
 IpcMemoryKey
 IpcMemoryState
-- 
2.43.0

