From 752fd6cc636eea08c06b25d8898e091835442387 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 5 Aug 2021 15:47:50 -0400
Subject: [PATCH v2 2/2] Add IO wait time stat and add guc for BHS prefetch

- Add an IO wait time measurement which can be seen in EXPLAIN output
  (with explain buffers option and track_io_timing on)
- TODO: add the wait time to database statistics
- Also, add a GUC to control the BHS pre-fetch max window size
---
 src/backend/access/heap/heapam_handler.c |  2 +-
 src/backend/commands/explain.c           |  9 ++++++++-
 src/backend/executor/instrument.c        |  4 ++++
 src/backend/postmaster/pgstat.c          |  6 ++++++
 src/backend/storage/aio/aio.c            | 16 ++++++++++++++++
 src/backend/storage/aio/aio_util.c       |  2 --
 src/backend/storage/buffer/bufmgr.c      |  2 ++
 src/backend/utils/adt/pgstatfuncs.c      |  2 ++
 src/backend/utils/misc/guc.c             | 10 ++++++++++
 src/include/executor/instrument.h        |  1 +
 src/include/pgstat.h                     |  6 ++++++
 src/include/storage/bufmgr.h             |  1 +
 12 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index a8bd8050dc..7409dd2fb3 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2221,7 +2221,7 @@ void bitmapheap_pgsr_alloc(BitmapHeapScanState *scanstate)
 	HeapScanDesc hscan = (HeapScanDesc ) scanstate->ss.ss_currentScanDesc;
 	if (!hscan->rs_inited)
 	{
-		int iodepth = Max(Min(128, NBuffers / 128), 1);
+		int iodepth = io_bitmap_prefetch_depth;
 		hscan->pgsr = pg_streaming_read_alloc(iodepth, (uintptr_t) scanstate,
 		                                      bitmapheapscan_pgsr_next_single,
 		                                      bitmapheapscan_pgsr_release);
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index c66d39a5c7..d1e89f1e1b 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3339,7 +3339,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 		bool		has_temp = (usage->temp_blks_read > 0 ||
 								usage->temp_blks_written > 0);
 		bool		has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
-								  !INSTR_TIME_IS_ZERO(usage->blk_write_time));
+								  !INSTR_TIME_IS_ZERO(usage->blk_write_time ||
+								  !INSTR_TIME_IS_ZERO(usage->io_wait_time)));
 		bool		show_planning = (planning && (has_shared ||
 												  has_local || has_temp || has_timing));
 
@@ -3416,6 +3417,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 			if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
 				appendStringInfo(es->str, " write=%0.3f",
 								 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+			if (!INSTR_TIME_IS_ZERO(usage->io_wait_time))
+				appendStringInfo(es->str, " wait=%0.3f",
+								 INSTR_TIME_GET_MILLISEC(usage->io_wait_time));
 			appendStringInfoChar(es->str, '\n');
 		}
 
@@ -3452,6 +3456,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 			ExplainPropertyFloat("I/O Write Time", "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->blk_write_time),
 								 3, es);
+			ExplainPropertyFloat("I/O Wait Time", "ms",
+								 INSTR_TIME_GET_MILLISEC(usage->io_wait_time),
+								 5, es);
 		}
 	}
 }
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 96af2a2673..85e7321d63 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -218,6 +218,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
 	dst->temp_blks_written += add->temp_blks_written;
 	INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
 	INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
+	INSTR_TIME_ADD(dst->io_wait_time, add->io_wait_time);
+
 }
 
 /* dst += add - sub */
@@ -240,6 +242,8 @@ BufferUsageAccumDiff(BufferUsage *dst,
 						  add->blk_read_time, sub->blk_read_time);
 	INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
 						  add->blk_write_time, sub->blk_write_time);
+	INSTR_TIME_ACCUM_DIFF(dst->io_wait_time,
+						  add->io_wait_time, sub->io_wait_time);
 }
 
 /* helper functions for WAL usage accumulation */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 9d335b8507..028ca14aa8 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -258,6 +258,7 @@ static int	pgStatXactCommit = 0;
 static int	pgStatXactRollback = 0;
 PgStat_Counter pgStatBlockReadTime = 0;
 PgStat_Counter pgStatBlockWriteTime = 0;
+PgStat_Counter pgStatIOWaitTime = 0;
 static PgStat_Counter pgStatActiveTime = 0;
 static PgStat_Counter pgStatTransactionIdleTime = 0;
 SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL;
@@ -1004,10 +1005,12 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
 		tsmsg->m_xact_rollback = pgStatXactRollback;
 		tsmsg->m_block_read_time = pgStatBlockReadTime;
 		tsmsg->m_block_write_time = pgStatBlockWriteTime;
+		tsmsg->m_io_wait_time = pgStatIOWaitTime;
 		pgStatXactCommit = 0;
 		pgStatXactRollback = 0;
 		pgStatBlockReadTime = 0;
 		pgStatBlockWriteTime = 0;
+		pgStatIOWaitTime = 0;
 	}
 	else
 	{
@@ -1015,6 +1018,7 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
 		tsmsg->m_xact_rollback = 0;
 		tsmsg->m_block_read_time = 0;
 		tsmsg->m_block_write_time = 0;
+		tsmsg->m_io_wait_time = 0;
 	}
 
 	n = tsmsg->m_nentries;
@@ -5122,6 +5126,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 	dbentry->last_checksum_failure = 0;
 	dbentry->n_block_read_time = 0;
 	dbentry->n_block_write_time = 0;
+	dbentry->n_io_wait_time = 0;
 	dbentry->n_sessions = 0;
 	dbentry->total_session_time = 0;
 	dbentry->total_active_time = 0;
@@ -6442,6 +6447,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 	dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
 	dbentry->n_block_read_time += msg->m_block_read_time;
 	dbentry->n_block_write_time += msg->m_block_write_time;
+	dbentry->n_io_wait_time += msg->m_io_wait_time;
 
 	/*
 	 * Process all table entries in the message.
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index 726d50f698..c5aacbcf9d 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -36,6 +36,8 @@
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/resowner_private.h"
+#include "executor/instrument.h"
+#include "storage/bufmgr.h"
 
 
 #define PGAIO_VERBOSE
@@ -1729,6 +1731,12 @@ wait_ref_again:
 		}
 		else if (io_method != IOMETHOD_WORKER && (flags & PGAIOIP_INFLIGHT))
 		{
+			int io_wait_start, io_wait_time;
+			if (track_io_timing)
+				INSTR_TIME_SET_CURRENT(io_wait_start);
+			else
+				INSTR_TIME_SET_ZERO(io_wait_start);
+
 			/* note that this is allowed to spuriously return */
 			if (io_method == IOMETHOD_WORKER)
 				ConditionVariableSleep(&io->cv, WAIT_EVENT_AIO_IO_COMPLETE_ONE);
@@ -1741,6 +1749,14 @@ wait_ref_again:
 			else if (io_method == IOMETHOD_POSIX)
 				pgaio_posix_wait_one(io, ref_generation);
 #endif
+
+			if (track_io_timing)
+			{
+				INSTR_TIME_SET_CURRENT(io_wait_time);
+				INSTR_TIME_SUBTRACT(io_wait_time, io_wait_start);
+				pgstat_count_io_wait_time(INSTR_TIME_GET_MICROSEC(io_wait_time));
+				INSTR_TIME_ADD(pgBufferUsage.io_wait_time, io_wait_time);
+			}
 		}
 #ifdef USE_POSIX_AIO
 		/* XXX untangle this */
diff --git a/src/backend/storage/aio/aio_util.c b/src/backend/storage/aio/aio_util.c
index 35436dfcd3..a79db4d747 100644
--- a/src/backend/storage/aio/aio_util.c
+++ b/src/backend/storage/aio/aio_util.c
@@ -417,8 +417,6 @@ pg_streaming_read_alloc(uint32 iodepth, uintptr_t pgsr_private,
 {
 	PgStreamingRead *pgsr;
 
-	iodepth = Max(Min(iodepth, NBuffers / 128), 1);
-
 	pgsr = palloc0(offsetof(PgStreamingRead, all_items) +
 				   sizeof(PgStreamingReadItem) * iodepth * 2);
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f5df1b78f2..69175699b0 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -145,6 +145,8 @@ bool		track_io_timing = false;
  */
 int			effective_io_concurrency = 0;
 
+int  io_bitmap_prefetch_depth = 128;
+
 /*
  * Like effective_io_concurrency, but used by maintenance code paths that might
  * benefit from a higher setting because they work on behalf of many sessions.
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 2fca05f7af..a6d4f121e1 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1615,6 +1615,8 @@ pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT8(result);
 }
 
+// TODO: add one for io wait time
+
 Datum
 pg_stat_get_db_blk_write_time(PG_FUNCTION_ARGS)
 {
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index d143783f22..f4f02fb9ad 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3029,6 +3029,16 @@ static struct config_int ConfigureNamesInt[] =
 		0, MAX_IO_CONCURRENCY,
 		check_maintenance_io_concurrency, NULL, NULL
 	},
+	{
+		{"io_bitmap_prefetch_depth", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
+			gettext_noop("Maximum pre-fetch distance for bitmapheapscan"),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&io_bitmap_prefetch_depth,
+		128, 0, MAX_IO_CONCURRENCY,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"io_wal_concurrency", PGC_POSTMASTER, RESOURCES_ASYNCHRONOUS,
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 62ca398e9d..950414cc0b 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -30,6 +30,7 @@ typedef struct BufferUsage
 	long		temp_blks_written;	/* # of temp blocks written */
 	instr_time	blk_read_time;	/* time spent reading */
 	instr_time	blk_write_time; /* time spent writing */
+	instr_time io_wait_time;
 } BufferUsage;
 
 typedef struct WalUsage
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index b77dcbc58b..a02813f02f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -291,6 +291,7 @@ typedef struct PgStat_MsgTabstat
 	int			m_xact_rollback;
 	PgStat_Counter m_block_read_time;	/* times in microseconds */
 	PgStat_Counter m_block_write_time;
+	PgStat_Counter m_io_wait_time;
 	PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES];
 } PgStat_MsgTabstat;
 
@@ -732,6 +733,7 @@ typedef struct PgStat_StatDBEntry
 	TimestampTz last_checksum_failure;
 	PgStat_Counter n_block_read_time;	/* times in microseconds */
 	PgStat_Counter n_block_write_time;
+	PgStat_Counter n_io_wait_time;
 	PgStat_Counter n_sessions;
 	PgStat_Counter total_session_time;
 	PgStat_Counter total_active_time;
@@ -1417,6 +1419,8 @@ extern PgStat_MsgWal WalStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_Counter pgStatIOWaitTime;
+
 /*
  * Updated by the traffic cop and in errfinish()
  */
@@ -1593,6 +1597,8 @@ pgstat_report_wait_end(void)
 	(pgStatBlockReadTime += (n))
 #define pgstat_count_buffer_write_time(n)							\
 	(pgStatBlockWriteTime += (n))
+#define pgstat_count_io_wait_time(n)							\
+	(pgStatIOWaitTime  += (n))
 
 extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n);
 extern void pgstat_count_heap_update(Relation rel, bool hot);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 07401f8493..8c21bb6e56 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -70,6 +70,7 @@ extern int	bgwriter_lru_maxpages;
 extern double bgwriter_lru_multiplier;
 extern bool track_io_timing;
 extern int	effective_io_concurrency;
+extern int io_bitmap_prefetch_depth;
 extern int	maintenance_io_concurrency;
 
 extern int	checkpoint_flush_after;
-- 
2.27.0

