From f4f1534c105766c64534810b62eba7fea674c2a5 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 20 Mar 2023 19:26:41 -0700
Subject: [PATCH v6 2/2] pgstat: io: increment pgstat_io hackery

---
 src/include/pgstat.h                   | 15 ++---
 src/backend/storage/buffer/bufmgr.c    | 21 ++++---
 src/backend/storage/buffer/localbuf.c  |  5 +-
 src/backend/storage/smgr/md.c          | 12 ++--
 src/backend/utils/activity/pgstat.c    | 77 +-------------------------
 src/backend/utils/activity/pgstat_io.c | 70 ++++++++++++++++++++---
 6 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 288a8c7c0ef..4adbf29fdfd 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -305,11 +305,7 @@ typedef enum IOOp
 typedef struct PgStat_BktypeIO
 {
 	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
-	union
-	{
-		instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
-		PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
-	};
+	PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 } PgStat_BktypeIO;
 
 typedef struct PgStat_IO
@@ -471,11 +467,6 @@ extern TimestampTz pgstat_get_stat_snapshot_timestamp(bool *have_snapshot);
 extern PgStat_Kind pgstat_get_kind_from_str(char *kind_str);
 extern bool pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid);
 
-extern instr_time pgstat_io_start(void);
-
-extern void pgstat_io_end(instr_time io_start, IOObject io_object,
-						  IOContext io_context, IOOp io_op);
-
 
 /*
  * Functions in pgstat_archiver.c
@@ -507,8 +498,10 @@ extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void);
 
 extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *context_ops,
 										 BackendType bktype);
+extern instr_time pgstat_prepare_io_time(void);
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op);
-extern void pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time);
+extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time);
+
 extern PgStat_IO *pgstat_fetch_stat_io(void);
 extern const char *pgstat_get_io_context_name(IOContext io_context);
 extern const char *pgstat_get_io_object_name(IOObject io_object);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 64857f1ff68..532c7af992d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1005,12 +1005,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		/* new buffers are zero-filled */
 		MemSet((char *) bufBlock, 0, BLCKSZ);
 
-		io_start = pgstat_io_start();
+		io_start = pgstat_prepare_io_time();
 
 		/* don't set checksum for all-zero page */
 		smgrextend(smgr, forkNum, blockNum, bufBlock, false);
 
-		pgstat_io_end(io_start, io_object, io_context, IOOP_EXTEND);
+		pgstat_count_io_op_time(io_object, io_context,
+								IOOP_EXTEND, io_start);
 
 		/*
 		 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
@@ -1031,11 +1032,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		{
 			instr_time	io_start;
 
-			io_start = pgstat_io_start();
+			io_start = pgstat_prepare_io_time();
 
 			smgrread(smgr, forkNum, blockNum, bufBlock);
 
-			pgstat_io_end(io_start, io_object, io_context, IOOP_READ);
+			pgstat_count_io_op_time(io_object, io_context,
+									IOOP_READ, io_start);
 
 			/* check for garbage data */
 			if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
@@ -2942,7 +2944,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 */
 	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
 
-	io_start = pgstat_io_start();
+	io_start = pgstat_prepare_io_time();
 
 	/*
 	 * bufToWrite is either the shared buffer or a copy, as appropriate.
@@ -2971,7 +2973,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	 * When a strategy is not in use, the write can only be a "regular" write
 	 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
 	 */
-	pgstat_io_end(io_start, IOOBJECT_RELATION, io_context, IOOP_WRITE);
+	pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
+							IOOP_WRITE, io_start);
 
 	/*
 	 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
@@ -3601,7 +3604,7 @@ FlushRelationBuffers(Relation rel)
 				 * TODO: is it okay that this formerly didn't count
 				 * pgBufferUsage or pgStatBlockReadTime
 				 */
-				io_start = pgstat_io_start();
+				io_start = pgstat_prepare_io_time();
 
 				smgrwrite(RelationGetSmgr(rel),
 						  BufTagGetForkNum(&bufHdr->tag),
@@ -3612,7 +3615,9 @@ FlushRelationBuffers(Relation rel)
 				buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-				pgstat_io_end(io_start, IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
+				pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
+										IOCONTEXT_NORMAL, IOOP_WRITE,
+										io_start);
 
 				/* Pop the error context stack */
 				error_context_stack = errcallback.previous;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index af45a72934a..23f2e2fb759 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -229,7 +229,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-		io_start = pgstat_io_start();
+		io_start = pgstat_prepare_io_time();
 
 		/* And write... */
 		smgrwrite(oreln,
@@ -242,7 +242,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		buf_state &= ~BM_DIRTY;
 		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
-		pgstat_io_end(io_start, IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
+		pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
+								IOOP_WRITE, io_start);
 	}
 
 	/*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2440211f03d..51a9134d57f 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1030,11 +1030,13 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
 	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
 	{
-		instr_time	io_start = pgstat_io_start();
+		instr_time	io_start;
 
 		ereport(DEBUG1,
 				(errmsg_internal("could not forward fsync request because request queue is full")));
 
+		io_start = pgstat_prepare_io_time();
+
 		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
 			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
@@ -1052,7 +1054,8 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 		 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
 		 * backend fsyncs.
 		 */
-		pgstat_io_end(io_start, IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+		pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+								IOOP_FSYNC, io_start);
 	}
 }
 
@@ -1428,7 +1431,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 		need_to_close = true;
 	}
 
-	io_start = pgstat_io_start();
+	io_start = pgstat_prepare_io_time();
 
 	/* Sync the file. */
 	result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
@@ -1437,7 +1440,8 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 	if (need_to_close)
 		FileClose(file);
 
-	pgstat_io_end(io_start, IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+	pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+							IOOP_FSYNC, io_start);
 
 	errno = save_errno;
 	return result;
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 9f5f041d7c2..b125802b215 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -95,11 +95,9 @@
 
 #include "access/transam.h"
 #include "access/xact.h"
-#include "executor/instrument.h"
 #include "lib/dshash.h"
 #include "pgstat.h"
 #include "port/atomics.h"
-#include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
@@ -110,6 +108,7 @@
 #include "utils/pgstat_internal.h"
 #include "utils/timestamp.h"
 
+
 /* ----------
  * Timer definitions.
  *
@@ -907,80 +906,6 @@ pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
 	return pgstat_get_entry_ref(kind, dboid, objoid, false, NULL) != NULL;
 }
 
-instr_time
-pgstat_io_start(void)
-{
-	instr_time	io_start;
-
-	if (track_io_timing)
-		INSTR_TIME_SET_CURRENT(io_start);
-	else
-		INSTR_TIME_SET_ZERO(io_start);
-
-	return io_start;
-}
-
-void
-pgstat_io_end(instr_time io_start, IOObject io_object,
-			  IOContext io_context, IOOp io_op)
-{
-	instr_time	io_time;
-
-	if (track_io_timing)
-	{
-		INSTR_TIME_SET_CURRENT(io_time);
-		INSTR_TIME_SUBTRACT(io_time, io_start);
-		pgstat_count_io_time(io_object, io_context, io_op, io_time);
-
-		if (io_op == IOOP_WRITE)
-		{
-			if (io_object == IOOBJECT_RELATION)
-			{
-				/* TODO: AFAICT, pgstat_count_buffer_write_time is only called */
-				/* for shared buffers whereas pgstat_count_buffer_read_time is */
-				/* called for temp relations and shared buffers. */
-				/*
-				 * is this intentional and should I match current behavior or
-				 * not?
-				 */
-				pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
-			}
-		}
-		else if (io_op == IOOP_READ)
-		{
-			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
-			if (io_object == IOOBJECT_RELATION)
-			{
-				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
-			}
-		}
-	}
-
-	pgstat_count_io_op(io_object, io_context, io_op);
-
-	if (io_op == IOOP_WRITE)
-	{
-		if (io_object == IOOBJECT_RELATION)
-			pgBufferUsage.shared_blks_written++;
-		else if (io_object == IOOBJECT_TEMP_RELATION)
-			pgBufferUsage.local_blks_written++;
-	}
-
-	/*
-	 * TODO: this is normally done later in ReadBuffer_common() is it okay to
-	 * do here?
-	 */
-	else if (io_op == IOOP_READ)
-	{
-		if (io_object == IOOBJECT_RELATION)
-			pgBufferUsage.shared_blks_read++;
-		else if (io_object == IOOBJECT_TEMP_RELATION)
-			pgBufferUsage.local_blks_read++;
-	}
-}
-
-
 /*
  * Ensure snapshot for fixed-numbered 'kind' exists.
  *
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index fe09cbf16d4..08888fa0c5b 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -16,10 +16,19 @@
 
 #include "postgres.h"
 
+#include "executor/instrument.h"
+#include "storage/bufmgr.h"
 #include "utils/pgstat_internal.h"
 
 
-static PgStat_BktypeIO PendingIOStats;
+typedef struct PgStat_PendingIO
+{
+	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+	instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+} PgStat_PendingIO;
+
+
+static PgStat_PendingIO PendingIOStats;
 bool		have_iostats = false;
 
 
@@ -73,6 +82,19 @@ pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 	return true;
 }
 
+instr_time
+pgstat_prepare_io_time(void)
+{
+	instr_time	io_start;
+
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+	else
+		INSTR_TIME_SET_ZERO(io_start);
+
+	return io_start;
+}
+
 void
 pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
 {
@@ -86,18 +108,48 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
 	have_iostats = true;
 }
 
+/*
+ * Like pgstat_count_io_op() except it also accumulates time.
+ */
 void
-pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time)
+pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
+						instr_time start_time)
 {
-	Assert(io_object < IOOBJECT_NUM_TYPES);
-	Assert(io_context < IOCONTEXT_NUM_TYPES);
-	Assert(io_op < IOOP_NUM_TYPES);
-	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
-	Assert(pgstat_tracks_io_time(io_op) != -1);
+	if (track_io_timing)
+	{
+		instr_time	io_time;
 
-	INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op], time);
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_SUBTRACT(io_time, start_time);
 
-	have_iostats = true;
+		if (io_op == IOOP_WRITE)
+		{
+			if (io_object == IOOBJECT_RELATION)
+			{
+				/* TODO: AFAICT, pgstat_count_buffer_write_time is only called */
+				/* for shared buffers whereas pgstat_count_buffer_read_time is */
+				/* called for temp relations and shared buffers. */
+				/*
+				 * is this intentional and should I match current behavior or
+				 * not?
+				 */
+				pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+			}
+		}
+		else if (io_op == IOOP_READ)
+		{
+			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+			if (io_object == IOOBJECT_RELATION)
+			{
+				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+			}
+		}
+
+		INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op], io_time);
+	}
+
+	pgstat_count_io_op(io_object, io_context, io_op);
 }
 
 PgStat_IO *
-- 
2.38.0

