From fbbd64e8bf53c0ab149bac7970df9b4c1ac92589 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 19 Sep 2023 17:56:56 +0300
Subject: [PATCH v3 1/2] Show WAL stats (except streaming replication WAL) on
 pg_stat_io

This patch aims to showing WAL stats per backend on pg_stat_io view.

With this patch, it can be seen how many WAL operations it makes, their
context, types and total timings per backend in pg_stat_io view.

This patchset currently covers:
- IOOBJECT_WAL / IOCONTEXT_NORMAL read, write and fsync.
- IOOBJECT_WAL / IOCONTEXT_INIT write and fsync.

doesn't cover:
- Streaming replication WAL IO.
---
 src/backend/access/transam/xlog.c         |  60 ++++------
 src/backend/access/transam/xlogrecovery.c |   9 ++
 src/backend/utils/activity/pgstat_io.c    | 137 ++++++++++++++++++++--
 src/backend/utils/adt/pgstatfuncs.c       |  10 +-
 src/include/pgstat.h                      |   8 +-
 5 files changed, 172 insertions(+), 52 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index fcbde10529b..5c0da34bc45 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2200,38 +2200,22 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 			Size		nbytes;
 			Size		nleft;
 			int			written;
-			instr_time	start;
+			instr_time	io_start;
 
 			/* OK to write the page(s) */
 			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
 			nbytes = npages * (Size) XLOG_BLCKSZ;
 			nleft = nbytes;
+
+			io_start = pgstat_prepare_io_time();
 			do
 			{
 				errno = 0;
 
-				/* Measure I/O timing to write WAL data */
-				if (track_wal_io_timing)
-					INSTR_TIME_SET_CURRENT(start);
-				else
-					INSTR_TIME_SET_ZERO(start);
-
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
 				written = pg_pwrite(openLogFile, from, nleft, startoffset);
 				pgstat_report_wait_end();
 
-				/*
-				 * Increment the I/O timing and the number of times WAL data
-				 * were written out to disk.
-				 */
-				if (track_wal_io_timing)
-				{
-					instr_time	end;
-
-					INSTR_TIME_SET_CURRENT(end);
-					INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, end, start);
-				}
-
 				PendingWalStats.wal_write++;
 
 				if (written <= 0)
@@ -2256,6 +2240,9 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 				startoffset += written;
 			} while (nleft > 0);
 
+			pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
+			                        IOOP_WRITE, io_start, npages);
+
 			npages = 0;
 
 			/*
@@ -2949,6 +2936,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	int			fd;
 	int			save_errno;
 	int			open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
+	instr_time	io_start;
 
 	Assert(logtli != 0);
 
@@ -2992,6 +2980,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
 
+	/* start timing writes for stats */
+	io_start = pgstat_prepare_io_time();
+
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
 	save_errno = 0;
 	if (wal_init_zero)
@@ -3027,6 +3018,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	}
 	pgstat_report_wait_end();
 
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
+	                        io_start, 1);
+
 	if (save_errno)
 	{
 		/*
@@ -3043,6 +3037,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				 errmsg("could not write to file \"%s\": %m", tmppath)));
 	}
 
+	/* start timing fsyncs for stats */
+	io_start = pgstat_prepare_io_time();
+
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
 	if (pg_fsync(fd) != 0)
 	{
@@ -3055,6 +3052,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	}
 	pgstat_report_wait_end();
 
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
+	                        IOOP_FSYNC, io_start, 1);
+
 	if (close(fd) != 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -8186,7 +8186,7 @@ void
 issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
 {
 	char	   *msg = NULL;
-	instr_time	start;
+	instr_time	io_start;
 
 	Assert(tli != 0);
 
@@ -8199,11 +8199,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
 		sync_method == SYNC_METHOD_OPEN_DSYNC)
 		return;
 
-	/* Measure I/O timing to sync the WAL file */
-	if (track_wal_io_timing)
-		INSTR_TIME_SET_CURRENT(start);
-	else
-		INSTR_TIME_SET_ZERO(start);
+	io_start = pgstat_prepare_io_time();
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
 	switch (sync_method)
@@ -8247,18 +8243,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
 
 	pgstat_report_wait_end();
 
-	/*
-	 * Increment the I/O timing and the number of times WAL files were synced.
-	 */
-	if (track_wal_io_timing)
-	{
-		instr_time	end;
-
-		INSTR_TIME_SET_CURRENT(end);
-		INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, end, start);
-	}
-
-	PendingWalStats.wal_sync++;
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
+	                        io_start, 1);
 }
 
 /*
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index becc2bda62e..ca1041c5fd7 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -1745,6 +1745,9 @@ PerformWalRecovery(void)
 			 */
 			ApplyWalRecord(xlogreader, record, &replayTLI);
 
+			/*  Report pending statistics to the cumulative stats system  */
+			pgstat_report_wal(false);
+
 			/* Exit loop if we reached inclusive recovery target */
 			if (recoveryStopsAfter(xlogreader))
 			{
@@ -3220,6 +3223,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 	uint32		targetPageOff;
 	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
 	int			r;
+	instr_time	io_start;
 
 	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
 	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
@@ -3312,6 +3316,8 @@ retry:
 	/* Read the requested page */
 	readOff = targetPageOff;
 
+	io_start = pgstat_prepare_io_time();
+
 	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
 	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
 	if (r != XLOG_BLCKSZ)
@@ -3340,6 +3346,9 @@ retry:
 	}
 	pgstat_report_wait_end();
 
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
+	                        io_start, 1);
+
 	Assert(targetSegNo == readSegNo);
 	Assert(targetPageOff == readOff);
 	Assert(reqLen <= readLen);
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index eb7d35d4225..d5345ba2536 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -18,6 +18,7 @@
 
 #include "executor/instrument.h"
 #include "storage/bufmgr.h"
+#include "access/xlog.h"
 #include "utils/pgstat_internal.h"
 
 
@@ -87,24 +88,58 @@ pgstat_count_io_op_n(IOObject io_object, IOContext io_context, IOOp io_op, uint3
 	Assert((unsigned int) io_op < IOOP_NUM_TYPES);
 	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
 
+	/*
+	 * Only count PendingWalStats.wal_sync in pg_stat_io because
+	 * PendingWalStats.wal_write could count different than what pg_stat_io
+	 * counts e.g. system calls. Also, PendingWalStats doesn't count WAL init
+	 * or WAL reads. So, they are not included too.
+	 */
+	if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
+	    io_op == IOOP_FSYNC)
+		PendingWalStats.wal_sync += cnt;
+
 	PendingIOStats.counts[io_object][io_context][io_op] += cnt;
 
 	have_iostats = true;
 }
 
+/*
+ * Prepares io_time for pgstat_count_io_op_time() function. It needs to return
+ * current time if there is a chance that any 'time' can be tracked.
+ */
 instr_time
 pgstat_prepare_io_time(void)
 {
 	instr_time	io_start;
 
-	if (track_io_timing)
+	if (track_io_timing || track_wal_io_timing)
 		INSTR_TIME_SET_CURRENT(io_start);
 	else
+		/*
+		 * If time won't be tracked, there is no need to set io_start but
+		 * compiler complains about uninitialized use. So, set it to zero.
+		 */
 		INSTR_TIME_SET_ZERO(io_start);
 
 	return io_start;
 }
 
+/*
+ * Decide if the io timing needs be tracked
+ */
+bool
+pgstat_should_track_io_time(IOObject io_object, IOContext io_context)
+{
+	/*
+	 * io times of IOOBJECT_WAL IOObject needs to be tracked when
+	 * 'track_wal_io_timing' is set regardless of 'track_io_timing'.
+	 */
+	if (io_object == IOOBJECT_WAL)
+		return track_wal_io_timing;
+
+	return track_io_timing;
+}
+
 /*
  * Like pgstat_count_io_op_n() except it also accumulates time.
  */
@@ -112,18 +147,40 @@ void
 pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
 						instr_time start_time, uint32 cnt)
 {
-	if (track_io_timing)
+	/*
+	 * Accumulate different type of times here. We want to eventually
+	 * deduplicate these counters, so we are consolidating them first.
+	 * This also makes it easy to compare what is tracked for which
+	 * stats or instrumentation purpose.
+	 *
+	 * Some IO counters didn't moved here because they track at a different
+	 * level of granularity or at a different point in the call stack.
+	 *
+	 * pgstat_count_buffer is for pgstat_database. Since pg_stat_database only
+	 * counts blk_read_time and blk_write_time, it is set for IOOP_READ and
+	 * IOOP_WRITE.
+	 *
+	 * pgBufferUsage is for EXPLAIN. pgBufferUsage has only write and read stats
+	 * for shared/local and temporary blocks. Only shared/local blocks are counted
+	 * here.
+	 *
+	 * PendingWalStats is for counting WAL stats on pg_stat_wal. It
+	 * only counts IOOBJECT_WAL / IOCONTEXT_NORMAL.
+	 *
+	 * At the end of the if case, accumulate time for pg_stat_io.
+	 */
+	if (pgstat_should_track_io_time(io_object, io_context))
 	{
 		instr_time	io_time;
 
 		INSTR_TIME_SET_CURRENT(io_time);
 		INSTR_TIME_SUBTRACT(io_time, start_time);
 
-		if (io_op == IOOP_WRITE)
+		if (io_op == IOOP_FSYNC)
 		{
-			pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-			if (io_object == IOOBJECT_RELATION)
-				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+			/* Track IOOBJECT_WAL/IOCONTEXT_NORMAL times on PendingWalStats */
+			if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL)
+				INSTR_TIME_ADD(PendingWalStats.wal_sync_time, io_time);
 		}
 		else if (io_op == IOOP_READ)
 		{
@@ -131,11 +188,22 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
 			if (io_object == IOOBJECT_RELATION)
 				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
 		}
+		else if (io_op == IOOP_WRITE)
+		{
+			pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+			if (io_object == IOOBJECT_RELATION)
+				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+
+			/* Track IOOBJECT_WAL/IOCONTEXT_NORMAL times on PendingWalStats */
+			else if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL)
+				INSTR_TIME_ADD(PendingWalStats.wal_write_time, io_time);
+		}
 
 		INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
 					   io_time);
 	}
 
+	/* IO OP times are counted, now count IO OPs */
 	pgstat_count_io_op_n(io_object, io_context, io_op, cnt);
 }
 
@@ -216,12 +284,33 @@ pgstat_get_io_context_name(IOContext io_context)
 			return "normal";
 		case IOCONTEXT_VACUUM:
 			return "vacuum";
+		case IOCONTEXT_INIT:
+			return "init";
 	}
 
 	elog(ERROR, "unrecognized IOContext value: %d", io_context);
 	pg_unreachable();
 }
 
+/*
+ * op_bytes can change according to IOObject and IOContext.
+ * Return BLCKSZ as default because most of the
+ * IOObject / IOContext uses BLCKSZ.
+ */
+int
+pgstat_get_io_op_bytes(IOObject io_object, IOContext io_context)
+{
+	if (io_object == IOOBJECT_WAL)
+	{
+		if (io_context == IOCONTEXT_NORMAL)
+			return XLOG_BLCKSZ;
+		else if (io_context == IOCONTEXT_INIT)
+			return wal_segment_size;
+	}
+
+	return BLCKSZ;
+}
+
 const char *
 pgstat_get_io_object_name(IOObject io_object)
 {
@@ -231,6 +320,8 @@ pgstat_get_io_object_name(IOObject io_object)
 			return "relation";
 		case IOOBJECT_TEMP_RELATION:
 			return "temp relation";
+		case IOOBJECT_WAL:
+			return "wal";
 	}
 
 	elog(ERROR, "unrecognized IOObject value: %d", io_object);
@@ -312,10 +403,10 @@ pgstat_tracks_io_bktype(BackendType bktype)
 		case B_INVALID:
 		case B_ARCHIVER:
 		case B_LOGGER:
-		case B_WAL_RECEIVER:
-		case B_WAL_WRITER:
 			return false;
 
+		case B_WAL_RECEIVER:
+		case B_WAL_WRITER:
 		case B_AUTOVAC_LAUNCHER:
 		case B_AUTOVAC_WORKER:
 		case B_BACKEND:
@@ -350,6 +441,15 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
 	if (!pgstat_tracks_io_bktype(bktype))
 		return false;
 
+	/*
+	 * Currently, IO on IOOBJECT_WAL IOObject can only occur in the
+	 * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContext.
+	 */
+	if (io_object == IOOBJECT_WAL &&
+	    (io_context != IOCONTEXT_NORMAL &&
+	     io_context != IOCONTEXT_INIT))
+		return false;
+
 	/*
 	 * Currently, IO on temporary relations can only occur in the
 	 * IOCONTEXT_NORMAL IOContext.
@@ -448,6 +548,27 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
 	if (!strategy_io_context && io_op == IOOP_REUSE)
 		return false;
 
+	/*
+	 * Some IOOps are not valid in certain IOContexts & IOObjects and some
+	 * IOOps are only valid in certain IOContexts & IOObjects.
+	 */
+
+	/*
+	 * In IOOBJECT_WAL io_object, IOCONTEXT_INIT io_context means
+	 * operations done while creating new WAL segments.
+	 */
+	if(io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT &&
+	   !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC))
+	   return false;
+
+	/*
+	 * In IOOBJECT_WAL io_object, IOCONTEXT_NORMAL io_context means
+	 * operations done on already created WAL segments.
+	 */
+	if(io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
+	   !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC))
+	   return false;
+
 	/*
 	 * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
 	 * counted in the IOCONTEXT_NORMAL IOContext. See comment in
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 3b44af80066..18bff382363 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1383,6 +1383,7 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
 			for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
 			{
 				const char *context_name = pgstat_get_io_context_name(io_context);
+				int			op_bytes;
 
 				Datum		values[IO_NUM_COLUMNS] = {0};
 				bool		nulls[IO_NUM_COLUMNS] = {0};
@@ -1401,12 +1402,11 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
 				values[IO_COL_RESET_TIME] = TimestampTzGetDatum(reset_time);
 
 				/*
-				 * Hard-code this to the value of BLCKSZ for now. Future
-				 * values could include XLOG_BLCKSZ, once WAL IO is tracked,
-				 * and constant multipliers, once non-block-oriented IO (e.g.
-				 * temporary file IO) is tracked.
+				 * op_bytes can change according to IOObject and IOContext.
+				 * Get the correct op_bytes.
 				 */
-				values[IO_COL_CONVERSION] = Int64GetDatum(BLCKSZ);
+				op_bytes = pgstat_get_io_op_bytes(io_obj, io_context);
+				values[IO_COL_CONVERSION] = Int64GetDatum(op_bytes);
 
 				for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 				{
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 57a2c0866a2..21b6b8ca31c 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -277,9 +277,10 @@ typedef enum IOObject
 {
 	IOOBJECT_RELATION,
 	IOOBJECT_TEMP_RELATION,
+	IOOBJECT_WAL,
 } IOObject;
 
-#define IOOBJECT_NUM_TYPES (IOOBJECT_TEMP_RELATION + 1)
+#define IOOBJECT_NUM_TYPES (IOOBJECT_WAL + 1)
 
 typedef enum IOContext
 {
@@ -287,9 +288,10 @@ typedef enum IOContext
 	IOCONTEXT_BULKWRITE,
 	IOCONTEXT_NORMAL,
 	IOCONTEXT_VACUUM,
+	IOCONTEXT_INIT,
 } IOContext;
 
-#define IOCONTEXT_NUM_TYPES (IOCONTEXT_VACUUM + 1)
+#define IOCONTEXT_NUM_TYPES (IOCONTEXT_INIT + 1)
 
 typedef enum IOOp
 {
@@ -521,10 +523,12 @@ extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op);
 extern void pgstat_count_io_op_n(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt);
 extern instr_time pgstat_prepare_io_time(void);
+extern bool pgstat_should_track_io_time(IOObject io_object, IOContext io_context);
 extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context,
 									IOOp io_op, instr_time start_time, uint32 cnt);
 
 extern PgStat_IO *pgstat_fetch_stat_io(void);
+extern int pgstat_get_io_op_bytes(IOObject io_object, IOContext io_context);
 extern const char *pgstat_get_io_context_name(IOContext io_context);
 extern const char *pgstat_get_io_object_name(IOObject io_object);
 
-- 
2.40.1

