From 574fdec6ed8073dbc49053e6933db0310c7c62f5 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Thu, 3 Aug 2023 16:11:16 +0300
Subject: [PATCH v2] Show WAL stats on pg_stat_io

This patch aims to showing WAL stats per backend on pg_stat_io view.

With this patch, it can be seen how many WAL operations it makes, their
context, types and total timings per backend in pg_stat_io view.
---
 src/backend/access/transam/xlog.c         | 58 ++++++----------
 src/backend/access/transam/xlogrecovery.c | 17 +++++
 src/backend/utils/activity/pgstat_io.c    | 80 +++++++++++++++++++++--
 src/backend/utils/adt/pgstatfuncs.c       |  4 +-
 src/include/pgstat.h                      | 16 ++++-
 5 files changed, 131 insertions(+), 44 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 60c0b7ec3af..ee7b85e18ca 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2188,38 +2188,22 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 			Size		nbytes;
 			Size		nleft;
 			int			written;
-			instr_time	start;
+			instr_time	io_start;
 
 			/* OK to write the page(s) */
 			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
 			nbytes = npages * (Size) XLOG_BLCKSZ;
 			nleft = nbytes;
+
+			io_start = pgstat_prepare_io_time();
 			do
 			{
 				errno = 0;
 
-				/* Measure I/O timing to write WAL data */
-				if (track_wal_io_timing)
-					INSTR_TIME_SET_CURRENT(start);
-				else
-					INSTR_TIME_SET_ZERO(start);
-
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
 				written = pg_pwrite(openLogFile, from, nleft, startoffset);
 				pgstat_report_wait_end();
 
-				/*
-				 * Increment the I/O timing and the number of times WAL data
-				 * were written out to disk.
-				 */
-				if (track_wal_io_timing)
-				{
-					instr_time	duration;
-
-					INSTR_TIME_SET_CURRENT(duration);
-					INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, duration, start);
-				}
-
 				PendingWalStats.wal_write++;
 
 				if (written <= 0)
@@ -2245,6 +2229,9 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 				startoffset += written;
 			} while (nleft > 0);
 
+			pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
+									IOOP_WRITE, io_start, npages);
+
 			npages = 0;
 
 			/*
@@ -2938,6 +2925,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	int			fd;
 	int			save_errno;
 	int			open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
+	instr_time	io_start;
 
 	Assert(logtli != 0);
 
@@ -2981,6 +2969,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
 
+	io_start = pgstat_prepare_io_time();
+
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
 	save_errno = 0;
 	if (wal_init_zero)
@@ -3016,6 +3006,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	}
 	pgstat_report_wait_end();
 
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
+						io_start, 1);
+
 	if (save_errno)
 	{
 		/*
@@ -3032,6 +3025,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				 errmsg("could not write to file \"%s\": %m", tmppath)));
 	}
 
+	io_start = pgstat_prepare_io_time();
+
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
 	if (pg_fsync(fd) != 0)
 	{
@@ -3044,6 +3039,9 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	}
 	pgstat_report_wait_end();
 
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
+							IOOP_FSYNC, io_start, 1);
+
 	if (close(fd) != 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -8174,7 +8172,7 @@ void
 issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
 {
 	char	   *msg = NULL;
-	instr_time	start;
+	instr_time	io_start;
 
 	Assert(tli != 0);
 
@@ -8187,11 +8185,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
 		sync_method == SYNC_METHOD_OPEN_DSYNC)
 		return;
 
-	/* Measure I/O timing to sync the WAL file */
-	if (track_wal_io_timing)
-		INSTR_TIME_SET_CURRENT(start);
-	else
-		INSTR_TIME_SET_ZERO(start);
+	io_start = pgstat_prepare_io_time();
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
 	switch (sync_method)
@@ -8235,18 +8229,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
 
 	pgstat_report_wait_end();
 
-	/*
-	 * Increment the I/O timing and the number of times WAL files were synced.
-	 */
-	if (track_wal_io_timing)
-	{
-		instr_time	duration;
-
-		INSTR_TIME_SET_CURRENT(duration);
-		INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, duration, start);
-	}
-
-	PendingWalStats.wal_sync++;
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
+							io_start, 1);
 }
 
 /*
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index becc2bda62e..ee850af5514 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -1587,6 +1587,7 @@ PerformWalRecovery(void)
 	XLogRecord *record;
 	bool		reachedRecoveryTarget = false;
 	TimeLineID	replayTLI;
+	uint32		pgstat_report_wal_frequency = 0;
 
 	/*
 	 * Initialize shared variables for tracking progress of WAL replay, as if
@@ -1745,6 +1746,16 @@ PerformWalRecovery(void)
 			 */
 			ApplyWalRecord(xlogreader, record, &replayTLI);
 
+			/*
+			 * Report pending statistics to the cumulative stats system once
+			 * every PGSTAT_REPORT_FREQUENCY times to not hinder performance.
+			 */
+			if (pgstat_report_wal_frequency++ == PGSTAT_REPORT_FREQUENCY)
+			{
+				pgstat_report_wal(false);
+				pgstat_report_wal_frequency = 0;
+			}
+
 			/* Exit loop if we reached inclusive recovery target */
 			if (recoveryStopsAfter(xlogreader))
 			{
@@ -3220,6 +3231,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 	uint32		targetPageOff;
 	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
 	int			r;
+	instr_time	io_start;
 
 	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
 	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
@@ -3312,6 +3324,8 @@ retry:
 	/* Read the requested page */
 	readOff = targetPageOff;
 
+	io_start = pgstat_prepare_io_time();
+
 	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
 	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
 	if (r != XLOG_BLCKSZ)
@@ -3340,6 +3354,9 @@ retry:
 	}
 	pgstat_report_wait_end();
 
+	pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
+							io_start, 1);
+
 	Assert(targetSegNo == readSegNo);
 	Assert(targetPageOff == readOff);
 	Assert(reqLen <= readLen);
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index eb7d35d4225..97689a0637e 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -87,17 +87,25 @@ pgstat_count_io_op_n(IOObject io_object, IOContext io_context, IOOp io_op, uint3
 	Assert((unsigned int) io_op < IOOP_NUM_TYPES);
 	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
 
+	if(io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
+	   io_op == IOOP_FSYNC)
+		PendingWalStats.wal_sync += cnt;
+
 	PendingIOStats.counts[io_object][io_context][io_op] += cnt;
 
 	have_iostats = true;
 }
 
+/*
+ * Prepares io_time for pgstat_count_io_op_time() function. It needs to return
+ * current time if there is a chance that any 'time' can be tracked.
+ */
 instr_time
 pgstat_prepare_io_time(void)
 {
 	instr_time	io_start;
 
-	if (track_io_timing)
+	if(track_io_timing || track_wal_io_timing)
 		INSTR_TIME_SET_CURRENT(io_start);
 	else
 		INSTR_TIME_SET_ZERO(io_start);
@@ -105,6 +113,22 @@ pgstat_prepare_io_time(void)
 	return io_start;
 }
 
+/*
+ * Decide if the io timing needs be tracked
+ */
+bool
+pgstat_should_track_io_time(IOObject io_object, IOContext io_context)
+{
+	/*
+	 * io times of IOOBJECT_WAL IOObject needs to be tracked when
+	 * 'track_wal_io_timing' is set regardless of 'track_io_timing'.
+	 */
+	if (io_object == IOOBJECT_WAL)
+		return track_wal_io_timing;
+
+	return track_io_timing;
+}
+
 /*
  * Like pgstat_count_io_op_n() except it also accumulates time.
  */
@@ -112,7 +136,7 @@ void
 pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
 						instr_time start_time, uint32 cnt)
 {
-	if (track_io_timing)
+	if (pgstat_should_track_io_time(io_object, io_context))
 	{
 		instr_time	io_time;
 
@@ -124,6 +148,9 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
 			pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
 			if (io_object == IOOBJECT_RELATION)
 				INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+			/* Track IOOBJECT_WAL/IOCONTEXT_NORMAL times on PendingWalStats */
+			else if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL)
+				INSTR_TIME_ADD(PendingWalStats.wal_write_time, io_time);
 		}
 		else if (io_op == IOOP_READ)
 		{
@@ -131,6 +158,12 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
 			if (io_object == IOOBJECT_RELATION)
 				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
 		}
+		else if (io_op == IOOP_FSYNC)
+		{
+			/* Track IOOBJECT_WAL/IOCONTEXT_NORMAL times on PendingWalStats */
+			if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL)
+				INSTR_TIME_ADD(PendingWalStats.wal_sync_time, io_time);
+		}
 
 		INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
 					   io_time);
@@ -216,12 +249,32 @@ pgstat_get_io_context_name(IOContext io_context)
 			return "normal";
 		case IOCONTEXT_VACUUM:
 			return "vacuum";
+		case IOCONTEXT_INIT:
+			return "init";
 	}
 
 	elog(ERROR, "unrecognized IOContext value: %d", io_context);
 	pg_unreachable();
 }
 
+/*
+ * op_bytes can change according to IOObject and IOContext.
+ * Return BLCKSZ as default.
+ */
+int
+pgstat_get_io_op_btyes(IOObject io_object, IOContext io_context)
+{
+	if (io_object == IOOBJECT_WAL)
+	{
+		if (io_context == IOCONTEXT_NORMAL)
+			return XLOG_BLCKSZ;
+		else if (io_context == IOCONTEXT_INIT)
+			return wal_segment_size;
+	}
+
+	return BLCKSZ;
+}
+
 const char *
 pgstat_get_io_object_name(IOObject io_object)
 {
@@ -231,6 +284,8 @@ pgstat_get_io_object_name(IOObject io_object)
 			return "relation";
 		case IOOBJECT_TEMP_RELATION:
 			return "temp relation";
+		case IOOBJECT_WAL:
+			return "wal";
 	}
 
 	elog(ERROR, "unrecognized IOObject value: %d", io_object);
@@ -312,10 +367,10 @@ pgstat_tracks_io_bktype(BackendType bktype)
 		case B_INVALID:
 		case B_ARCHIVER:
 		case B_LOGGER:
-		case B_WAL_RECEIVER:
-		case B_WAL_WRITER:
 			return false;
 
+		case B_WAL_RECEIVER:
+		case B_WAL_WRITER:
 		case B_AUTOVAC_LAUNCHER:
 		case B_AUTOVAC_WORKER:
 		case B_BACKEND:
@@ -350,6 +405,15 @@ pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
 	if (!pgstat_tracks_io_bktype(bktype))
 		return false;
 
+	/*
+	 * Currently, IO on IOOBJECT_WAL IOObject can only occur in the
+	 * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContext.
+	 */
+	if (io_object == IOOBJECT_WAL &&
+		(io_context != IOCONTEXT_NORMAL &&
+	     io_context != IOCONTEXT_INIT))
+		return false;
+
 	/*
 	 * Currently, IO on temporary relations can only occur in the
 	 * IOCONTEXT_NORMAL IOContext.
@@ -439,6 +503,14 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
 	if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
 		return false;
 
+	if(io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT &&
+	   !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC))
+	   return false;
+
+	if(io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
+	   !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC))
+	   return false;
+
 	strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
 		io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
 
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 2a4c8ef87ff..c467891e5fb 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1386,6 +1386,7 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
 			for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
 			{
 				const char *context_name = pgstat_get_io_context_name(io_context);
+				int			op_bytes;
 
 				Datum		values[IO_NUM_COLUMNS] = {0};
 				bool		nulls[IO_NUM_COLUMNS] = {0};
@@ -1409,7 +1410,8 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
 				 * and constant multipliers, once non-block-oriented IO (e.g.
 				 * temporary file IO) is tracked.
 				 */
-				values[IO_COL_CONVERSION] = Int64GetDatum(BLCKSZ);
+				op_bytes = pgstat_get_io_op_btyes(io_obj, io_context);
+				values[IO_COL_CONVERSION] = Int64GetDatum(op_bytes);
 
 				for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 				{
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 57a2c0866a2..04deb2aa509 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -277,9 +277,10 @@ typedef enum IOObject
 {
 	IOOBJECT_RELATION,
 	IOOBJECT_TEMP_RELATION,
+	IOOBJECT_WAL,
 } IOObject;
 
-#define IOOBJECT_NUM_TYPES (IOOBJECT_TEMP_RELATION + 1)
+#define IOOBJECT_NUM_TYPES (IOOBJECT_WAL + 1)
 
 typedef enum IOContext
 {
@@ -287,9 +288,10 @@ typedef enum IOContext
 	IOCONTEXT_BULKWRITE,
 	IOCONTEXT_NORMAL,
 	IOCONTEXT_VACUUM,
+	IOCONTEXT_INIT,
 } IOContext;
 
-#define IOCONTEXT_NUM_TYPES (IOCONTEXT_VACUUM + 1)
+#define IOCONTEXT_NUM_TYPES (IOCONTEXT_INIT + 1)
 
 typedef enum IOOp
 {
@@ -439,6 +441,12 @@ typedef struct PgStat_WalStats
 	TimestampTz stat_reset_timestamp;
 } PgStat_WalStats;
 
+/*
+ * When stats captured in resource-intensive/tight loops, report pending stats
+ * once every PGSTAT_REPORT_FREQUENCY times.
+ */
+#define PGSTAT_REPORT_FREQUENCY		1024
+
 /*
  * This struct stores wal-related durations as instr_time, which makes it
  * cheaper and easier to accumulate them, by not requiring type
@@ -521,10 +529,12 @@ extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op);
 extern void pgstat_count_io_op_n(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt);
 extern instr_time pgstat_prepare_io_time(void);
+extern bool pgstat_should_track_io_time(IOObject io_object, IOContext io_context);
 extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context,
 									IOOp io_op, instr_time start_time, uint32 cnt);
 
 extern PgStat_IO *pgstat_fetch_stat_io(void);
+extern int pgstat_get_io_op_btyes(IOObject io_object, IOContext io_context);
 extern const char *pgstat_get_io_context_name(IOContext io_context);
 extern const char *pgstat_get_io_object_name(IOObject io_object);
 
@@ -770,6 +780,8 @@ extern PGDLLIMPORT SessionEndType pgStatSessionEndCause;
 /*
  * Variables in pgstat_wal.c
  */
+extern PGDLLIMPORT bool track_wal_io_timing;
+extern PGDLLIMPORT int wal_segment_size;
 
 /* updated directly by backends and background processes */
 extern PGDLLIMPORT PgStat_PendingWalStats PendingWalStats;
-- 
2.40.1

