From bbd2271de28702fa83b47896ed2762a38701b416 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 3 Jan 2022 18:26:45 -0500
Subject: [PATCH 5/8] Add "buffers" to pgstat_reset_shared_counters

Backends count IO operations for various IO paths in their
PgBackendStatus. Upon exit, they send these counts to the stats
collector.

Prior to this commit, the IO operations stats from exited backends
persisted by the stats collector would have been been reset when
pgstat_reset_shared_counters() was invoked with target "bgwriter".
However the IO operations stats in each live backend's PgBackendStatus
would remain the same. Thus the totals calculated from both live and
exited backends would be incorrect after a reset.

Backends' PgBackendStatuses cannot be written to by another backend;
therefore, in order to calculate correct totals after a reset has
occurred, the backend sending the reset message to the stats collector
now reads the IO operation stats totals from live backends and sends
them to the stats collector to be persisted in an array of "resets"
which can be used to calculate the correct totals after a reset.

Because the IO operations statistics are broader in scope than those in
pg_stat_bgwriter, rename the reset target to "buffers". The "buffers"
target will reset all IO operations statistics and all statistics for
the pg_stat_bgwriter view maintained by the stats collector.

Author: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/20200124195226.lth52iydq2n2uilq%40alap3.anarazel.de
---
 doc/src/sgml/monitoring.sgml                |  2 +-
 src/backend/postmaster/pgstat.c             | 90 +++++++++++++++++----
 src/backend/utils/activity/backend_status.c | 27 +++++++
 src/include/pgstat.h                        | 34 ++++++--
 src/include/utils/backend_status.h          |  2 +
 5 files changed, 134 insertions(+), 21 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 3b9172f65bd..dc07cc15f53 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -5186,7 +5186,7 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
        </para>
        <para>
         Resets some cluster-wide statistics counters to zero, depending on the
-        argument.  The argument can be <literal>bgwriter</literal> to reset
+        argument.  The argument can be <literal>buffers</literal> to reset
         all the counters shown in
         the <structname>pg_stat_bgwriter</structname>
         view, <literal>archiver</literal> to reset all the counters shown in
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index f4c0fd3e8dc..57a7f0fa7e9 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1267,6 +1267,36 @@ pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
 	pgstat_send(&msg, sizeof(msg));
 }
 
+/*
+ * Helper function to collect and send live backends' current IO operations
+ * stats counters when a stats reset is initiated so that they may be deducted
+ * from future totals.
+ */
+static void
+pgstat_send_buffers_reset(PgStat_MsgResetsharedcounter *msg)
+{
+	PgStatIOPathOps ops[BACKEND_NUM_TYPES];
+
+	memset(ops, 0, sizeof(ops));
+	pgstat_report_live_backend_io_path_ops(ops);
+
+	/*
+	 * Iterate through the array of all IOOps for all IOPaths for each
+	 * BackendType.
+	 *
+	 * An individual message is sent for each backend type because sending all
+	 * IO operations in one message would exceed the PGSTAT_MAX_MSG_SIZE of
+	 * 1000.
+	 */
+	for (int i = 0; i < BACKEND_NUM_TYPES; i++)
+	{
+		msg->m_backend_resets.backend_type = idx_get_backend_type(i);
+		memcpy(&msg->m_backend_resets.iop, &ops[i],
+				sizeof(msg->m_backend_resets.iop));
+		pgstat_send(msg, sizeof(PgStat_MsgResetsharedcounter));
+	}
+}
+
 /*
  * Tell the statistics collector to reset cluster-wide shared counters.
  *
@@ -1281,19 +1311,25 @@ pgstat_reset_shared_counters(const char *target)
 	if (pgStatSock == PGINVALID_SOCKET)
 		return;
 
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
+	if (strcmp(target, "buffers") == 0)
+	{
+		msg.m_resettarget = RESET_BUFFERS;
+		pgstat_send_buffers_reset(&msg);
+		return;
+	}
+
 	if (strcmp(target, "archiver") == 0)
 		msg.m_resettarget = RESET_ARCHIVER;
-	else if (strcmp(target, "bgwriter") == 0)
-		msg.m_resettarget = RESET_BGWRITER;
 	else if (strcmp(target, "wal") == 0)
 		msg.m_resettarget = RESET_WAL;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("unrecognized reset target: \"%s\"", target),
-				 errhint("Target must be \"archiver\", \"bgwriter\", or \"wal\".")));
+				 errhint(
+					 "Target must be \"archiver\", \"buffers\", or \"wal\".")));
 
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
 	pgstat_send(&msg, sizeof(msg));
 }
 
@@ -2606,6 +2642,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	ts = GetCurrentTimestamp();
 	globalStats.bgwriter.stat_reset_timestamp = ts;
+	globalStats.buffers.stat_reset_timestamp = ts;
 	archiverStats.stat_reset_timestamp = ts;
 	walStats.stat_reset_timestamp = ts;
 
@@ -3726,21 +3763,46 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
 static void
 pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 {
-	if (msg->m_resettarget == RESET_BGWRITER)
-	{
-		/*
-		 * Reset the global, bgwriter and checkpointer statistics for the
-		 * cluster.
-		 */
-		memset(&globalStats, 0, sizeof(globalStats));
-		globalStats.bgwriter.stat_reset_timestamp = GetCurrentTimestamp();
-	}
-	else if (msg->m_resettarget == RESET_ARCHIVER)
+	if (msg->m_resettarget == RESET_ARCHIVER)
 	{
 		/* Reset the archiver statistics for the cluster. */
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_BUFFERS)
+	{
+		/*
+		 * Reset global stats for bgwriter, buffers, and checkpointer.
+		 *
+		 * Because the stats collector cannot write to live backends'
+		 * PgBackendStatuses, it maintains an array of "resets". The reset
+		 * message contains the current values of these counters for live
+		 * backends. The stats collector saves these in its "resets" array,
+		 * then zeroes out the exited backends' saved IO operations counters.
+		 * This is required to calculate an accurate total for each IO
+		 * operations counter post reset.
+		 */
+		BackendType backend_type = msg->m_backend_resets.backend_type;
+
+		/*
+		 * We reset each member individually (as opposed to resetting the
+		 * entire globalStats struct) because we need to preserve the resets
+		 * array (globalStats.buffers.resets).
+		 *
+		 * Though globalStats.buffers.ops, globalStats.bgwriter, and
+		 * globalStats.checkpointer only need to be reset once, doing so for
+		 * every message is less brittle and the extra cost is irrelevant given
+		 * how often stats are reset.
+		 */
+		memset(&globalStats.bgwriter, 0, sizeof(globalStats.bgwriter));
+		memset(&globalStats.checkpointer, 0, sizeof(globalStats.checkpointer));
+		memset(&globalStats.buffers.ops, 0, sizeof(globalStats.buffers.ops));
+		globalStats.bgwriter.stat_reset_timestamp = GetCurrentTimestamp();
+		globalStats.buffers.stat_reset_timestamp = GetCurrentTimestamp();
+		memcpy(&globalStats.buffers.resets[backend_type_get_idx(backend_type)],
+				&msg->m_backend_resets.iop.io_path_ops,
+				sizeof(msg->m_backend_resets.iop.io_path_ops));
+	}
 	else if (msg->m_resettarget == RESET_WAL)
 	{
 		/* Reset the WAL statistics for the cluster. */
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index 79410e0b2c2..87b9d0fc0d8 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -636,6 +636,33 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 	PGSTAT_END_WRITE_ACTIVITY(beentry);
 }
 
+/*
+ * Iterate through BackendStatusArray and capture live backends' stats on IOOps
+ * for all IOPaths, adding them to that backend type's member of the
+ * backend_io_path_ops structure.
+ */
+void
+pgstat_report_live_backend_io_path_ops(PgStatIOPathOps *backend_io_path_ops)
+{
+	PgBackendStatus *beentry = BackendStatusArray;
+
+	/*
+	 * Loop through live backends and capture reset values
+	 */
+	for (int i = 0; i < GetMaxBackends() + NUM_AUXPROCTYPES; i++, beentry++)
+	{
+		int idx;
+
+		/* Don't count dead backends or those with type B_INVALID. */
+		if (beentry->st_procpid == 0 || beentry->st_backendType == B_INVALID)
+			continue;
+
+		idx = backend_type_get_idx(beentry->st_backendType);
+		pgstat_sum_io_path_ops(backend_io_path_ops[idx].io_path_ops,
+				(IOOpCounters *) beentry->io_path_stats);
+	}
+}
+
 /* --------
  * pgstat_report_query_id() -
  *
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index cdb2ce60c46..9e93580f680 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -62,7 +62,7 @@ typedef int64 PgStat_Counter;
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER,
+	RESET_BUFFERS,
 	RESET_WAL
 } PgStat_Shared_Reset_Target;
 
@@ -164,6 +164,11 @@ typedef struct PgStat_TableCounts
 	PgStat_Counter t_blocks_hit;
 } PgStat_TableCounts;
 
+/* ------------------------------------------------------------
+ * Structures kept in backend local memory while accumulating counts
+ * ------------------------------------------------------------
+ */
+
 /* ----------
  * PgStat_TableStatus			Per-table status within a backend
  *
@@ -395,7 +400,8 @@ typedef struct PgStatIOPathOps
 
 /*
  * Sent by a backend to the stats collector to report all IOOps for all IOPaths
- * for a given type of a backend. This will happen when the backend exits.
+ * for a given type of a backend. This will happen when the backend exits or
+ * when stats are reset.
  */
 typedef struct PgStat_MsgIOPathOps
 {
@@ -413,9 +419,12 @@ typedef struct PgStat_MsgIOPathOps
  */
 typedef struct PgStat_BackendIOPathOps
 {
+	TimestampTz stat_reset_timestamp;
 	PgStatIOPathOps ops[BACKEND_NUM_TYPES];
+	PgStatIOPathOps resets[BACKEND_NUM_TYPES];
 } PgStat_BackendIOPathOps;
 
+
 /* ----------
  * PgStat_MsgResetcounter		Sent by the backend to tell the collector
  *								to reset counters
@@ -427,15 +436,28 @@ typedef struct PgStat_MsgResetcounter
 	Oid			m_databaseid;
 } PgStat_MsgResetcounter;
 
-/* ----------
- * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector
- *								to reset a shared counter
- * ----------
+/*
+ * Sent by the backend to tell the collector to reset a shared counter.
+ *
+ * In addition to the message header and reset target, the message also
+ * contains an array with all of the IO operations for all IO paths done by a
+ * particular backend type.
+ *
+ * This is needed because the IO operation stats for live backends cannot be
+ * safely modified by other processes. Therefore, to correctly calculate the
+ * total IO operations for a particular backend type after a reset, the balance
+ * of IO operations for live backends at the time of prior resets must be
+ * subtracted from the total IO operations.
+ *
+ * To satisfy this requirement, the process initiating the reset will read the
+ * IO operations counters from live backends and send them to the stats
+ * collector which maintains an array of reset values.
  */
 typedef struct PgStat_MsgResetsharedcounter
 {
 	PgStat_MsgHdr m_hdr;
 	PgStat_Shared_Reset_Target m_resettarget;
+	PgStat_MsgIOPathOps m_backend_resets;
 } PgStat_MsgResetsharedcounter;
 
 /* ----------
diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h
index 3de1e7c8d37..7e59c063b94 100644
--- a/src/include/utils/backend_status.h
+++ b/src/include/utils/backend_status.h
@@ -375,6 +375,7 @@ extern void pgstat_bestart(void);
 extern void pgstat_clear_backend_activity_snapshot(void);
 
 /* Activity reporting functions */
+typedef struct PgStatIOPathOps PgStatIOPathOps;
 
 static inline void
 pgstat_inc_ioop(IOOp io_op, IOPath io_path)
@@ -402,6 +403,7 @@ pgstat_inc_ioop(IOOp io_op, IOPath io_path)
 	}
 }
 extern void pgstat_report_activity(BackendState state, const char *cmd_str);
+extern void pgstat_report_live_backend_io_path_ops(PgStatIOPathOps *backend_io_path_ops);
 extern void pgstat_report_query_id(uint64 query_id, bool force);
 extern void pgstat_report_tempfile(size_t filesize);
 extern void pgstat_report_appname(const char *appname);
-- 
2.17.1

