FPW stats?

Started by Dmitry Dolgovover 7 years ago4 messages
#1Dmitry Dolgov
9erthalion6@gmail.com
1 attachment(s)

Hi,

Recently I've heard people complaining that Postgres doesn't expose any
statistics about how many full page writes happened during some time frame.
Indeed, I couldn't find any easy way to do so, and judging from my
understanding of xloginsert.c it actually can be done per database with the
attached poc patch.

I guess it can be implemented in a more effective and optimized way, but with
what I have right now first naive pgbench tests show that slowdown is about 3%.
Before I'll dig into it more, it would be nice to hear your opinion about this
idea - does it make sense to have something like this?

Attachments:

fpw_stat.patchtext/x-patch; charset=US-ASCII; name=fpw_stat.patchDownload
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 5bea073..64e450c 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -31,6 +31,7 @@
 #include "storage/proc.h"
 #include "utils/memutils.h"
 #include "pg_trace.h"
+#include "pgstat.h"
 
 /* Buffer size required to store a compressed version of backup block image */
 #define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ)
@@ -103,6 +104,15 @@ static int	max_rdatas;			/* allocated size */
 
 static bool begininsert_called = false;
 
+#define FPW_COUNTER_HASH_SIZE 100
+
+typedef struct {
+	Oid				db;
+	PgStat_Counter	counter;
+} FpwCounterEntry;
+
+static HTAB *fpwCounterStatHash = NULL;
+
 /* Memory context to hold the registered buffer and data references. */
 static MemoryContext xloginsert_cxt;
 
@@ -192,7 +202,9 @@ XLogEnsureRecordSpace(int max_block_id, int ndatas)
 void
 XLogResetInsertion(void)
 {
-	int			i;
+	int				i;
+	HASH_SEQ_STATUS fstat;
+	FpwCounterEntry *fpwEntry;
 
 	for (i = 0; i < max_registered_block_id; i++)
 		registered_buffers[i].in_use = false;
@@ -203,6 +215,14 @@ XLogResetInsertion(void)
 	mainrdata_last = (XLogRecData *) &mainrdata_head;
 	curinsert_flags = 0;
 	begininsert_called = false;
+
+
+	hash_seq_init(&fstat, fpwCounterStatHash);
+	while ((fpwEntry = (FpwCounterEntry *) hash_seq_search(&fstat)) != NULL)
+	{
+		pgstat_report_fpw(fpwEntry->db, fpwEntry->counter);
+		fpwEntry->counter = 0;
+	}
 }
 
 /*
@@ -584,7 +604,20 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
 		if (include_image)
 		{
 			Page		page = regbuf->page;
+			bool		found = false;
 			uint16		compressed_len = 0;
+			FpwCounterEntry *fpwEntry;
+			Oid			dbOid = regbuf->rnode.dbNode;
+
+			fpwEntry = (FpwCounterEntry *) hash_search(fpwCounterStatHash,
+													   &dbOid, HASH_ENTER, &found);
+			if (!found)
+			{
+				fpwEntry->counter = 0;
+				fpwEntry->db = dbOid;
+			}
+
+			fpwEntry->counter++;
 
 			/*
 			 * The page needs to be backed up, so calculate its hole length
@@ -1055,4 +1088,16 @@ InitXLogInsert(void)
 	if (hdr_scratch == NULL)
 		hdr_scratch = MemoryContextAllocZero(xloginsert_cxt,
 											 HEADER_SCRATCH_SIZE);
+
+	if (fpwCounterStatHash == NULL)
+	{
+		HASHCTL hash_ctl;
+		memset(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(Oid);
+		hash_ctl.entrysize = sizeof(FpwCounterEntry);
+
+		fpwCounterStatHash = hash_create("Full page write counter hask",
+										 FPW_COUNTER_HASH_SIZE, &hash_ctl,
+										 HASH_ELEM | HASH_BLOBS | HASH_PREALLOC | HASH_FIXED_SIZE);
+	}
 }
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 8cd8bf4..6ee5692 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -822,7 +822,8 @@ CREATE VIEW pg_stat_database AS
             pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
             pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
             pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
-            pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
+            pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset,
+            pg_stat_get_db_fpw(D.oid) AS fpw
     FROM pg_database D;
 
 CREATE VIEW pg_stat_database_conflicts AS
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 084573e..9606bb7 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -336,6 +336,7 @@ static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
+static void pgstat_recv_fpw(PgStat_MsgFpw *msg, int len);
 
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
@@ -3210,6 +3211,26 @@ pgstat_report_xact_timestamp(TimestampTz tstamp)
 	pgstat_increment_changecount_after(beentry);
 }
 
+/* --------
+ * pgstat_report_fpw() -
+ *
+ *	Tell the collector about full page writes.
+ * --------
+ */
+void
+pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter)
+{
+	PgStat_MsgFpw msg;
+
+	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FPW);
+	msg.m_databaseid = dboid;
+	msg.m_fpw_counter = fpwCounter;
+	pgstat_send(&msg, sizeof(msg));
+}
+
 /* ----------
  * pgstat_read_current_status() -
  *
@@ -4455,6 +4476,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
 					break;
 
+				case PGSTAT_MTYPE_FPW:
+					pgstat_recv_fpw((PgStat_MsgFpw *) &msg, len);
+					break;
+
 				default:
 					break;
 			}
@@ -6197,6 +6222,23 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_fpw() -
+ *
+ *	Process a FPW message.
+ * ----------
+ */
+static void
+pgstat_recv_fpw(PgStat_MsgFpw *msg, int len)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+
+	dbentry->n_fpw += msg->m_fpw_counter;
+}
+
+
+/* ----------
  * pgstat_recv_tempfile() -
  *
  *	Process a TEMPFILE message.
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 6110e40..7f0dcad 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1504,6 +1504,21 @@ pg_stat_get_db_blk_write_time(PG_FUNCTION_ARGS)
 }
 
 Datum
+pg_stat_get_db_fpw(PG_FUNCTION_ARGS)
+{
+	Oid			dbid = PG_GETARG_OID(0);
+	int64		result;
+	PgStat_StatDBEntry *dbentry;
+
+	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+		result = 0;
+	else
+		result = (int64) (dbentry->n_fpw);
+
+	PG_RETURN_INT64(result);
+}
+
+Datum
 pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_INT64(pgstat_fetch_global()->timed_checkpoints);
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 785e0fa..14c92f8 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -517,6 +517,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 	 * space if the caller correctly estimates a small table size.
 	 */
 	if ((flags & HASH_SHARED_MEM) ||
+		(flags & HASH_PREALLOC)   ||
 		nelem < hctl->nelem_alloc)
 	{
 		int			i,
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index f643f56..5c1b9da 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5344,6 +5344,10 @@
   proname => 'pg_stat_get_db_blk_write_time', provolatile => 's',
   proparallel => 'r', prorettype => 'float8', proargtypes => 'oid',
   prosrc => 'pg_stat_get_db_blk_write_time' },
+{ oid => '3423', descr => 'statistics: number of full page writes for database',
+  proname => 'pg_stat_get_db_fpw', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => 'oid',
+  prosrc => 'pg_stat_get_db_fpw' },
 { oid => '3195', descr => 'statistics: information about WAL archiver',
   proname => 'pg_stat_get_archiver', proisstrict => 'f', provolatile => 's',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index be2f592..2d67dc3 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -64,7 +64,8 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
 	PGSTAT_MTYPE_TEMPFILE,
-	PGSTAT_MTYPE_DEADLOCK
+	PGSTAT_MTYPE_DEADLOCK,
+	PGSTAT_MTYPE_FPW
 } StatMsgType;
 
 /* ----------
@@ -530,6 +531,17 @@ typedef struct PgStat_MsgDeadlock
 	Oid			m_databaseid;
 } PgStat_MsgDeadlock;
 
+/* ----------
+ * PgStat_MsgFpw			Sent by the backend to tell the collector
+ *								about a fpw that occurred.
+ * ----------
+ */
+typedef struct PgStat_MsgFpw
+{
+	PgStat_MsgHdr	m_hdr;
+	Oid				m_databaseid;
+	PgStat_Counter	m_fpw_counter;
+} PgStat_MsgFpw;
 
 /* ----------
  * PgStat_Msg					Union over all possible messages.
@@ -595,6 +607,7 @@ typedef struct PgStat_StatDBEntry
 	PgStat_Counter n_deadlocks;
 	PgStat_Counter n_block_read_time;	/* times in microseconds */
 	PgStat_Counter n_block_write_time;
+	PgStat_Counter n_fpw;
 
 	TimestampTz stat_reset_timestamp;
 	TimestampTz stats_timestamp;	/* time of db stats file update */
@@ -1196,6 +1209,7 @@ extern void pgstat_report_activity(BackendState state, const char *cmd_str);
 extern void pgstat_report_tempfile(size_t filesize);
 extern void pgstat_report_appname(const char *appname);
 extern void pgstat_report_xact_timestamp(TimestampTz tstamp);
+extern void pgstat_report_fpw(Oid dboid, PgStat_Counter fpwCounter);
 extern const char *pgstat_get_wait_event(uint32 wait_event_info);
 extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
 extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser);
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 8357faa..5bf04a2 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -94,6 +94,7 @@ typedef struct HASHCTL
 #define HASH_SHARED_MEM 0x0800	/* Hashtable is in shared memory */
 #define HASH_ATTACH		0x1000	/* Do not initialize hctl */
 #define HASH_FIXED_SIZE 0x2000	/* Initial size is a hard limit */
+#define HASH_PREALLOC   0x4000	/* Preallocate hashtable */
 
 
 /* max_dsize value to indicate expansible directory */
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index ae0cd25..acb5bbe 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1811,7 +1811,8 @@ pg_stat_database| SELECT d.oid AS datid,
     pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
     pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
     pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
-    pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset
+    pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset,
+    pg_stat_get_db_fpw(d.oid) AS fpw
    FROM pg_database d;
 pg_stat_database_conflicts| SELECT d.oid AS datid,
     d.datname,
#2Michael Paquier
michael@paquier.xyz
In reply to: Dmitry Dolgov (#1)
Re: FPW stats?

On Wed, May 02, 2018 at 12:22:34PM +0200, Dmitry Dolgov wrote:

Recently I've heard people complaining that Postgres doesn't expose any
statistics about how many full page writes happened during some time
frame.

pg_waldump --stats?

I guess it can be implemented in a more effective and optimized way, but with
what I have right now first naive pgbench tests show that slowdown is about 3%.
Before I'll dig into it more, it would be nice to hear your opinion about this
idea - does it make sense to have something like this?

The bar to add new fields into pgstat structures is usually quite high
depending on the location where those are added. For example not so
long ago there was a patch discussed about adding more fields to
PgStat_StatTabEntry, which has been rejected as pgstat can be a problem
for users with many tables. See here:
/messages/by-id/1323.1511624064@sss.pgh.pa.us

Your patch adds a new field to PgStat_StatDBEntry? Wouldn't you
increase the bottleneck of deployments with many databases? What's
actually your use case?
--
Michael

#3Dmitry Dolgov
9erthalion6@gmail.com
In reply to: Michael Paquier (#2)
Re: FPW stats?

On 2 May 2018 at 13:10, Michael Paquier <michael@paquier.xyz> wrote:
On Wed, May 02, 2018 at 12:22:34PM +0200, Dmitry Dolgov wrote:

Recently I've heard people complaining that Postgres doesn't expose any
statistics about how many full page writes happened during some time
frame.

pg_waldump --stats?

Yep, pg_waldump is the only option so far, but I thought about something that
will directly expose this info.

The bar to add new fields into pgstat structures is usually quite high
depending on the location where those are added. For example not so
long ago there was a patch discussed about adding more fields to
PgStat_StatTabEntry, which has been rejected as pgstat can be a problem
for users with many tables. See here:
/messages/by-id/1323.1511624064@sss.pgh.pa.us

Your patch adds a new field to PgStat_StatDBEntry? Wouldn't you
increase the bottleneck of deployments with many databases? What's
actually your use case?

This was discussed mostly in the context of benchmarking and understanding IO
for different workloads. I actually never realized that adding a new stats
field can have significant impact in those cases when there are too many
databases, and yeah, I'm afraid it may be not justified in this context.

#4Robert Haas
robertmhaas@gmail.com
In reply to: Michael Paquier (#2)
Re: FPW stats?

On Wed, May 2, 2018 at 7:10 AM, Michael Paquier <michael@paquier.xyz> wrote:

Your patch adds a new field to PgStat_StatDBEntry? Wouldn't you
increase the bottleneck of deployments with many databases? What's
actually your use case?

I'm a little doubtful about whether this particular thing is generally
useful but I think the bar for adding a field to PgStat_StatDBEntry is
probably a lot lower than for a per-table counter. I think adding
table-level counters is basically not happening without some kind of
rework of the infrastructure; whereas adding db-level counters seems
like it might be OK if we were convinced that they had real value.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company