Recovery conflict monitoring

Started by Magnus Haganderabout 15 years ago7 messages
#1Magnus Hagander
magnus@hagander.net
1 attachment(s)

This patch adds counters and views to monitor hot standby generated
recovery conflicts. It extends the pg_stat_database view with one
column with the total number of conflicts, and also creates a new view
pg_stat_database_conflicts that contains a breakdown of exactly what
caused the conflicts.

Documentation still pending, but comments meanwhile is of course appreciated ;)

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

Attachments:

recovery_conflict_stat.patchtext/x-patch; charset=US-ASCII; name=recovery_conflict_stat.patchDownload
*** a/src/backend/catalog/system_views.sql
--- b/src/backend/catalog/system_views.sql
***************
*** 502,508 **** CREATE VIEW pg_stat_database AS
              pg_stat_get_db_tuples_fetched(D.oid) AS tup_fetched,
              pg_stat_get_db_tuples_inserted(D.oid) AS tup_inserted,
              pg_stat_get_db_tuples_updated(D.oid) AS tup_updated,
!             pg_stat_get_db_tuples_deleted(D.oid) AS tup_deleted
      FROM pg_database D;
  
  CREATE VIEW pg_stat_user_functions AS
--- 502,521 ----
              pg_stat_get_db_tuples_fetched(D.oid) AS tup_fetched,
              pg_stat_get_db_tuples_inserted(D.oid) AS tup_inserted,
              pg_stat_get_db_tuples_updated(D.oid) AS tup_updated,
!             pg_stat_get_db_tuples_deleted(D.oid) AS tup_deleted,
! 	    pg_stat_get_db_conflict_all(D.oid) AS conflicts
!     FROM pg_database D;
! 
! CREATE VIEW pg_stat_database_conflicts AS
!     SELECT
!             D.oid AS datid,
!             D.datname AS datname,
!             pg_stat_get_db_conflict_database(D.oid) AS confl_database,
!             pg_stat_get_db_conflict_tablespace(D.oid) AS confl_tablespace,
!             pg_stat_get_db_conflict_lock(D.oid) AS confl_lock,
!             pg_stat_get_db_conflict_snapshot(D.oid) AS confl_snapshot,
!             pg_stat_get_db_conflict_bufferpin(D.oid) AS confl_bufferpin,
!             pg_stat_get_db_conflict_startup_deadlock(D.oid) AS confl_deadlock
      FROM pg_database D;
  
  CREATE VIEW pg_stat_user_functions AS
*** a/src/backend/postmaster/pgstat.c
--- b/src/backend/postmaster/pgstat.c
***************
*** 57,62 ****
--- 57,63 ----
  #include "storage/ipc.h"
  #include "storage/pg_shmem.h"
  #include "storage/pmsignal.h"
+ #include "storage/procsignal.h"
  #include "utils/guc.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
***************
*** 278,283 **** static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
--- 279,285 ----
  static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
  static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
  static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
+ static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
  
  
  /* ------------------------------------------------------------
***************
*** 1314,1319 **** pgstat_report_analyze(Relation rel, bool adopt_counts,
--- 1316,1340 ----
  	pgstat_send(&msg, sizeof(msg));
  }
  
+ /* --------
+  * pgstat_report_recovery_conflict() -
+  *
+  *  Tell the collector about a Hot Standby recovery conflict.
+  * --------
+  */
+ void
+ pgstat_report_recovery_conflict(int reason)
+ {
+ 	PgStat_MsgRecoveryConflict msg;
+ 
+ 	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ 		return;
+ 
+ 	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT);
+ 	msg.m_databaseid = MyDatabaseId;
+ 	msg.m_reason = reason;
+ 	pgstat_send(&msg, sizeof(msg));
+ }
  
  /* ----------
   * pgstat_ping() -
***************
*** 3053,3058 **** PgstatCollectorMain(int argc, char *argv[])
--- 3074,3083 ----
  					pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
  					break;
  
+ 				case PGSTAT_MTYPE_RECOVERYCONFLICT:
+ 					pgstat_recv_recoveryconflict((PgStat_MsgRecoveryConflict *) &msg, len);
+ 					break;
+ 
  				default:
  					break;
  			}
***************
*** 3129,3134 **** pgstat_get_db_entry(Oid databaseid, bool create)
--- 3154,3165 ----
  		result->n_tuples_updated = 0;
  		result->n_tuples_deleted = 0;
  		result->last_autovac_time = 0;
+ 		result->n_conflict_database = 0;
+ 		result->n_conflict_tablespace = 0;
+ 		result->n_conflict_lock = 0;
+ 		result->n_conflict_snapshot = 0;
+ 		result->n_conflict_bufferpin = 0;
+ 		result->n_conflict_startup_deadlock = 0;
  
  		memset(&hash_ctl, 0, sizeof(hash_ctl));
  		hash_ctl.keysize = sizeof(Oid);
***************
*** 4204,4209 **** pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
--- 4235,4275 ----
  }
  
  /* ----------
+  * pgstat_recv_recoveryconflict() -
+  *
+  *  Process as RECOVERYCONFLICT message.
+  * ----------
+  */
+ static void
+ pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
+ {
+ 	PgStat_StatDBEntry *dbentry;
+ 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+ 
+ 	switch (msg->m_reason)
+ 	{
+ 		case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ 			dbentry->n_conflict_database++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ 			dbentry->n_conflict_tablespace++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ 			dbentry->n_conflict_lock++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ 			dbentry->n_conflict_snapshot++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ 			dbentry->n_conflict_bufferpin++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ 			dbentry->n_conflict_startup_deadlock++;
+ 			break;
+ 	}
+ }
+ 
+ /* ----------
   * pgstat_recv_funcstat() -
   *
   *	Count what the backend has done.
*** a/src/backend/tcop/postgres.c
--- b/src/backend/tcop/postgres.c
***************
*** 2903,2917 **** ProcessInterrupts(void)
--- 2903,2923 ----
  					(errcode(ERRCODE_ADMIN_SHUTDOWN),
  					 errmsg("terminating autovacuum process due to administrator command")));
  		else if (RecoveryConflictPending && RecoveryConflictRetryable)
+ 		{
+ 			pgstat_report_recovery_conflict(RecoveryConflictReason);
  			ereport(FATAL,
  					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
  			  errmsg("terminating connection due to conflict with recovery"),
  					 errdetail_recovery_conflict()));
+ 		}
  		else if (RecoveryConflictPending)
+ 		{
+ 			pgstat_report_recovery_conflict(RecoveryConflictReason);
  			ereport(FATAL,
  					(errcode(ERRCODE_ADMIN_SHUTDOWN),
  			  errmsg("terminating connection due to conflict with recovery"),
  					 errdetail_recovery_conflict()));
+ 		}
  		else
  			ereport(FATAL,
  					(errcode(ERRCODE_ADMIN_SHUTDOWN),
***************
*** 2956,2961 **** ProcessInterrupts(void)
--- 2962,2968 ----
  			RecoveryConflictPending = false;
  			DisableNotifyInterrupt();
  			DisableCatchupInterrupt();
+ 			pgstat_report_recovery_conflict(RecoveryConflictReason);
  			if (DoingCommandRead)
  				ereport(FATAL,
  						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
*** a/src/backend/utils/adt/pgstatfuncs.c
--- b/src/backend/utils/adt/pgstatfuncs.c
***************
*** 71,76 **** extern Datum pg_stat_get_db_tuples_fetched(PG_FUNCTION_ARGS);
--- 71,83 ----
  extern Datum pg_stat_get_db_tuples_inserted(PG_FUNCTION_ARGS);
  extern Datum pg_stat_get_db_tuples_updated(PG_FUNCTION_ARGS);
  extern Datum pg_stat_get_db_tuples_deleted(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_database(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_tablespace(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_lock(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_snapshot(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_bufferpin(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_startup_deadlock(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_all(PG_FUNCTION_ARGS);
  
  extern Datum pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS);
  extern Datum pg_stat_get_bgwriter_requested_checkpoints(PG_FUNCTION_ARGS);
***************
*** 1130,1135 **** pg_stat_get_db_tuples_deleted(PG_FUNCTION_ARGS)
--- 1137,1252 ----
  }
  
  Datum
+ pg_stat_get_db_conflict_database(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_database);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_tablespace(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_tablespace);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_lock(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_lock);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_snapshot(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_snapshot);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_bufferpin(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_bufferpin);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_startup_deadlock(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_startup_deadlock);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_all(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_database +
+ 			dbentry->n_conflict_tablespace +
+ 			dbentry->n_conflict_lock +
+ 			dbentry->n_conflict_snapshot +
+ 			dbentry->n_conflict_bufferpin +
+ 			dbentry->n_conflict_startup_deadlock);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
  pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS)
  {
  	PG_RETURN_INT64(pgstat_fetch_global()->timed_checkpoints);
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 3109,3114 **** DATA(insert OID = 2761 (  pg_stat_get_db_tuples_updated PGNSP PGUID 12 1 0 0 f f
--- 3109,3128 ----
  DESCR("statistics: tuples updated in database");
  DATA(insert OID = 2762 (  pg_stat_get_db_tuples_deleted PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_tuples_deleted _null_ _null_ _null_ ));
  DESCR("statistics: tuples deleted in database");
+ DATA(insert OID = 3065 (  pg_stat_get_db_conflict_database PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_database _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by drop database");
+ DATA(insert OID = 3066 (  pg_stat_get_db_conflict_tablespace PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_tablespace _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by drop tablespace");
+ DATA(insert OID = 3067 (  pg_stat_get_db_conflict_lock PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_lock _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by relation lock");
+ DATA(insert OID = 3068 (  pg_stat_get_db_conflict_snapshot PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_snapshot _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by snapshot expiry");
+ DATA(insert OID = 3069 (  pg_stat_get_db_conflict_bufferpin PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_bufferpin _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by shared buffer pin");
+ DATA(insert OID = 3070 (  pg_stat_get_db_conflict_startup_deadlock PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_startup_deadlock _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by buffer deadlock");
+ DATA(insert OID = 3071 (  pg_stat_get_db_conflict_all PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_all _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database");
  DATA(insert OID = 2769 ( pg_stat_get_bgwriter_timed_checkpoints PGNSP PGUID 12 1 0 0 f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_bgwriter_timed_checkpoints _null_ _null_ _null_ ));
  DESCR("statistics: number of timed checkpoints started by the bgwriter");
  DATA(insert OID = 2770 ( pg_stat_get_bgwriter_requested_checkpoints PGNSP PGUID 12 1 0 0 f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_bgwriter_requested_checkpoints _null_ _null_ _null_ ));
*** a/src/include/pgstat.h
--- b/src/include/pgstat.h
***************
*** 45,51 **** typedef enum StatMsgType
  	PGSTAT_MTYPE_ANALYZE,
  	PGSTAT_MTYPE_BGWRITER,
  	PGSTAT_MTYPE_FUNCSTAT,
! 	PGSTAT_MTYPE_FUNCPURGE
  } StatMsgType;
  
  /* ----------
--- 45,52 ----
  	PGSTAT_MTYPE_ANALYZE,
  	PGSTAT_MTYPE_BGWRITER,
  	PGSTAT_MTYPE_FUNCSTAT,
! 	PGSTAT_MTYPE_FUNCPURGE,
! 	PGSTAT_MTYPE_RECOVERYCONFLICT
  } StatMsgType;
  
  /* ----------
***************
*** 364,369 **** typedef struct PgStat_MsgBgWriter
--- 365,381 ----
  	PgStat_Counter m_buf_alloc;
  } PgStat_MsgBgWriter;
  
+ /* ----------
+  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
+  * ----------
+  */
+ typedef struct PgStat_MsgRecoveryConflict
+ {
+ 	PgStat_MsgHdr m_hdr;
+ 
+ 	Oid			m_databaseid;
+ 	int			m_reason;
+ } PgStat_MsgRecoveryConflict;
  
  /* ----------
   * PgStat_FunctionCounts	The actual per-function counts kept by a backend
***************
*** 460,465 **** typedef union PgStat_Msg
--- 472,478 ----
  	PgStat_MsgBgWriter msg_bgwriter;
  	PgStat_MsgFuncstat msg_funcstat;
  	PgStat_MsgFuncpurge msg_funcpurge;
+ 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
  } PgStat_Msg;
  
  
***************
*** 490,495 **** typedef struct PgStat_StatDBEntry
--- 503,515 ----
  	PgStat_Counter n_tuples_updated;
  	PgStat_Counter n_tuples_deleted;
  	TimestampTz last_autovac_time;
+ 	PgStat_Counter n_conflict_database;
+ 	PgStat_Counter n_conflict_tablespace;
+ 	PgStat_Counter n_conflict_lock;
+ 	PgStat_Counter n_conflict_snapshot;
+ 	PgStat_Counter n_conflict_bufferpin;
+ 	PgStat_Counter n_conflict_startup_deadlock;
+ 
  
  	/*
  	 * tables and functions must be last in the struct, because we don't write
***************
*** 689,694 **** extern void pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts,
--- 709,716 ----
  extern void pgstat_report_analyze(Relation rel, bool adopt_counts,
  					  PgStat_Counter livetuples, PgStat_Counter deadtuples);
  
+ extern void pgstat_report_recovery_conflict(int reason);
+ 
  extern void pgstat_initialize(void);
  extern void pgstat_bestart(void);
  
#2Magnus Hagander
magnus@hagander.net
In reply to: Magnus Hagander (#1)
1 attachment(s)
Re: Recovery conflict monitoring

On Thu, Dec 23, 2010 at 13:09, Magnus Hagander <magnus@hagander.net> wrote:

This patch adds counters and views to monitor hot standby generated
recovery conflicts. It extends the pg_stat_database view with one
column with the total number of conflicts, and also creates a new view
pg_stat_database_conflicts that contains a breakdown of exactly what
caused the conflicts.

Documentation still pending, but comments meanwhile is of course appreciated ;)

Heikki pointed out over IM that it's pointless to count stats caused
by recovery conflict with drop database - since we drop the stats
record as soon as it arrives anyway. Here's an updated patch that
removes that, and also adds some documentation.

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

Attachments:

recovery_conflict_stat.patchtext/x-patch; charset=US-ASCII; name=recovery_conflict_stat.patchDownload
*** a/doc/src/sgml/high-availability.sgml
--- b/doc/src/sgml/high-availability.sgml
***************
*** 1541,1546 **** if (!triggered)
--- 1541,1552 ----
      approach, since <varname>vacuum_defer_cleanup_age</> is measured in
      transactions executed on the primary server.
     </para>
+ 
+    <para>
+     The number of query cancels and the reason for them can be viewed using
+     the <structname>pg_stat_database_conflicts</> system view on the slave
+     server.
+    </para>
    </sect2>
  
    <sect2 id="hot-standby-admin">
*** a/doc/src/sgml/monitoring.sgml
--- b/doc/src/sgml/monitoring.sgml
***************
*** 278,284 **** postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
        number of transactions committed and rolled back in that database,
        total disk blocks read, total buffer hits (i.e., block
        read requests avoided by finding the block already in buffer cache),
!       number of rows returned, fetched, inserted, updated and deleted.
       </entry>
       </row>
  
--- 278,294 ----
        number of transactions committed and rolled back in that database,
        total disk blocks read, total buffer hits (i.e., block
        read requests avoided by finding the block already in buffer cache),
!       number of rows returned, fetched, inserted, updated and deleted, and
!       total number of queries cancelled due to conflict with recovery.
!      </entry>
!      </row>
! 
!      <row>
!       <entry><structname>pg_stat_database_conflicts</><indexterm><primary>pg_stat_database_conflicts</primary></indexterm></entry>
!       <entry>One row per database, showing database OID, database name and
!       the number of queries that have been cancelled in this database due to
!       dropped tablespaces, lock timeouts, old snapshots, pinned buffers and
!       deadlocks.
       </entry>
       </row>
  
***************
*** 600,605 **** postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
--- 610,655 ----
       </row>
  
       <row>
+       <entry><literal><function>pg_stat_get_db_conflict_tablespace</function>(<type>oid</type>)</literal></entry>
+       <entry><type>bigint</type></entry>
+       <entry>
+        Number of queries cancelled because of recovery conflict with dropped tablespaces in database
+       </entry>
+      </row>
+ 
+      <row>
+       <entry><literal><function>pg_stat_get_db_conflict_lock</function>(<type>oid</type>)</literal></entry>
+       <entry><type>bigint</type></entry>
+       <entry>
+        Number of queries cancelled because of recovery conflict with locks in database
+       </entry>
+      </row>
+ 
+      <row>
+       <entry><literal><function>pg_stat_get_db_conflict_snapshot</function>(<type>oid</type>)</literal></entry>
+       <entry><type>bigint</type></entry>
+       <entry>
+        Number of queries cancelled because of recovery conflict with old snapshots in database
+       </entry>
+      </row>
+ 
+      <row>
+       <entry><literal><function>pg_stat_get_db_conflict_bufferpin</function>(<type>oid</type>)</literal></entry>
+       <entry><type>bigint</type></entry>
+       <entry>
+        Number of queries cancelled because of recovery conflict with pinned buffers in database
+       </entry>
+      </row>
+ 
+      <row>
+       <entry><literal><function>pg_stat_get_db_conflict_startup_deadlock</function>(<type>oid</type>)</literal></entry>
+       <entry><type>bigint</type></entry>
+       <entry>
+        Number of queries cancelled because of recovery conflict with deadlocks in database
+       </entry>
+      </row>
+ 
+      <row>
        <entry><literal><function>pg_stat_get_numscans</function>(<type>oid</type>)</literal></entry>
        <entry><type>bigint</type></entry>
        <entry>
*** a/src/backend/catalog/system_views.sql
--- b/src/backend/catalog/system_views.sql
***************
*** 502,508 **** CREATE VIEW pg_stat_database AS
              pg_stat_get_db_tuples_fetched(D.oid) AS tup_fetched,
              pg_stat_get_db_tuples_inserted(D.oid) AS tup_inserted,
              pg_stat_get_db_tuples_updated(D.oid) AS tup_updated,
!             pg_stat_get_db_tuples_deleted(D.oid) AS tup_deleted
      FROM pg_database D;
  
  CREATE VIEW pg_stat_user_functions AS
--- 502,520 ----
              pg_stat_get_db_tuples_fetched(D.oid) AS tup_fetched,
              pg_stat_get_db_tuples_inserted(D.oid) AS tup_inserted,
              pg_stat_get_db_tuples_updated(D.oid) AS tup_updated,
!             pg_stat_get_db_tuples_deleted(D.oid) AS tup_deleted,
! 	    pg_stat_get_db_conflict_all(D.oid) AS conflicts
!     FROM pg_database D;
! 
! CREATE VIEW pg_stat_database_conflicts AS
!     SELECT
!             D.oid AS datid,
!             D.datname AS datname,
!             pg_stat_get_db_conflict_tablespace(D.oid) AS confl_tablespace,
!             pg_stat_get_db_conflict_lock(D.oid) AS confl_lock,
!             pg_stat_get_db_conflict_snapshot(D.oid) AS confl_snapshot,
!             pg_stat_get_db_conflict_bufferpin(D.oid) AS confl_bufferpin,
!             pg_stat_get_db_conflict_startup_deadlock(D.oid) AS confl_deadlock
      FROM pg_database D;
  
  CREATE VIEW pg_stat_user_functions AS
*** a/src/backend/postmaster/pgstat.c
--- b/src/backend/postmaster/pgstat.c
***************
*** 57,62 ****
--- 57,63 ----
  #include "storage/ipc.h"
  #include "storage/pg_shmem.h"
  #include "storage/pmsignal.h"
+ #include "storage/procsignal.h"
  #include "utils/guc.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
***************
*** 278,283 **** static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
--- 279,285 ----
  static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
  static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
  static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
+ static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
  
  
  /* ------------------------------------------------------------
***************
*** 1314,1319 **** pgstat_report_analyze(Relation rel, bool adopt_counts,
--- 1316,1340 ----
  	pgstat_send(&msg, sizeof(msg));
  }
  
+ /* --------
+  * pgstat_report_recovery_conflict() -
+  *
+  *  Tell the collector about a Hot Standby recovery conflict.
+  * --------
+  */
+ void
+ pgstat_report_recovery_conflict(int reason)
+ {
+ 	PgStat_MsgRecoveryConflict msg;
+ 
+ 	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+ 		return;
+ 
+ 	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT);
+ 	msg.m_databaseid = MyDatabaseId;
+ 	msg.m_reason = reason;
+ 	pgstat_send(&msg, sizeof(msg));
+ }
  
  /* ----------
   * pgstat_ping() -
***************
*** 3053,3058 **** PgstatCollectorMain(int argc, char *argv[])
--- 3074,3083 ----
  					pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
  					break;
  
+ 				case PGSTAT_MTYPE_RECOVERYCONFLICT:
+ 					pgstat_recv_recoveryconflict((PgStat_MsgRecoveryConflict *) &msg, len);
+ 					break;
+ 
  				default:
  					break;
  			}
***************
*** 3129,3134 **** pgstat_get_db_entry(Oid databaseid, bool create)
--- 3154,3164 ----
  		result->n_tuples_updated = 0;
  		result->n_tuples_deleted = 0;
  		result->last_autovac_time = 0;
+ 		result->n_conflict_tablespace = 0;
+ 		result->n_conflict_lock = 0;
+ 		result->n_conflict_snapshot = 0;
+ 		result->n_conflict_bufferpin = 0;
+ 		result->n_conflict_startup_deadlock = 0;
  
  		memset(&hash_ctl, 0, sizeof(hash_ctl));
  		hash_ctl.keysize = sizeof(Oid);
***************
*** 4204,4209 **** pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
--- 4234,4278 ----
  }
  
  /* ----------
+  * pgstat_recv_recoveryconflict() -
+  *
+  *  Process as RECOVERYCONFLICT message.
+  * ----------
+  */
+ static void
+ pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
+ {
+ 	PgStat_StatDBEntry *dbentry;
+ 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+ 
+ 	switch (msg->m_reason)
+ 	{
+ 		case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ 			/*
+ 			 * Since we drop the information about the database as soon
+ 			 * as it replicates, there is no point in counting these
+ 			 * conflicts.
+ 			 */
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ 			dbentry->n_conflict_tablespace++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ 			dbentry->n_conflict_lock++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ 			dbentry->n_conflict_snapshot++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ 			dbentry->n_conflict_bufferpin++;
+ 			break;
+ 		case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ 			dbentry->n_conflict_startup_deadlock++;
+ 			break;
+ 	}
+ }
+ 
+ /* ----------
   * pgstat_recv_funcstat() -
   *
   *	Count what the backend has done.
*** a/src/backend/tcop/postgres.c
--- b/src/backend/tcop/postgres.c
***************
*** 2903,2917 **** ProcessInterrupts(void)
--- 2903,2923 ----
  					(errcode(ERRCODE_ADMIN_SHUTDOWN),
  					 errmsg("terminating autovacuum process due to administrator command")));
  		else if (RecoveryConflictPending && RecoveryConflictRetryable)
+ 		{
+ 			pgstat_report_recovery_conflict(RecoveryConflictReason);
  			ereport(FATAL,
  					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
  			  errmsg("terminating connection due to conflict with recovery"),
  					 errdetail_recovery_conflict()));
+ 		}
  		else if (RecoveryConflictPending)
+ 		{
+ 			pgstat_report_recovery_conflict(RecoveryConflictReason);
  			ereport(FATAL,
  					(errcode(ERRCODE_ADMIN_SHUTDOWN),
  			  errmsg("terminating connection due to conflict with recovery"),
  					 errdetail_recovery_conflict()));
+ 		}
  		else
  			ereport(FATAL,
  					(errcode(ERRCODE_ADMIN_SHUTDOWN),
***************
*** 2956,2961 **** ProcessInterrupts(void)
--- 2962,2968 ----
  			RecoveryConflictPending = false;
  			DisableNotifyInterrupt();
  			DisableCatchupInterrupt();
+ 			pgstat_report_recovery_conflict(RecoveryConflictReason);
  			if (DoingCommandRead)
  				ereport(FATAL,
  						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
*** a/src/backend/utils/adt/pgstatfuncs.c
--- b/src/backend/utils/adt/pgstatfuncs.c
***************
*** 71,76 **** extern Datum pg_stat_get_db_tuples_fetched(PG_FUNCTION_ARGS);
--- 71,82 ----
  extern Datum pg_stat_get_db_tuples_inserted(PG_FUNCTION_ARGS);
  extern Datum pg_stat_get_db_tuples_updated(PG_FUNCTION_ARGS);
  extern Datum pg_stat_get_db_tuples_deleted(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_tablespace(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_lock(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_snapshot(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_bufferpin(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_startup_deadlock(PG_FUNCTION_ARGS);
+ extern Datum pg_stat_get_db_conflict_all(PG_FUNCTION_ARGS);
  
  extern Datum pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS);
  extern Datum pg_stat_get_bgwriter_requested_checkpoints(PG_FUNCTION_ARGS);
***************
*** 1130,1135 **** pg_stat_get_db_tuples_deleted(PG_FUNCTION_ARGS)
--- 1136,1236 ----
  }
  
  Datum
+ pg_stat_get_db_conflict_tablespace(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_tablespace);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_lock(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_lock);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_snapshot(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_snapshot);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_bufferpin(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_bufferpin);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_startup_deadlock(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (dbentry->n_conflict_startup_deadlock);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
+ pg_stat_get_db_conflict_all(PG_FUNCTION_ARGS)
+ {
+ 	Oid			dbid = PG_GETARG_OID(0);
+ 	int64		result;
+ 	PgStat_StatDBEntry *dbentry;
+ 
+ 	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+ 		result = 0;
+ 	else
+ 		result = (int64) (
+ 			dbentry->n_conflict_tablespace +
+ 			dbentry->n_conflict_lock +
+ 			dbentry->n_conflict_snapshot +
+ 			dbentry->n_conflict_bufferpin +
+ 			dbentry->n_conflict_startup_deadlock);
+ 
+ 	PG_RETURN_INT64(result);
+ }
+ 
+ Datum
  pg_stat_get_bgwriter_timed_checkpoints(PG_FUNCTION_ARGS)
  {
  	PG_RETURN_INT64(pgstat_fetch_global()->timed_checkpoints);
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 3109,3114 **** DATA(insert OID = 2761 (  pg_stat_get_db_tuples_updated PGNSP PGUID 12 1 0 0 f f
--- 3109,3126 ----
  DESCR("statistics: tuples updated in database");
  DATA(insert OID = 2762 (  pg_stat_get_db_tuples_deleted PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_tuples_deleted _null_ _null_ _null_ ));
  DESCR("statistics: tuples deleted in database");
+ DATA(insert OID = 3065 (  pg_stat_get_db_conflict_tablespace PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_tablespace _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by drop tablespace");
+ DATA(insert OID = 3066 (  pg_stat_get_db_conflict_lock PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_lock _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by relation lock");
+ DATA(insert OID = 3067 (  pg_stat_get_db_conflict_snapshot PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_snapshot _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by snapshot expiry");
+ DATA(insert OID = 3068 (  pg_stat_get_db_conflict_bufferpin PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_bufferpin _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by shared buffer pin");
+ DATA(insert OID = 3069 (  pg_stat_get_db_conflict_startup_deadlock PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_startup_deadlock _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database caused by buffer deadlock");
+ DATA(insert OID = 3070 (  pg_stat_get_db_conflict_all PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_db_conflict_all _null_ _null_ _null_ ));
+ DESCR("statistics: recovery conflicts in database");
  DATA(insert OID = 2769 ( pg_stat_get_bgwriter_timed_checkpoints PGNSP PGUID 12 1 0 0 f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_bgwriter_timed_checkpoints _null_ _null_ _null_ ));
  DESCR("statistics: number of timed checkpoints started by the bgwriter");
  DATA(insert OID = 2770 ( pg_stat_get_bgwriter_requested_checkpoints PGNSP PGUID 12 1 0 0 f f f t f s 0 0 20 "" _null_ _null_ _null_ _null_ pg_stat_get_bgwriter_requested_checkpoints _null_ _null_ _null_ ));
*** a/src/include/pgstat.h
--- b/src/include/pgstat.h
***************
*** 45,51 **** typedef enum StatMsgType
  	PGSTAT_MTYPE_ANALYZE,
  	PGSTAT_MTYPE_BGWRITER,
  	PGSTAT_MTYPE_FUNCSTAT,
! 	PGSTAT_MTYPE_FUNCPURGE
  } StatMsgType;
  
  /* ----------
--- 45,52 ----
  	PGSTAT_MTYPE_ANALYZE,
  	PGSTAT_MTYPE_BGWRITER,
  	PGSTAT_MTYPE_FUNCSTAT,
! 	PGSTAT_MTYPE_FUNCPURGE,
! 	PGSTAT_MTYPE_RECOVERYCONFLICT
  } StatMsgType;
  
  /* ----------
***************
*** 364,369 **** typedef struct PgStat_MsgBgWriter
--- 365,381 ----
  	PgStat_Counter m_buf_alloc;
  } PgStat_MsgBgWriter;
  
+ /* ----------
+  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
+  * ----------
+  */
+ typedef struct PgStat_MsgRecoveryConflict
+ {
+ 	PgStat_MsgHdr m_hdr;
+ 
+ 	Oid			m_databaseid;
+ 	int			m_reason;
+ } PgStat_MsgRecoveryConflict;
  
  /* ----------
   * PgStat_FunctionCounts	The actual per-function counts kept by a backend
***************
*** 460,465 **** typedef union PgStat_Msg
--- 472,478 ----
  	PgStat_MsgBgWriter msg_bgwriter;
  	PgStat_MsgFuncstat msg_funcstat;
  	PgStat_MsgFuncpurge msg_funcpurge;
+ 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
  } PgStat_Msg;
  
  
***************
*** 490,495 **** typedef struct PgStat_StatDBEntry
--- 503,514 ----
  	PgStat_Counter n_tuples_updated;
  	PgStat_Counter n_tuples_deleted;
  	TimestampTz last_autovac_time;
+ 	PgStat_Counter n_conflict_tablespace;
+ 	PgStat_Counter n_conflict_lock;
+ 	PgStat_Counter n_conflict_snapshot;
+ 	PgStat_Counter n_conflict_bufferpin;
+ 	PgStat_Counter n_conflict_startup_deadlock;
+ 
  
  	/*
  	 * tables and functions must be last in the struct, because we don't write
***************
*** 689,694 **** extern void pgstat_report_vacuum(Oid tableoid, bool shared, bool adopt_counts,
--- 708,715 ----
  extern void pgstat_report_analyze(Relation rel, bool adopt_counts,
  					  PgStat_Counter livetuples, PgStat_Counter deadtuples);
  
+ extern void pgstat_report_recovery_conflict(int reason);
+ 
  extern void pgstat_initialize(void);
  extern void pgstat_bestart(void);
  
#3Simon Riggs
simon@2ndQuadrant.com
In reply to: Magnus Hagander (#2)
Re: Recovery conflict monitoring

On Mon, 2010-12-27 at 14:39 +0100, Magnus Hagander wrote:

On Thu, Dec 23, 2010 at 13:09, Magnus Hagander <magnus@hagander.net> wrote:

This patch adds counters and views to monitor hot standby generated
recovery conflicts. It extends the pg_stat_database view with one
column with the total number of conflicts, and also creates a new view
pg_stat_database_conflicts that contains a breakdown of exactly what
caused the conflicts.

Documentation still pending, but comments meanwhile is of course appreciated ;)

Heikki pointed out over IM that it's pointless to count stats caused
by recovery conflict with drop database - since we drop the stats
record as soon as it arrives anyway. Here's an updated patch that
removes that, and also adds some documentation.

I like the patch, well inspired, code in the right places AFAICS. No
code comments at all.

Couple of thoughts:

* are we safe to issue stats immediately before issuing FATAL? Won't
some of them get lost?

* Not clear what I'd do with database level information, except worry a
lot. Maybe an option to count conflicts per user would be better, since
at least we'd know exactly who was affected by those. Just an idea.

* Would it better to have a log_standby_conflicts that allowed the
opportunity to log the conflicting SQL, duration until cancelation etc?

I'd rather have what you have than nothing at all though... the new
hot_standby_feedback mode should be acting to reduce these, so it would
be useful to have this patch enabled for testing that feature.

--
Simon Riggs http://www.2ndQuadrant.com/books/
PostgreSQL Development, 24x7 Support, Training and Services

#4Magnus Hagander
magnus@hagander.net
In reply to: Simon Riggs (#3)
Re: Recovery conflict monitoring

On Mon, Jan 3, 2011 at 00:23, Simon Riggs <simon@2ndquadrant.com> wrote:

On Mon, 2010-12-27 at 14:39 +0100, Magnus Hagander wrote:

On Thu, Dec 23, 2010 at 13:09, Magnus Hagander <magnus@hagander.net> wrote:

This patch adds counters and views to monitor hot standby generated
recovery conflicts. It extends the pg_stat_database view with one
column with the total number of conflicts, and also creates a new view
pg_stat_database_conflicts that contains a breakdown of exactly what
caused the conflicts.

Documentation still pending, but comments meanwhile is of course appreciated ;)

Heikki pointed out over IM that it's pointless to count stats caused
by recovery conflict with drop database - since we drop the stats
record as soon as it arrives anyway. Here's an updated patch that
removes that, and also adds some documentation.

I like the patch, well inspired, code in the right places AFAICS. No
code comments at all.

Thanks for reviewing!

Couple of thoughts:

* are we safe to issue stats immediately before issuing FATAL? Won't
some of them get lost?

They shouldn't - not more than other stats messages. Those are often
flushed from on_shmem_exit() which I think runs even later.

* Not clear what I'd do with database level information, except worry a
lot. Maybe an option to count conflicts per user would be better, since
at least we'd know exactly who was affected by those. Just an idea.

Depends on the usage scenario. In a lot of dedicated environments you
really only have one database - but there are many environments where
you do have multiple and it's quite useful to see them separately. And
you can of course very easily sum() them up for a total count, since
it's a view... Better keep the detail than throw it away, even if that
part isn't useful in *all* cases...

Grouping by user would potentially be helpful - I agree. However, that
goes for most pgstat counters ("number of seqscans", "tuples read" etc
are interesting per user as well in some cases). So doing that right
would mean adding per-user tracking all across pgstats in some smart
way - something we don't do now at all. So I see that as a separate
issue.

* Would it better to have a log_standby_conflicts that allowed the
opportunity to log the conflicting SQL, duration until cancelation etc?

Logging is useful to figure out why you have a certain scenario, yes.
But absolutely not as a *replacement* for the statistics counters, but
as an addition. Just like we have (the now incorrectly named)
pg_stat_bgwriter view *and* log_checkpoints... Different usecases for
the same basic information.

I'd rather have what you have than nothing at all though... the new
hot_standby_feedback mode should be acting to reduce these, so it would
be useful to have this patch enabled for testing that feature.

It will help reduce it, but not take it away, right? Plus, it's an
optional feature...

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#5Simon Riggs
simon@2ndQuadrant.com
In reply to: Magnus Hagander (#4)
Re: Recovery conflict monitoring

On Mon, 2011-01-03 at 10:03 +0100, Magnus Hagander wrote:

I like the patch, well inspired, code in the right places AFAICS. No
code comments at all.

Thanks for reviewing!

All good here. Test and commit please.

--
Simon Riggs http://www.2ndQuadrant.com/books/
PostgreSQL Development, 24x7 Support, Training and Services

#6Greg Smith
greg@2ndquadrant.com
In reply to: Magnus Hagander (#2)
Re: Recovery conflict monitoring

Couple of doc suggestions:

--- doc/src/sgml/high-availability.sgml
+     The number of query cancels and the reason for them can be viewed 
using
+     the <structname>pg_stat_database_conflicts</> system view on the slave
+     server.

For compleness sake, this should also mention the per-database summary,
even though I'm not sure how valuable that view is. Also, "on a standby
server" instead of "on the slave server" here. "slave" is mentioned
once as a synonym in high-availability.sgml once, but that's it, and
there can be more than one standby you want to pull these stats from.

*** doc/src/sgml/monitoring.sgml
! number of rows returned, fetched, inserted, updated and deleted, and
! total number of queries cancelled due to conflict with recovery.

This would be clearer if it said you're talking about standby recovery
here, and possibly that this info is only available on the standby. I
could see someone reading this and thinking it's possible for general
database crash recovery to produce cancelled queries, instead of the way
connections are actually blocked until that's done.

! <entry><structname>pg_stat_database_conflicts</>
! <entry>One row per database, showing database OID, database name and
! the number of queries that have been cancelled in this database
due to
! dropped tablespaces, lock timeouts, old snapshots, pinned
buffers and
! deadlocks.

A clarification that you're talking about standby query cancellation
here might be helpful too. I don't think that's necessary for all of
the detailed pg_stat_get_* functions that regular users are less likely
to care about, just these higher level ones.

--
Greg Smith 2ndQuadrant US greg@2ndQuadrant.com Baltimore, MD
PostgreSQL Training, Services and Support www.2ndQuadrant.us
"PostgreSQL 9.0 High Performance": http://www.2ndQuadrant.com/books

#7Magnus Hagander
magnus@hagander.net
In reply to: Greg Smith (#6)
Re: Recovery conflict monitoring

On Mon, Jan 3, 2011 at 11:35, Greg Smith <greg@2ndquadrant.com> wrote:

Couple of doc suggestions:

--- doc/src/sgml/high-availability.sgml
+     The number of query cancels and the reason for them can be viewed
using
+     the <structname>pg_stat_database_conflicts</> system view on the slave
+     server.

For compleness sake, this should also mention the per-database summary, even
though I'm not sure how valuable that view is.  Also, "on a standby server"
instead of "on the slave server" here.  "slave" is mentioned once as a
synonym in high-availability.sgml once, but that's it, and there can be more
than one standby you want to pull these stats from.

Good point, changed and added.

*** doc/src/sgml/monitoring.sgml
!       number of rows returned, fetched, inserted, updated and deleted, and
!       total number of queries cancelled due to conflict with recovery.

This would be clearer if it said you're talking about standby recovery here,
and possibly that this info is only available on the standby.  I could see
someone reading this and thinking it's possible for general database crash
recovery to produce cancelled queries, instead of the way connections are
actually blocked until that's done.

!       <entry><structname>pg_stat_database_conflicts</>
!       <entry>One row per database, showing database OID, database name and
!       the number of queries that have been cancelled in this database due
to
!       dropped tablespaces, lock timeouts, old snapshots, pinned buffers
and
!       deadlocks.

A clarification that you're talking about standby query cancellation here
might be helpful too.  I don't think that's necessary for all of the
detailed pg_stat_get_* functions that regular users are less likely to care
about, just these higher level ones.

Yeah, those both make sense - I've updated the docs and am running tests ATM.

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/