[Proposal] Add accumulated statistics for wait event

Started by 임명규 over 7 years ago, 68 messages
#1 임명규
myungkyu.lim@samsung.com
1 attachment(s)

Attachments:

wait_event_stat_patchfile.diff (application/octet-stream)
diff --git configure configure
index f891914..5dcebdb 100755
--- configure
+++ configure
@@ -654,6 +654,7 @@ LIBOBJS
 UUID_LIBS
 LDAP_LIBS_BE
 LDAP_LIBS_FE
+with_wait_event_detail
 PTHREAD_CFLAGS
 PTHREAD_LIBS
 PTHREAD_CC
@@ -864,6 +865,7 @@ with_libxslt
 with_system_tzdata
 with_zlib
 with_gnu_ld
+with_wait_event_detail
 enable_largefile
 enable_float4_byval
 enable_float8_byval
@@ -1566,6 +1568,8 @@ Optional Packages:
                           use system time zone data in DIR
   --without-zlib          do not use Zlib
   --with-gnu-ld           assume the C compiler uses GNU ld [default=no]
+  --with-wait-event-detail
+                          build with Wait Event Detail support
 
 Some influential environment variables:
   CC          C compiler command
@@ -10553,6 +10557,41 @@ fi # fi
 
 
 
+#
+# Wait Event Detail Information
+#
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with Wait Event Detail support" >&5
+$as_echo_n "checking whether to build with Wait Event Detail support... " >&6; }
+
+
+
+# Check whether --with-wait-event-detail was given.
+if test "${with_wait_event_detail+set}" = set; then :
+  withval=$with_wait_event_detail;
+  case $withval in
+    yes)
+
+$as_echo "#define USE_WAIT_EVENT_DETAIL 1" >>confdefs.h
+
+      ;;
+    no)
+      :
+      ;;
+    *)
+      as_fn_error $? "no argument expected for --with-wait-event-detail option" "$LINENO" 5
+      ;;
+  esac
+
+else
+  with_wait_event_detail=no
+
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_wait_event_detail" >&5
+$as_echo "$with_wait_event_detail" >&6; }
+
+
 
 ##
 ## Libraries
diff --git configure.in configure.in
index 5712419..861ad6d 100644
--- configure.in
+++ configure.in
@@ -1086,6 +1086,15 @@ PTHREAD_LIBS=
 AC_SUBST(PTHREAD_CFLAGS)
 AC_SUBST(PTHREAD_LIBS)
 
+#
+# Wait Event Detail Information
+#
+AC_MSG_CHECKING([whether to build with Wait Event Detail support])
+PGAC_ARG_BOOL(with, wait-event-detail, no, [build with Wait Event Detail support],
+              [AC_DEFINE([USE_WAIT_EVENT_DETAIL], 1, [Define to build with Wait Event Detail support. (--with-wait-event-detail)])])
+AC_MSG_RESULT([$with_wait_event_detail])
+AC_SUBST(with_wait_event_detail)
+
 
 ##
 ## Libraries
diff --git src/backend/postmaster/pgstat.c src/backend/postmaster/pgstat.c
index bbe7361..e9ed305 100644
--- src/backend/postmaster/pgstat.c
+++ src/backend/postmaster/pgstat.c
@@ -337,6 +337,10 @@ static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int le
 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+#ifdef USE_WAIT_EVENT_DETAIL
+static int	pgstat_get_wait_event_array_index(uint32 wait_event_info);
+#endif
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -2930,6 +2934,14 @@ pgstat_bestart(void)
 	beentry->st_progress_command = PROGRESS_COMMAND_INVALID;
 	beentry->st_progress_command_target = InvalidOid;
 
+#ifdef USE_WAIT_EVENT_DETAIL
+	/* additional statistics for wait events */
+	beentry->st_wait_event_start_timestamp = 0;
+	MemSet(beentry->st_wait_event_total_elapsed, 0, sizeof(beentry->st_wait_event_total_elapsed));
+	MemSet(beentry->st_wait_event_max_elapsed, 0, sizeof(beentry->st_wait_event_max_elapsed));
+	MemSet(beentry->st_wait_event_counting, 0, sizeof(beentry->st_wait_event_counting));
+#endif
+
 	/*
 	 * we don't zero st_progress_param here to save cycles; nobody should
 	 * examine it until st_progress_command has been set to something other
@@ -6380,3 +6392,209 @@ pgstat_clip_activity(const char *raw_activity)
 
 	return activity;
 }
+
+#ifdef USE_WAIT_EVENT_DETAIL
+/*
+ * Only called from pgstat_report_wait_start inline function
+ * (see pgstat_report_wait_start()'s documentation)
+ */
+void
+pgstat_report_wait_event_detail_start(uint32 wait_event_info)
+{
+	volatile PgBackendStatus *beentry = MyBEEntry;
+	TimestampTz current_timestamp;
+
+	if (!beentry || !wait_event_info || beentry->st_state != STATE_RUNNING)
+		return;
+
+	current_timestamp = GetCurrentTimestamp();
+	pgstat_increment_changecount_before(beentry);
+	beentry->st_wait_event_start_timestamp = current_timestamp;
+	pgstat_increment_changecount_after(beentry);
+}
+
+/*
+ * Only called from pgstat_report_wait_end inline function
+ * (see pgstat_report_wait_end()'s documentation)
+ */
+void
+pgstat_report_wait_event_detail_end(uint32 wait_event_info)
+{
+	volatile PgBackendStatus *beentry = MyBEEntry;
+	TimestampTz current_timestamp;
+	uint64		elapsed;
+	int			arrayIndex;
+
+	if (!beentry || !wait_event_info || beentry->st_state != STATE_RUNNING)
+		return;
+
+	current_timestamp = GetCurrentTimestamp();
+	elapsed = current_timestamp - beentry->st_wait_event_start_timestamp;
+	arrayIndex = pgstat_get_wait_event_array_index(wait_event_info);
+
+	pgstat_increment_changecount_before(beentry);
+	beentry->st_wait_event_start_timestamp = 0;
+	beentry->st_wait_event_total_elapsed[arrayIndex] += elapsed;
+	beentry->st_wait_event_max_elapsed[arrayIndex] = Max(beentry->st_wait_event_max_elapsed[arrayIndex], elapsed);
+	beentry->st_wait_event_counting[arrayIndex]++;
+	pgstat_increment_changecount_after(beentry);
+}
+
+/*
+ * Convert a wait_event_info number to
+ * PgBackendStatus's wait_event additional information arrays index
+ * (see PgBackendStatus.st_wait_event_*'s documentation)
+ */
+static int
+pgstat_get_wait_event_array_index(uint32 wait_event_info)
+{
+	uint32		classId;
+	uint16		eventId;
+	int			arrayIndex = 0;
+
+	classId = wait_event_info & 0xFF000000;
+	eventId = wait_event_info & 0x0000FFFF;
+
+	switch (classId)
+	{
+		case PG_WAIT_LWLOCK:
+			arrayIndex = eventId;
+			break;
+		case PG_WAIT_LOCK:
+			arrayIndex = NUM_WAIT_LWLOCK + eventId;
+			break;
+		case PG_WAIT_BUFFER_PIN:
+			arrayIndex = NUM_WAIT_LWLOCK +
+				NUM_WAIT_LOCK + eventId;
+			break;
+		case PG_WAIT_ACTIVITY:
+			arrayIndex = NUM_WAIT_LWLOCK +
+				NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN + eventId;
+			break;
+		case PG_WAIT_CLIENT:
+			arrayIndex = NUM_WAIT_LWLOCK +
+				NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+				NUM_WAIT_ACTIVITY + eventId;
+			break;
+		case PG_WAIT_EXTENSION:
+			arrayIndex = NUM_WAIT_LWLOCK +
+				NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+				NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT + eventId;
+			break;
+		case PG_WAIT_IPC:
+			arrayIndex = NUM_WAIT_LWLOCK +
+				NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+				NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+				NUM_WAIT_EXTENSION + eventId;
+			break;
+		case PG_WAIT_TIMEOUT:
+			arrayIndex = NUM_WAIT_LWLOCK +
+				NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+				NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+				NUM_WAIT_EXTENSION + NUM_WAIT_IPC + eventId;
+			break;
+		case PG_WAIT_IO:
+			arrayIndex = NUM_WAIT_LWLOCK +
+				NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+				NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+				NUM_WAIT_EXTENSION + NUM_WAIT_IPC +
+				NUM_WAIT_TIMEOUT + eventId;
+			break;
+		default:
+			arrayIndex = 0;
+			break;
+	}
+
+	return arrayIndex;
+}
+
+/*
+ * Convert a PgBackendStatus's wait_event additional information arrays index to
+ * wait_event_info number
+ * (see PgBackendStatus.st_wait_event_*'s documentation)
+ */
+uint32
+pgstat_get_wait_event_info(int wait_event_array_index)
+{
+	uint32		wait_event_info = 0;
+
+	if (wait_event_array_index >= NUM_WAIT_LWLOCK +
+		NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+		NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+		NUM_WAIT_EXTENSION + NUM_WAIT_IPC +
+		NUM_WAIT_TIMEOUT)
+	{
+		wait_event_array_index -= (NUM_WAIT_LWLOCK +
+								   NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+								   NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+								   NUM_WAIT_EXTENSION + NUM_WAIT_IPC +
+								   NUM_WAIT_TIMEOUT);
+		wait_event_info = (PG_WAIT_IO | wait_event_array_index);
+	}
+	else if (wait_event_array_index >= NUM_WAIT_LWLOCK +
+			 NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+			 NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+			 NUM_WAIT_EXTENSION + NUM_WAIT_IPC)
+	{
+		wait_event_array_index -= (NUM_WAIT_LWLOCK +
+								   NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+								   NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+								   NUM_WAIT_EXTENSION + NUM_WAIT_IPC);
+		wait_event_info = (PG_WAIT_TIMEOUT | wait_event_array_index);
+	}
+	else if (wait_event_array_index >= NUM_WAIT_LWLOCK +
+			 NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+			 NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+			 NUM_WAIT_EXTENSION)
+	{
+		wait_event_array_index -= (NUM_WAIT_LWLOCK +
+								   NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+								   NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+								   NUM_WAIT_EXTENSION);
+		wait_event_info = (PG_WAIT_IPC | wait_event_array_index);
+	}
+	else if (wait_event_array_index >= NUM_WAIT_LWLOCK +
+			 NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+			 NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT)
+	{
+		wait_event_array_index -= (NUM_WAIT_LWLOCK +
+								   NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+								   NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT);
+		wait_event_info = (PG_WAIT_EXTENSION | wait_event_array_index);
+	}
+	else if (wait_event_array_index >= NUM_WAIT_LWLOCK +
+			 NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+			 NUM_WAIT_ACTIVITY)
+	{
+		wait_event_array_index -= (NUM_WAIT_LWLOCK +
+								   NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+								   NUM_WAIT_ACTIVITY);
+		wait_event_info = (PG_WAIT_CLIENT | wait_event_array_index);
+	}
+	else if (wait_event_array_index >= NUM_WAIT_LWLOCK +
+			 NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN)
+	{
+		wait_event_array_index -= (NUM_WAIT_LWLOCK +
+								   NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN);
+		wait_event_info = (PG_WAIT_ACTIVITY | wait_event_array_index);
+	}
+	else if (wait_event_array_index >= NUM_WAIT_LWLOCK +
+			 NUM_WAIT_LOCK)
+	{
+		wait_event_array_index -= (NUM_WAIT_LWLOCK +
+								   NUM_WAIT_LOCK);
+		wait_event_info = (PG_WAIT_BUFFER_PIN | wait_event_array_index);
+	}
+	else if (wait_event_array_index >= NUM_WAIT_LWLOCK)
+	{
+		wait_event_array_index -= NUM_WAIT_LWLOCK;
+		wait_event_info = (PG_WAIT_LOCK | wait_event_array_index);
+	}
+	else
+	{
+		wait_event_info = (PG_WAIT_LWLOCK | wait_event_array_index);
+	}
+
+	return wait_event_info;
+}
+#endif
diff --git src/backend/utils/adt/pgstatfuncs.c src/backend/utils/adt/pgstatfuncs.c
index e95e347..9d8e71f 100644
--- src/backend/utils/adt/pgstatfuncs.c
+++ src/backend/utils/adt/pgstatfuncs.c
@@ -868,6 +868,152 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 	return (Datum) 0;
 }
 
+/*
+ * Returns wait event additional statistics of PG backends.
+ */
+Datum
+pg_stat_get_wait_events(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAIT_EVENT_COLS	6
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+#ifdef USE_WAIT_EVENT_DETAIL
+	{
+		int			num_backends = pgstat_fetch_stat_numbackends();
+		int			curr_backend;
+		int			pid = PG_ARGISNULL(0) ? -1 : PG_GETARG_INT32(0);
+
+		/* 1-based index */
+		for (curr_backend = 1; curr_backend <= num_backends; curr_backend++)
+		{
+			int			i;
+
+			/* for each row */
+			Datum		values[PG_STAT_GET_WAIT_EVENT_COLS];
+			bool		nulls[PG_STAT_GET_WAIT_EVENT_COLS];
+			LocalPgBackendStatus *local_beentry;
+			PgBackendStatus *beentry;
+
+			MemSet(values, 0, sizeof(values));
+			MemSet(nulls, 0, sizeof(nulls));
+			/* Get the next one in the list */
+			local_beentry = pgstat_fetch_stat_local_beentry(curr_backend);
+			if (!local_beentry)
+			{
+				/* Ignore missing entries if looking for specific PID */
+				if (pid != -1)
+					continue;
+
+				for (i = 0; i < lengthof(nulls); i++)
+					nulls[i] = true;
+
+				nulls[1] = false;
+				values[1] = CStringGetTextDatum("<backend information not available>");
+
+				tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+				continue;
+			}
+
+			beentry = &local_beentry->backendStatus;
+
+			/* If looking for specific PID, ignore all the others */
+			if (pid != -1 && beentry->st_procpid != pid)
+				continue;
+
+			/* Values only available to role member or pg_read_all_stats */
+			if (has_privs_of_role(GetUserId(), beentry->st_userid) ||
+				is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS))
+			{
+				int			i;
+				uint32		wait_event_info;
+
+				for (i = 0; i < NUM_WAIT_EVENT; i++)
+				{
+					values[0] = Int32GetDatum(beentry->st_procpid);
+
+					wait_event_info = pgstat_get_wait_event_info(i);
+
+					values[1] = CStringGetTextDatum(pgstat_get_wait_event_type(wait_event_info));
+					values[2] = CStringGetTextDatum(pgstat_get_wait_event(wait_event_info));
+					values[3] = UInt64GetDatum(beentry->st_wait_event_total_elapsed[i]);
+					values[4] = UInt64GetDatum(beentry->st_wait_event_max_elapsed[i]);
+					values[5] = UInt32GetDatum(beentry->st_wait_event_counting[i]);
+
+					tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+				}
+			}
+			else
+			{
+				values[0] = Int32GetDatum(beentry->st_procpid);
+
+				/* No permissions to view data about this session */
+				values[1] = CStringGetTextDatum("<insufficient privilege>");
+				nulls[2] = true;
+				nulls[3] = true;
+				nulls[4] = true;
+				nulls[5] = true;
+
+				tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+			}
+
+			/* If only a single backend was requested, and we found it, break. */
+			if (pid != -1)
+				break;
+		}
+	}
+#else							/* USE_WAIT_EVENT_DETAIL */
+	{
+		/* for each row */
+		Datum		values[PG_STAT_GET_WAIT_EVENT_COLS];
+		bool		nulls[PG_STAT_GET_WAIT_EVENT_COLS];
+		int			i;
+
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+		for (i = 0; i < lengthof(nulls); i++)
+			nulls[i] = true;
+
+		nulls[1] = false;
+		values[1] = CStringGetTextDatum("<wait event information not available>");
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+#endif							/* USE_WAIT_EVENT_DETAIL */
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
+
 
 Datum
 pg_backend_pid(PG_FUNCTION_ARGS)
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index a146510..d6997ee 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -5511,6 +5511,16 @@
   prorettype => 'void', proargtypes => 'oid',
   prosrc => 'pg_stat_reset_single_function_counters' },
 
+{ oid => '3423',
+  descr => 'statistics: information about currently active backends wait events additional statistics',
+  proname => 'pg_stat_get_wait_events', prorows => '100', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,int4,text,text,int8,int8,int4}',
+  proargmodes => '{i,o,o,o,o,o,o}',
+  proargnames => '{pid,pid,wait_event_type,wait_event,total_elapsed,max_elapsed,counting}',
+  prosrc => 'pg_stat_get_wait_events' },
+
 { oid => '3163', descr => 'current trigger depth',
   proname => 'pg_trigger_depth', provolatile => 's', proparallel => 'r',
   prorettype => 'int4', proargtypes => '', prosrc => 'pg_trigger_depth' },
diff --git src/include/pg_config.h.in src/include/pg_config.h.in
index f9fb92f..895e49c 100644
--- src/include/pg_config.h.in
+++ src/include/pg_config.h.in
@@ -941,6 +941,10 @@
 /* Define to select unnamed POSIX semaphores. */
 #undef USE_UNNAMED_POSIX_SEMAPHORES
 
+/* Define to build with Wait Event Detail support. (--with-wait-event-detail)
+   */
+#undef USE_WAIT_EVENT_DETAIL
+
 /* Define to use native Windows API for random number generation */
 #undef USE_WIN32_RANDOM
 
diff --git src/include/pgstat.h src/include/pgstat.h
index d59c24a..dbd08b1 100644
--- src/include/pgstat.h
+++ src/include/pgstat.h
@@ -938,6 +938,33 @@ typedef enum ProgressCommandType
 
 #define PGSTAT_NUM_PROGRESS_PARAM	10
 
+#ifdef USE_WAIT_EVENT_DETAIL
+/* ----------
+ * Total number of wait event.
+ * Wait Classes num(9) + Wait Events last enum.
+ * ----------
+ */
+#define NUM_WAIT_LWLOCK				(LWTRANCHE_FIRST_USER_DEFINED)
+#define NUM_WAIT_LOCK				(LOCKTAG_LAST_TYPE + 1)
+#define NUM_WAIT_BUFFER_PIN			1
+#define NUM_WAIT_ACTIVITY			((WAIT_EVENT_WAL_WRITER_MAIN & 0x0000FFFF) + 1)
+#define NUM_WAIT_CLIENT				((WAIT_EVENT_WAL_SENDER_WRITE_DATA & 0x0000FFFF) + 1)
+#define NUM_WAIT_EXTENSION			1
+#define NUM_WAIT_IPC				((WAIT_EVENT_SYNC_REP & 0x0000FFFF) + 1)
+#define NUM_WAIT_TIMEOUT			((WAIT_EVENT_RECOVERY_APPLY_DELAY & 0x0000FFFF) + 1)
+#define NUM_WAIT_IO					((WAIT_EVENT_WAL_WRITE & 0x0000FFFF) + 1)
+
+#define NUM_WAIT_EVENT	(NUM_WAIT_LWLOCK		\
+						+ NUM_WAIT_LOCK			\
+						+ NUM_WAIT_BUFFER_PIN	\
+						+ NUM_WAIT_ACTIVITY		\
+						+ NUM_WAIT_CLIENT		\
+						+ NUM_WAIT_EXTENSION	\
+						+ NUM_WAIT_IPC			\
+						+ NUM_WAIT_TIMEOUT		\
+						+ NUM_WAIT_IO)
+#endif
+
 /* ----------
  * Shared-memory data structures
  * ----------
@@ -1041,6 +1068,18 @@ typedef struct PgBackendStatus
 	ProgressCommandType st_progress_command;
 	Oid			st_progress_command_target;
 	int64		st_progress_param[PGSTAT_NUM_PROGRESS_PARAM];
+
+#ifdef USE_WAIT_EVENT_DETAIL
+
+	/*
+	 * proc's wait_event additional information. each wait_events elapsed time
+	 * & count.
+	 */
+	TimestampTz st_wait_event_start_timestamp;
+	uint64		st_wait_event_total_elapsed[NUM_WAIT_EVENT];
+	uint64		st_wait_event_max_elapsed[NUM_WAIT_EVENT];
+	uint32		st_wait_event_counting[NUM_WAIT_EVENT];
+#endif
 } PgBackendStatus;
 
 /*
@@ -1218,6 +1257,13 @@ extern void pgstat_initstats(Relation rel);
 
 extern char *pgstat_clip_activity(const char *raw_activity);
 
+#ifdef USE_WAIT_EVENT_DETAIL
+extern void pgstat_report_wait_event_detail_start(uint32 wait_event_info);
+extern void pgstat_report_wait_event_detail_end(uint32 wait_event_info);
+extern uint32 pgstat_get_wait_event_info(int wait_event_array_index);
+#endif
+
+
 /* ----------
  * pgstat_report_wait_start() -
  *
@@ -1246,6 +1292,10 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	proc->wait_event_info = wait_event_info;
+
+#ifdef USE_WAIT_EVENT_DETAIL
+	pgstat_report_wait_event_detail_start(wait_event_info);
+#endif
 }
 
 /* ----------
@@ -1265,6 +1315,10 @@ pgstat_report_wait_end(void)
 	if (!pgstat_track_activities || !proc)
 		return;
 
+#ifdef USE_WAIT_EVENT_DETAIL
+	pgstat_report_wait_event_detail_end(proc->wait_event_info);
+#endif
+
 	/*
 	 * Since this is a four-byte field which is always read and written as
 	 * four-bytes, updates are atomic.
#2 Michael Paquier
michael@paquier.xyz
In reply to: 임명규 (#1)
Re: [Proposal] Add accumulated statistics for wait event

On Mon, Jul 23, 2018 at 04:04:42PM +0900, 임명규 wrote:

This proposal is about recording additional statistics of wait events.

You should avoid sending things in html format, text format being
recommended on those mailing lists... The patch applies after using
patch -p0 by the way.

I would recommend that you generate your patches using "git
format-patch". Here are general guidelines on the matter:
https://wiki.postgresql.org/wiki/Submitting_a_Patch
Please study those guidelines, those are helpful if you want to get
yourself familiar with community process.

I have comments about your patch. First, I don't think that you need to
count precisely the number of wait events triggered as usually when it
comes to analyzing a workload's bottleneck what counts is a periodic
*sampling* of events, patterns which can be fetched already from
pg_stat_activity and stored say in a different place. This can be
achieved easily by using a cron job with an INSERT SELECT query which
adds data on a relation storing the event counts. I took the time to
look at your patch, and here is some feedback.

This does not need a configure switch. I assume that what you did is
good to learn the internals of ./configure though.

There is no documentation. What does the patch do? What is it useful
for?

+       case PG_WAIT_IPC:
+           arrayIndex = NUM_WAIT_LWLOCK +
+               NUM_WAIT_LOCK + NUM_WAIT_BUFFER_PIN +
+               NUM_WAIT_ACTIVITY + NUM_WAIT_CLIENT +
+               NUM_WAIT_EXTENSION + eventId;
+           break;
This is ugly and unmaintainable style.  You could perhaps have
considered an enum instead.
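
For illustration only (this is not from the patch or from this message): a table-driven version of pgstat_get_wait_event_array_index(), in the spirit of the enum suggestion above, could look roughly like the sketch below. It reuses the NUM_WAIT_* constants the patch defines; the struct and array names are invented here.

typedef struct WaitClassEntry
{
	uint32		classId;		/* PG_WAIT_* class identifier */
	int			nevents;		/* number of events in this class */
} WaitClassEntry;

static const WaitClassEntry wait_classes[] = {
	{PG_WAIT_LWLOCK, NUM_WAIT_LWLOCK},
	{PG_WAIT_LOCK, NUM_WAIT_LOCK},
	{PG_WAIT_BUFFER_PIN, NUM_WAIT_BUFFER_PIN},
	{PG_WAIT_ACTIVITY, NUM_WAIT_ACTIVITY},
	{PG_WAIT_CLIENT, NUM_WAIT_CLIENT},
	{PG_WAIT_EXTENSION, NUM_WAIT_EXTENSION},
	{PG_WAIT_IPC, NUM_WAIT_IPC},
	{PG_WAIT_TIMEOUT, NUM_WAIT_TIMEOUT},
	{PG_WAIT_IO, NUM_WAIT_IO}
};

static int
pgstat_get_wait_event_array_index(uint32 wait_event_info)
{
	uint32		classId = wait_event_info & 0xFF000000;
	uint16		eventId = wait_event_info & 0x0000FFFF;
	int			offset = 0;
	int			i;

	for (i = 0; i < lengthof(wait_classes); i++)
	{
		if (wait_classes[i].classId == classId)
			return offset + eventId;
		offset += wait_classes[i].nevents;
	}

	return 0;					/* unknown class: same fallback as the patch */
}

The reverse mapping in pgstat_get_wait_event_info() could walk the same table, which would remove both long cascades.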

pg_stat_get_wait_events should be a set-returning function, which you
could filter at SQL level using a PID, so no need of it as an argument.

What's the performance penalty? I am pretty sure that this is
measurable as wait events are stored for a backend for each I/O
operation as well, and you are calling a C routine within an inlined
function which is designed to be light-weight, doing only a four-byte
atomic operation.
--
Michael

#3 Thomas Kellerer
spam_eater@gmx.net
In reply to: 임명규 (#1)
Re: [Proposal] Add accumulated statistics for wait event

This proposal is about recording additional statistics of wait events.

The pg_stat_activity view is very useful when analyzing performance
issues, but it is difficult to get detailed information about wait
events when you need to dive deeper into a performance analysis,
because pg_stat_activity only shows the current wait status of each
backend.

There is an extension that samples the information from pg_stat_activity
(similar to Oracle's ASH).

https://github.com/postgrespro/pg_wait_sampling

Maybe it's worthwhile to combine the efforts?


#4 Egor Rogov
e.rogov@postgrespro.ru
In reply to: Michael Paquier (#2)
Re: [Proposal] Add accumulated statistics for wait event

Hi,

that will be a great feature.

On 23.07.2018 10:53, Michael Paquier wrote:

I have comments about your patch. First, I don't think that you need to
count precisely the number of wait events triggered as usually when it
comes to analyzing a workload's bottleneck what counts is a periodic
*sampling* of events

If I get it right, this patch is not about sampling. It gathers
cumulative statistics, which are regrettably missing in PostgreSQL.
That's why it should not only count the exact number of wait events;
min time and stddev would also be helpful (as in pg_stat_statements).

--
Egor Rogov
Postgres Professional http://www.postgrespro.com

#5 Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Michael Paquier (#2)
Re: [Proposal] Add accumulated statistics for wait event

On Mon, Jul 23, 2018 at 10:53 AM Michael Paquier <michael@paquier.xyz> wrote:

What's the performance penalty? I am pretty sure that this is
measurable as wait events are stored for a backend for each I/O
operation as well, and you are calling a C routine within an inlined
function which is designed to be light-weight, doing only a four-byte
atomic operation.

Yes, the question is the overhead of measuring the durations of
individual wait events. It has been proposed before, and there have been
heated debates about that (see threads [1-3]). There doesn't seem to be
a conclusion about this feature. One thing can be said for sure: the
performance penalty heavily depends on OS/hardware/workload. In some
cases the overhead is negligible, but in other cases it appears to be
huge.

1. /messages/by-id/559D4729.9080704@postgrespro.ru
2. /messages/by-id/CA+TgmoYd3GTz2_mJfUHF+RPe-bCy75ytJeKVv9x-o+SonCGApw@mail.gmail.com
3. /messages/by-id/CAG95seUAQVj09KzLwU+z1B-GqdMqerzEkPFR3hn0q88XzMq-PA@mail.gmail.com

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#6 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Michael Paquier (#2)
Re: [Proposal] Add accumulated statistics for wait event

Michael Paquier <michael@paquier.xyz> writes:

This does not need a configure switch.

It probably is there because the OP realizes that most people wouldn't
accept having this code compiled in.

What's the performance penalty? I am pretty sure that this is
measurable as wait events are stored for a backend for each I/O
operation as well, and you are calling a C routine within an inlined
function which is designed to be light-weight, doing only a four-byte
atomic operation.

On machines with slow gettimeofday(), I suspect the cost of this
patch would be staggering. Even with relatively fast gettimeofday,
it doesn't look acceptable for calls in hot code paths (for instance,
lwlock.c).

A bigger problem is that it breaks stuff. There are countless
calls to pgstat_report_wait_start/pgstat_report_wait_end that
assume they have no side-effects (for example, on errno) and
can never fail. I wouldn't trust GetCurrentTimestamp() for either.
If the report_wait calls can't be dropped into code with *complete*
certainty that they're safe, that's a big cost.

Why exactly is this insisting on logging timestamps and not,
say, just incrementing a counter? I think doing it like this
is almost certain to end in rejection.

regards, tom lane

#7 MyungKyu LIM
myungkyu.lim@samsung.com
In reply to: Michael Paquier (#2)
RE: Re: [Proposal] Add accumulated statistics for wait event

 2018-07-23 16:53 (GMT+9), Michael Paquier wrote:

On Mon, Jul 23, 2018 at 04:04:42PM +0900, 임명규 wrote:

This proposal is about recording additional statistics of wait events.

I have comments about your patch. First, I don't think that you need to
count precisely the number of wait events triggered as usually when it
comes to analyzing a workload's bottleneck what counts is a periodic
*sampling* of events, patterns which can be fetched already from
pg_stat_activity and stored say in a different place.

Thanks for your feedback.

This proposal is not about *sampling*.
Accumulated statistics of wait events are useful for solving issues,
because they measure exact data.

In some cases, sampling cannot find the cause of an issue, because it loses detail.
For example, a throughput issue may occur (e.g. disk I/O) where each wait point
lasts only a few milliseconds.
In such a case, it is quite likely that sampling will not find the cause.

This is ugly and unmaintainable style.

I'm sorry, you're right.
Please consider it a PoC.

What's the performance penalty?

I have the same worries. I just tried pgbench several times.
Let me know if there is a good method for checking the performance impact.

Best regards,
MyungKyu, Lim

#8 MyungKyu LIM
myungkyu.lim@samsung.com
In reply to: Alexander Korotkov (#5)
RE: Re: [Proposal] Add accumulated statistics for wait event

On Mon, Jul 23, 2018 at 10:53 AM Michael Paquier <michael@paquier.xyz> wrote:

What's the performance penalty? I am pretty sure that this is
measurable as wait events are stored for a backend for each I/O
operation as well, and you are calling a C routine within an inlined
function which is designed to be light-weight, doing only a four-byte
atomic operation.

Yes, the question is the overhead of measuring the durations of individual wait events. It has been proposed before, and there have been heated debates about that (see threads [1-3]). There doesn't seem
to be a conclusion about this feature. One thing can be said for sure:
the performance penalty heavily depends on OS/hardware/workload. In some cases the overhead is negligible, but in other cases it appears to be huge.

Thanks for the good information.
I agree, the performance penalty exists.
But wait statistics are in demand and useful. In some cases it is worth sacrificing some performance to have them.

So, what do you think about developing this as an extension? Here is another proposal:
this feature could be implemented as an extension if hooks were provided in the following functions (see the sketch below),
- pgstat_report_wait_start
- pgstat_report_wait_end
The feature could then be turned on/off by online configuration when necessary.
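
A minimal sketch of what such hooks could look like (entirely hypothetical: these hook types do not exist in PostgreSQL, and the names below are invented for illustration only):

/* Hypothetical hook declarations; these do not exist in core PostgreSQL. */
typedef void (*wait_event_start_hook_type) (uint32 wait_event_info);
typedef void (*wait_event_end_hook_type) (uint32 wait_event_info);

wait_event_start_hook_type wait_event_start_hook = NULL;
wait_event_end_hook_type wait_event_end_hook = NULL;

/*
 * pgstat_report_wait_start() would call the start hook right after setting
 * proc->wait_event_info:
 *
 *     if (wait_event_start_hook)
 *         (*wait_event_start_hook) (wait_event_info);
 *
 * and pgstat_report_wait_end() would call wait_event_end_hook with
 * proc->wait_event_info before clearing it.
 */

An extension could then install its callbacks from _PG_init() and gate the collection behind its own GUC, so the overhead would only be paid when the feature is switched on.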

Best regards,
MyungKyu, Lim
 

#9 Phil Florent
philflorent@hotmail.com
In reply to: MyungKyu LIM (#8)
RE: Re: [Proposal] Add accumulated statistics for wait event

Hi,

I am skeptical about accumulated statistics.

pg_stat_activity now gives the necessary information about wait events. It can easily be used with a polling system that sleeps most of the time to limit the overhead. Measuring the duration of individual wait events is not necessary to know the distribution of the load.

You can aggregate the results of the polling by application, query, wait events or whatever you want.

I wrote a script for that which can be used interactively or in batch mode to produce reports, but many solutions exist.

Best regards

Phil


#10 Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Tom Lane (#6)
Re: [Proposal] Add accumulated statistics for wait event

On 07/23/2018 03:57 PM, Tom Lane wrote:

Michael Paquier <michael@paquier.xyz> writes:

This does not need a configure switch.

It probably is there because the OP realizes that most people wouldn't
accept having this code compiled in.

What's the performance penalty? I am pretty sure that this is
measurable as wait events are stored for a backend for each I/O
operation as well, and you are calling a C routine within an inlined
function which is designed to be light-weight, doing only a four-byte
atomic operation.

On machines with slow gettimeofday(), I suspect the cost of this
patch would be staggering. Even with relatively fast gettimeofday,
it doesn't look acceptable for calls in hot code paths (for instance,
lwlock.c).

Yeah. I wonder if we could measure the time for a small fraction of the
wait events, and estimate the actual duration from that.
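
For illustration only, here is one rough way to do that with the fields the patch already adds: time only one wait in every N occurrences of an event and scale the measured time by N, so GetCurrentTimestamp() is called far less often. The sampling-rate constant is invented, and the changecount protocol is omitted; this is a sketch, not a worked-out patch.

#define WAIT_EVENT_TIMING_SAMPLE_RATE 16	/* invented for this sketch */

void
pgstat_report_wait_event_detail_start(uint32 wait_event_info)
{
	volatile PgBackendStatus *beentry = MyBEEntry;
	int			idx;

	if (!beentry || !wait_event_info || beentry->st_state != STATE_RUNNING)
		return;

	idx = pgstat_get_wait_event_array_index(wait_event_info);

	/* Take a timestamp for only one wait out of every SAMPLE_RATE. */
	if (beentry->st_wait_event_counting[idx] % WAIT_EVENT_TIMING_SAMPLE_RATE == 0)
		beentry->st_wait_event_start_timestamp = GetCurrentTimestamp();
	else
		beentry->st_wait_event_start_timestamp = 0;
}

void
pgstat_report_wait_event_detail_end(uint32 wait_event_info)
{
	volatile PgBackendStatus *beentry = MyBEEntry;
	int			idx;

	if (!beentry || !wait_event_info || beentry->st_state != STATE_RUNNING)
		return;

	idx = pgstat_get_wait_event_array_index(wait_event_info);
	beentry->st_wait_event_counting[idx]++;

	if (beentry->st_wait_event_start_timestamp != 0)
	{
		uint64		elapsed = GetCurrentTimestamp() -
			beentry->st_wait_event_start_timestamp;

		/* This timed wait stands in for SAMPLE_RATE untimed ones. */
		beentry->st_wait_event_total_elapsed[idx] +=
			elapsed * WAIT_EVENT_TIMING_SAMPLE_RATE;
		beentry->st_wait_event_start_timestamp = 0;
	}
}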

A bigger problem is that it breaks stuff. There are countless
calls to pgstat_report_wait_start/pgstat_report_wait_end that
assume they have no side-effects (for example, on errno) and
can never fail. I wouldn't trust GetCurrentTimestamp() for either.
If the report_wait calls can't be dropped into code with *complete*
certainty that they're safe, that's a big cost.

Why exactly is this insisting on logging timestamps and not,
say, just incrementing a counter? I think doing it like this
is almost certain to end in rejection.

Because the number of times you hit a wait event may not correlate with
the time you spent waiting on it. So a simple counter is not the most
useful thing.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#11 Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: MyungKyu LIM (#7)
Re: [Proposal] Add accumulated statistics for wait event

On 07/24/2018 12:06 PM, MyungKyu LIM wrote:

 2018-07-23 16:53 (GMT+9), Michael Paquier wrote:

On Mon, Jul 23, 2018 at 04:04:42PM +0900, 임명규 wrote:

This proposal is about recording additional statistics of wait events.

I have comments about your patch. First, I don't think that you need to
count precisely the number of wait events triggered as usually when it
comes to analyzing a workload's bottleneck what counts is a periodic
*sampling* of events, patterns which can be fetched already from
pg_stat_activity and stored say in a different place.

Thanks for your feedback.

This proposal is not about *sampling*.
Accumulated statistics of wait events are useful for solving issues,
because they measure exact data.

In some cases, sampling cannot find the cause of an issue, because it loses detail.
For example, a throughput issue may occur (e.g. disk I/O) where each wait point
lasts only a few milliseconds.
In such a case, it is quite likely that sampling will not find the cause.

I think it's highly likely that it will find the cause. The idea of
sampling is that while you don't measure the timing directly, you can
infer it from the frequency of the wait events in the samples. So if you
see that a backend reports a particular wait event in 75% of samples, it
probably spent 75% of its time waiting on it.

I'm not saying sampling is perfect and it certainly is less convenient
than what you propose.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#12 Phil Florent
philflorent@hotmail.com
In reply to: Tomas Vondra (#11)
RE: [Proposal] Add accumulated statistics for wait event

Hi,

In some cases, sampling cannot find the cause of an issue, because it loses detail.
For example, a throughput issue may occur (e.g. disk I/O) where each wait point
lasts only a few milliseconds.

It loses non meaningful details and it's in fact a good point. In this example, sampling will definitely find the cause and won't cost resources.

Being as precise as possible to define a wait event is very useful but knowing precisely the duration of each event is less useful in terms of tuning.

Example of sampling + group by/order by percentage of activity :

./t -d 5 -o "application_name, wait_event_type" -o "application_name, wait_event, wait_event_type"
traqueur 2.05.00 - performance tool for PostgreSQL 9.3 => 11
INFORMATION, no connection parameters provided, connecting to dedicated database ...
INFORMATION, connected to dedicated database traqueur
INFORMATION, PostgreSQL version : 110000
INFORMATION, sql preparation ...
INFORMATION, sql execution ...
busy_pc | distinct_exe | application_name | wait_event_type
---------+--------------+------------------+-----------------
206 | 8 / 103 | mperf |
62 | 2 / 31 | mperf | LWLock
20 | 3 / 10 | mperf | IO
12 | 1 / 6 | mperf | Client
(4 rows)

busy_pc | distinct_exe | application_name | wait_event | wait_event_type
---------+--------------+------------------+-----------------------+-----------------
206 | 8 / 103 | mperf | |
62 | 2 / 31 | mperf | WALWriteLock | LWLock
14 | 1 / 7 | mperf | DataFileImmediateSync | IO
12 | 1 / 6 | mperf | ClientRead | Client
2 | 1 / 1 | mperf | DataFileWrite | IO
2 | 1 / 1 | mperf | DataFileRead | IO
2 | 1 / 1 | mperf | WALInitWrite | IO

No need to know the exact duration of each event to identify the bottleneck(s)...

Best regards

Phil


#13 Michael Paquier
michael@paquier.xyz
In reply to: Phil Florent (#12)
Re: [Proposal] Add accumulated statistics for wait event

On Tue, Jul 24, 2018 at 04:23:03PM +0000, Phil Florent wrote:

It loses non meaningful details and it's in fact a good point. In this
example, sampling will definitely find the cause and won't cost
resources.

The higher the sampling frequency, the more details you get, and the
more load you put on the instance. So if you were able to take an infinity
of samples, where registering the same event multiple times for the same
backend also matters because its overall weight gets higher and it shows
up higher in profiles, then you would be able to converge to the set of
results that this patch adds. The sampling method, especially its
frequency, is something controlled by the client and not the server.
Approaches like the one proposed here push the load onto the server side,
unconditionally, for *all* backends, and this has its cost.

Even if you have spiky workloads, sampling may miss those, but even with
adding counters for each event you would need to query the table holding
the counters at an insane frequency to be able to perhaps get something
out of it as you need to do sampling of the counters as well to extract
deltas.

As Tomas has mentioned up-thread, sampling is light-weight, as-is the
current design for wait events. Even if it is not perfect because it
cannot give exact numbers, it would find bottlenecks in really most
cases, and that's what matters. If not, increasing the sampling
frequency makes things easier to detect as well. What would be the
point of taking only one sample every checkpoint for example?

There may be a benefit in having counters, I don't know the answer to
that, though the point would be to make sure that there is a specific
set of workloads where it makes sense, still my gut feeling is that
sampling would be able to detect those anyway.

(I am not a computer scientist by default but a physicist, think fluid
dynamics and turbulence, and I had my load of random events and signal
analysis as well. All that is just statistics with attempts to approach
reality, where sampling is a life-saver over exactitude of
measurements.)

Adding hooks is not acceptable to me either, those have a cost, and it
is not clear what's the benefit we can get into putting hooks in such a
place for cases other than sampling and counters...
--
Michael

#14 Bertrand DROUVOT
bdrouvot@gmail.com
In reply to: Michael Paquier (#13)
Re: [Proposal] Add accumulated statistics for wait event

Hello Guys,

As you mentioned Oracle-like active session history sampling in this
thread, I just want to let you know that I am working on a brand new
extension to provide this feature.

You can find the extension here: https://github.com/pgsentinel/pgsentinel

Basically, you can see it as sampling of pg_stat_activity (one-second
interval by default), currently providing more information:

ash_time: the sampling time
blockers: the number of blockers
blockerpid: the pid of the blocker (if blockers = 1), the pid of
one blocker (if blockers > 1)
top_level_query: the top level statement (in case PL/pgSQL is used)
query: the statement being executed (not normalised, as it is in
pg_stat_statements, means you see the values)
cmdtype: the statement type
(SELECT,UPDATE,INSERT,DELETE,UTILITY,UNKNOWN,NOTHING)
queryid: the queryid of the statement (the one coming from
pg_stat_statements)

Thanks to the queryid field you are able to link the session activity with
the sql activity.

It's implemented as an in-memory ring buffer where samples are written at a
given (configurable) period.
Therefore, the user can see some number of recent samples depending on the
history size (configurable).

Current caveats: in case of a high query rate per pid, you could see (I saw
it at more than 1000 queries per second) top_level_query and query not
"correlated" (query, queryid and cmdtype are still well linked together).
This is due to the fact that those two pieces of information are currently
collected independently.

If you want to have a look, give your thoughts, you are welcome.

Bertrand


#15 Phil Florent
philflorent@hotmail.com
In reply to: Michael Paquier (#13)
RE: [Proposal] Add accumulated statistics for wait event

Hi,

I agree with that. PostgreSQL 10 is really great: tuning tools based on sampling of pg_stat_activity became accurate without any modification.

Best regards

Phil


#16 Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Michael Paquier (#13)
RE: [Proposal] Add accumulated statistics for wait event

On Thu, July 26, 2018 at 1:25 AM, Michael Paquier wrote:

Even if you have spiky workloads, sampling may miss those, but even with adding counters for each event
you would need to query the table holding the counters at an insane frequency to be able to perhaps get
something out of it as you need to do sampling of the counters as well to extract deltas.

Hi, I was wondering why PostgreSQL does not have the number of wait events and the wait time that other databases such as Oracle provide, and while looking for related threads I arrived at this one.

I am a DB beginner, so please tell me. It is said that you can find the events that are bottlenecks by sampling, but as noted above, you cannot find events shorter than the sampling interval, right?
If such a short event occurred quite a number of times and added up to a considerable amount of time in total, could sampling solve this problem?
# I have asked, but I could not follow much of the discussion above, and I do not know whether such a case can exist.
Also, I think this could be addressed by raising the sampling frequency, but then the load would be high, right? So that method does not seem very practical.

Moreover, I think this is not implemented because sampling is considered good enough, as described above, and because it affects performance.
How about making the feature switchable on/off, with the default off?
Do we really care about performance degradation during a bottleneck investigation?
When investigating a bottleneck, I think it is better to find the cause even at the expense of performance.
# If it can be detected with high-frequency sampling, then I think it depends on whether the sampling or the feature (counting wait events and wait time) has the higher load.

Since I am a DB beginner, I may be saying strange things.
I would be glad if you could tell me.

-----
Naoki Yotsunaga

#17 Phil Florent
philflorent@hotmail.com
In reply to: Yotsunaga, Naoki (#16)
RE: [Proposal] Add accumulated statistics for wait event

Hi,
I am a DB beginner, so please tell me. It is said that you can find the events that are bottlenecks by sampling, but as noted above, you cannot find events shorter than the sampling interval, right?

If an event occurs frequently and if it is reported in pg_stat_activity, you will catch it again and again while sampling, no matter its duration.
Hence you just need to

* Sample the sessions and consider the active ones. You need to know if they are waiting (PostgreSQL now provides detailed wait events) or if they are on the CPU
* Optionally collect information on the system context at the time of sampling (CPU, memory...); it can be provided by many tools, like the psutil Python library for example

If the client application itself provides information, it's even more interesting. With something like program/module/action/client_info/sofar/totalwork in application_name you are able to focus directly on different kinds of activity. It can give you information like "I/O waits are meaningful for my batch activity but not for my OLTP activity; if my goal is to improve response time for end users, I have to consider that."

Best regards
Phil


#18 Michael Paquier
michael@paquier.xyz
In reply to: Phil Florent (#17)
Re: [Proposal] Add accumulated statistics for wait event

On Thu, Oct 04, 2018 at 09:32:37AM +0000, Phil Florent wrote:

I am a DB beginner, so please tell me. It is said that you can find
the events that are bottlenecks by sampling, but as noted above, you
cannot find events shorter than the sampling interval, right?

Yes, which is why it would be as simple as making the interval shorter,
though not too short, so that it does not bloat the amount of information
fetched, which needs to be stored and afterwards (perhaps) processed for
analysis. This gets rather close to signal processing. A simple image is,
for example: assuming that event A happens 100 times in an interval of 1s,
and event B only once in the same interval of 1s, then if the snapshot
interval is only 1s, in the worst case A would be treated as an equal of
B, which would be wrong.
--
Michael

#19 Phil Florent
philflorent@hotmail.com
In reply to: Michael Paquier (#18)
RE: [Proposal] Add accumulated statistics for wait event

Hi,

It's the same logic as with any polling system. An integration calculation using the Monte Carlo method with only a few points won't be accurate enough and can even be completely wrong, etc.
Polling is OK to troubleshoot a problem on the fly, but 2 points are not enough. A few seconds are needed to obtain good enough data, e.g. 5-10 seconds of polling with a 0.1=>0.01s interval between 2 queries of the activity.
Polling for a few seconds while the user is waiting is normally enough to say whether a significant part of the waits are on the database. It's very important to know that. With 1 hour of accumulated statistics, a DBA will always see something to fix. But if the user waits 10 seconds on a particular screen and 1 second is spent on the database, it often won't directly help.
Polling gives great information with PostgreSQL 10, but it was already useful to catch top queries etc. in older versions.
I always check whether activity is adequately reported by my tool using known cases. I want to be sure it will report things adequately in real-world troubleshooting sessions. Sometimes there are bugs in my tool; once there was an issue with Postgres (pgstat_report_activity() was not called by workers during parallel index creation).

Best regards
Phil


#20 Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Phil Florent (#19)
RE: [Proposal] Add accumulated statistics for wait event

On Thu, Oct 4, 2018 at 0:54 AM, Phil Florent wrote:

Phil, Michael, I appreciate your polite comments.
I understand it as follows:
we can find the event if we shorten the sampling interval, but then a lot of information comes out.
# The balance is important.
Also, it is not good unless we have enough samples.
And there are various other things to take care of.
Is my understanding correct?

If my understanding is right, this seems difficult to me.
Is a DBA really able to solve bottlenecks with sampling?
# Since I am a beginner, I feel that way. Other people may not find it difficult.

What I would like to say is that if we had information on the number of wait events and the wait time (like other DBs), we could investigate more easily.
Of course, I understand that it also affects performance. So, I suggest a way to switch it on and off, with the default off.

-----
Naoki, Yotsunaga.

#21legrand legrand
legrand_legrand@hotmail.com
In reply to: Bertrand DROUVOT (#14)
Re: [Proposal] Add accumulated statistics for wait event

Bertrand DROUVOT wrote

Hello Guys,

As you mentioned Oracle like active session history sampling in this
thread, I just want to let you know that I am working on a brand new
extension to provide this feature.

You can find the extension here: https://github.com/pgsentinel/pgsentinel

[...]

If you want to have a look, give your thoughts, you are welcome.

Bertrand

+1!

just one regret: the ASH sampling interval cannot be smaller than a second ;o(

Regards
PAscal


#22Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Yotsunaga, Naoki (#20)
RE: [Proposal] Add accumulated statistics for wait event

On Thu, Oct 4, 2018 at 8:22 PM, Yotsunaga Naoki wrote:

Hi, I understood, and thought about your statistics comment once again. In the case of sampling, are there enough statistics to investigate?
In the case of a long SQL statement, I think it is possible to obtain a sufficient number of samples.

However, in the case of an SQL statement of about 1 minute, only 60 samples can be obtained at most (with a 1-second interval).
# Because of legrand's comment:
/messages/by-id/1539158356795-0.post@n3.nabble.com

Does this sample count of 60 give the information that I really want?
Might it not miss the real problem part?
---------------------------------------
Naoki, Yotsunaga.

#23legrand legrand
legrand_legrand@hotmail.com
In reply to: Yotsunaga, Naoki (#22)
RE: [Proposal] Add accumulated statistics for wait event

Hello,
You are right, sampling has to be "tuned" regarding the event(s) you want to
catch.

Sampling at a 1-second interval is good for treatments that take hours, but
not enough for analyses at the scale of a minute or a second.

May I invite you to try it, using PASH-viewer (github) with pgsentinel
(github).

Changing the pgsentinel.c sampling from 1 second

rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
               ash_sampling_period * 1000L, PG_WAIT_EXTENSION);

to 1/10 second
rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
               ash_sampling_period * 100L, PG_WAIT_EXTENSION);

seems like a good balance to me (for analysis periods from a few seconds to
minutes).

Regards
PAscal


#24Phil Florent
philflorent@hotmail.com
In reply to: Yotsunaga, Naoki (#22)
RE: [Proposal] Add accumulated statistics for wait event

Hi,

Is DBA really able to solve bottlenecks with sampling?

What I would like to say is that if we have information on the number of wait events and the wait time(like other DB), we can investigate more easily.

Yes, you will be able to solve bottlenecks with sampling. In interactive mode, a 1 s interval is probably too large. I use 0.1 s - 0.01 s with my tool and it is normally OK. In batch mode I use 1 s => 10 s. If you want to visualize the results, it's easy to use a dedicated tool, and bottlenecks will clearly appear.
Since Grafana is now able to connect directly to a PostgreSQL source, I use it to display the information collected from pg_stat_activity and psutil (e.g. https://pgphil.ovh/traqueur_dashboard_02.php ; the page is written in French but the panels are in English).

Other DBs have accumulated statistics, but you can notice that sampling is also their most modern method.
E.g. Oracle DB: 20 years ago you already had tools like "utlbstat/utlestat". Then you had "statspack". Those tools were based on accumulated statistics, and the reports were based on differences between 2 points. That was useful to solve major problems, but it was limited and not precise enough in many cases.

The preferred feature to identify bottlenecks in the Oracle world is now ASH (active session history). It can help with major problems, specific problems AND it can identify short blockages.
Too bad it is licensed as an option of their Enterprise Edition but similar tools exist and they are also based on sampling of the activity.

With the "official" ASH, sampling and archiving are done internally and you have a circular memory zone dedicated to the feature. Hence the overhead is lower but that's all.

The most advanced interactive tool is called "snapper" and it is also based on sampling.

Best regards
Phil


#25Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Phil Florent (#24)
RE: [Proposal] Add accumulated statistics for wait event

On Mon, Oct 29, 2018 at 1:52 AM, Phil Florent wrote:

Hi, thank you for your comments.

Yes you will be able to solve bottlenecks with sampling. In interactive mode, a 1s interval is probably too large. I use 0s1 - 0s01 with my tool and it is normally OK.

With the tool you are using, can you sample at intervals shorter than 1 second?
If you can, you can get a sufficient number of samples and also catch short events.

Since grafana is now able to connect directly to a postgresql source, I use it to display the information collected from pg_stat_activity and psutil ( e.g https://pgphil.ovh/traqueur_dashboard_02.php page is written in french but panels are in english)

It is wonderful to visualize it,
especially for beginners like me.

Other DB have accumulated statistics but you can notice that sampling is also their most modern method.
E.g Oracle DB : 20 years ago you already had tools like "utlbstat/utlestat" . Then you had "statspack". Those tools were based on accumulated statistics and the reports were based on differences between 2 points. It was useful to solve major problems but it was limited and not precise enough in many cases.

The preferred feature to identify bottlenecks in the Oracle world is now ASH (active session history). It can help with major problems, specific problems AND it can identify short blockages.
Too bad it is licensed as an option of their Enterprise Edition but similar tools exist and they are also based on sampling of the activity.

With the "official" ASH, sampling and archiving are done internally and you have a circular memory zone dedicated to the feature. Hence the overhead is lower but that's all.

The most advanced interactive tool is called "snapper" and it is also based on sampling.

Thanks. I will check it.

So the current bottleneck investigation method is: from sampling, I can know the number (ratio) of wait events,
and then investigate those with a high count (ratio) first.
Do you agree with this understanding?

---------------------------------------

Naoki, Yotsunaga.

#26Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Michael Paquier (#2)
RE: [Proposal] Add accumulated statistics

On Sat, Nov 3, 2018 at 1:28 AM, Phil Florent wrote:

2) it consumes system resources

While the system is running, you are always sampling system information, aren't you? Like Oracle ASH.
If so, does sampling have no significant impact on performance, even if the interval is 0.01 s or more?

The main interest of sampling is to discard negligible activity to allow the DBA to work on meaningful queries and events.

In other words, do you mean narrowing down the candidate problems?

---------------------------
Naoki Yotsunaga

#27Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Yotsunaga, Naoki (#26)
RE: [Proposal] Add accumulated statistics

On Mon, Nov 5, 2018 at 4:26 PM, Naoki Yotsunaga wrote:

2) it consumes system resources

While the system is running, you are always sampling system information, do not you? Like Oracle ASH.

I don't understand well how sampling is used.
In which scenario do you use sampling? Is it both of the following? Or a different one?

A) Perform sampling in order to obtain information on the entire DB while the DB is in operation.

B) I roughly know which processing is slow. When investigating the reason for the delay, sampling is performed on that processing.

---------------------------
Naoki Yotsunaga

#28Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: legrand legrand (#23)
RE: [Proposal] Add accumulated statistics for wait event

On Sun, Oct 28, 2018 at 6:39 PM, legrand legrand wrote:

Hi, thanks for your comments.
I had overlooked your reply.

You are right, sampling has to be "tuned" regarding the event(s) you want to catch.

I see. So for tuning, you need to know the length of the processing you want to sample?

May I invite you to try it, using PASH-viewer (github) with pgsentinel (github).

I'll check and try it.

------------------
Naoki Yotsunaga

#29Bruce Momjian
bruce@momjian.us
In reply to: Yotsunaga, Naoki (#26)
Re: [Proposal] Add accumulated statistics

On Tue, Nov 6, 2018 at 04:26:03AM +0000, Yotsunaga, Naoki wrote:

On Sat, Nov 3, 2018 at 1:28 AM, Phil Florent wrote:

2) it consumes system resources

While the system is running, you are always sampling system information, do not
you? Like Oracle ASH.

If so, does sampling have no significant impact on performance? Even if the
interval is 0.01 s or more.

I am replying late, but one of the reasons that sampling is used is that
decreasing the sampling interval increases the overhead of the sampling
process, but doesn't affect the running backends.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ As you are, so once was I.  As I am, so you will be. +
+                      Ancient Roman grave inscription +
#30Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Bruce Momjian (#29)
RE: [Proposal] Add accumulated statistics

On Wed, Nov 21, 2018 at 9:27 PM, Bruce Momjian wrote:

Hi, thank you for the information.
I understood that sampling is effective for the investigation of wait events.

By the way, you can see the number of wait events with "LWLOCK_STATS", right?
Is this feature implemented because it is necessary to know the number of wait events for investigation?
If so, isn't the number of wait events useful information?
Currently I need to rebuild PostgreSQL to get this information, which I find inconvenient.

So, how about making the number of wait events visible in a view?
Also, I think it would be useful to know the wait time.
I think it is easier to investigate when it is clearly known how much of the processing time is occupied by waiting.

--
Naoki Yotsunaga

#31Tsunakawa, Takayuki
tsunakawa.takay@jp.fujitsu.com
In reply to: Yotsunaga, Naoki (#30)
RE: [Proposal] Add accumulated statistics

From: Yotsunaga, Naoki [mailto:yotsunaga.naoki@jp.fujitsu.com]

By the way, you can see the number of wait events with "LWLOCK_STATS", right?
Is this function implemented because it is necessary to know the number
of waiting events for investigation?
If so, is not that the number of wait events is useful information?
Now, I need to rebuild to get this information and I feel inconvenience.

So, how about checking the number of wait events in the view?
Also, I think that it will be useful if you know the waiting time.
I think that it is easy to investigate when it is clearly known how long
waiting time is occupied with processing time.

That's interesting. It should be convenient for PG developers to be able to see wait events of each session and the whole instance to improve PostgreSQL. I developed a flight recorder feature for a proprietary database (not based on PostgreSQL), which recorded various events, including waits, in the ring buffer for each session. It could dump the ring buffers into external files in CSV format, so that we can load them into a database or a spreadsheet to analyze the time model of SQL execution. That helped us a lot to eliminate many bottlenecks. MySQL says a similar thing about its Performance Schema:

https://dev.mysql.com/doc/refman/8.0/en/performance-schema-examples.html

"Tune the code (this applies to storage engine or server developers only)."

Regards
Takayuki Tsunakawa

#32Tsunakawa, Takayuki
tsunakawa.takay@jp.fujitsu.com
In reply to: Yotsunaga, Naoki (#30)
RE: [Proposal] Add accumulated statistics

Hi all,

I think sampling like Oracle ASH should work for the DBA to find probable bottlenecks in many cases (so I hope PostgreSQL will incorporate it...). On the other hand, it seems to have the following disadvantages, some of which others have already pointed out:

1. Doesn't provide precise data
Sampling could miss intermittent short waits, e.g., buffer content lock waits during checkpoints. This might make it difficult or impossible to solve transient performance problems, such as infrequent 100 millisecond response times while the normal response time is a few milliseconds.
The proposed wait event collection doesn't miss anything.

2. Overuses resources
We may be able to shorten the sampling interval to 10 ms or even 1 ms to detect short periods of problems. However, the sampled data of active sessions become voluminous in memory and storage. It would take longer to analyze those samples. Also, the background sampling process prevents the CPU core from becoming idle to save power, which bgwriter and walwriter try to avoid by hibernation.
The proposed wait event collection just records what actually happened. No waste. Would it use many resources if waits happen frequently? That leads to our motivation to reduce waits.

3. Cannot determine the impact or severity of waits just by sampling or counting without time
As the following MySQL and Oracle manual articles describe, precise measurement of wait count and time helps to judge the impact and justify the remedy. They can measure the whole SQL execution and its various processing steps (parse, plan, sort, etc.) as well as waits, so that the most significant areas can be determined.
Also, sampling cannot tell whether a single wait took long or the same wait occurred repeatedly in succession (at least not easily). Do the sampled waits indicate an abnormal I/O (which took 2 ms while the normal time is 50 us)?

[MySQL]

Chapter 26 MySQL Performance Schema
https://dev.mysql.com/doc/refman/8.0/en/performance-schema.html
--------------------------------------------------
The Performance Schema monitors server events. An “event” is anything the server does that takes time and has been instrumented so that timing information can be collected. In general, an event could be a function call, a wait for the operating system, a stage of an SQL statement execution such as parsing or sorting, or an entire statement or group of statements. Event collection provides access to information about synchronization calls (such as for mutexes) file and table I/O, table locks, and so forth for the server and for several storage engines.

Current events are available, as well as event histories and summaries. This enables you to determine how many times instrumented activities were performed and how much time they took. Event information is available to show the activities of specific threads, or activity associated with particular objects such as a mutex or file.
--------------------------------------------------

[Oracle]

https://docs.oracle.com/en/database/oracle/oracle-database/18/tdppt/automatic-database-performance-monitoring.html#GUID-32E92AEC-AF1A-4602-B998-3250920CD3BE
--------------------------------------------------
The goal of database performance tuning is to reduce the DB time of the system for a given workload. By reducing DB time, the database can support more user requests by using the same or fewer resources. ADDM reports system resources that are using a significant portion of DB time as problem areas and sorts them in descending order by the amount of related DB time spent.
--------------------------------------------------

Instance Tuning Using Performance Views
https://docs.oracle.com/en/database/oracle/oracle-database/18/tgdba/instance-tuning-using-performance-views.html#GUID-07982549-507F-4465-8843-7F753BCF8F99
--------------------------------------------------
Wait event statistics include the number of times an event was waited for and the time waited for the event to complete. If the initialization parameter TIMED_STATISTICS is set to true, then you can also see how long each resource was waited for.
To minimize user response time, reduce the time spent by server processes waiting for event completion. Not all wait events have the same wait time. Therefore, it is more important to examine events with the most total time waited rather than wait events with a high number of occurrences. Usually, it is best to set the dynamic parameter TIMED_STATISTICS to true at least while monitoring performance.
--------------------------------------------------

https://docs.oracle.com/en/database/oracle/oracle-database/18/tgdba/measuring-database-performance.html#GUID-811E9E65-C64A-4028-A90E-102BBFF6E68F
5.2.3 Using Wait Events without Timed Statistics
--------------------------------------------------
If TIMED_STATISTICS is set to FALSE, then the amount of time spent waiting for an event is not available. Therefore, it is only possible to order wait events by the number of times each event was waited for. Although the events with the largest number of waits might indicate a potential bottleneck, they might not be the main bottleneck. This situation can happen when an event is waited for a large number of times, but the total time waited for that event is small. Conversely, an event with fewer waits might be a bigger bottleneck if the wait time accounts for a significant proportion of the total wait time. Without the wait times to use for comparison, it is difficult to determine whether a wait event is worth investigating.
--------------------------------------------------

10.2.2 Using Wait Event Statistics to Drill Down to Bottlenecks
--------------------------------------------------
The most effective way to use wait event data is to order the events by the wait time. This is only possible if TIMED_STATISTICS is set to true. Otherwise, the wait events can only be ranked by the number of times waited, which is often not the ordering that best represents the problem.

To get an indication of where time is spent, follow these steps:

1. Examine the data collection for V$SYSTEM_EVENT. The events of interest should be ranked by wait time.
Identify the wait events that have the most significant percentage of wait time. ...
Alternatively, look at the Top 5 Timed Events section at the beginning of the Automatic Workload Repository report. This section automatically orders the wait events (omitting idle events), and calculates the relative percentage:

Top 5 Timed Events
~~~~~~~~~~~~~~~~~~                                                  % Total
Event                                         Waits     Time (s)  Call Time
-------------------------------------- ------------ ----------- ---------
CPU time                                                     559      88.80
log file parallel write                       2,181           28       4.42
SQL*Net more data from client               516,611           27       4.24
db file parallel write                       13,383           13       2.04
db file sequential read                         563            2        .27

2. Look at the number of waits for these events, and the average wait time. For example, for I/O related events, the average time might help identify whether the I/O system is slow. The following example of this data is taken from the Wait Event section of the AWR report:

                                                              Avg
                                                  Total Wait   wait    Waits
Event                           Waits  Timeouts     Time (s)   (ms)     /txn
--------------------------- --------- --------- ---------- ------ ---------
log file parallel write         2,181         0         28     13      41.2
SQL*Net more data from clie   516,611         0         27      0   9,747.4
db file parallel write         13,383         0         13      1     252.5

3. The top wait events identify the next places to investigate. A table of common wait events is listed in Table 10-1. It is usually a good idea to also have quick look at high-load SQL.

4. Examine the related data indicated by the wait events to see what other information this data provides. Determine whether this information is consistent with the wait event data. In most situations, there is enough data to begin developing a theory about the potential causes of the performance bottleneck.

5. To determine whether this theory is valid, cross-check data you have examined with other statistics available for consistency. The appropriate statistics vary depending on the problem, but usually include load profile-related data in V$SYSSTAT, operating system statistics, and so on. Perform cross-checks with other data to confirm or refute the developing theory.
--------------------------------------------------

So, why don't we have the proposed wait event count/time data? I hope we can nurture this to become a database profiling tool like MySQL and Oracle. This is the first step. I think it would be useful to have both sampling and precise statistics. Oracle has both, and MySQL has the latter (I don't know why MySQL doesn't provide sampling, because the Performance Schema should probably have been developed after Oracle's ASH.)

What would make us conservative about doing this? Skimming the old thread, the remaining concern is the timer overhead. As the following article suggests, some lightweight timers seem to be available. We can turn the timing off by default if they aren't light enough.

Performance Schema Timers
https://dev.mysql.com/doc/refman/8.0/en/performance-schema-timing.html

Regards
Takayuki Tsunakawa

#33Adrien NAYRAT
adrien.nayrat@anayrat.info
In reply to: Tsunakawa, Takayuki (#32)
Re: [Proposal] Add accumulated statistics

On 1/7/19 6:34 AM, Tsunakawa, Takayuki wrote:

1. Doesn't provide precise data
Sampling could miss intermittent short waits, e.g., buffer content lock waits during checkpoints. This might make it difficult or impossible to solve transient performance problems, such as infrequent 100 millisecond response times while the normal response time is a few milliseconds.
The proposed wait event collection doesn't miss anything.

2. Overuses resources
We may be able to shorten the sampling interval to 10 ms or even 1 ms to detect short periods of problems. However, the sampled data of active sessions become voluminous in memory and storage. It would take longer to analyze those samples. Also, the background sampling process prevents the CPU core from becoming idle to save power, which bgwriter and walwriter tries to avoid by hibernation.
The proposed wait event collection just records what actually happened. No waste. Would it use many resources if waits happen frequently? That leads to our motivation to reduce waits.

FYI, wait events have been added in PoWA by using the pg_wait_sampling
extension:
https://rjuju.github.io/postgresql/2018/07/09/wait-events-support-for-powa.html

pg_wait_sampling samples the wait events in shared memory and PoWA stores
them.

#34Tsunakawa, Takayuki
tsunakawa.takay@jp.fujitsu.com
In reply to: Adrien NAYRAT (#33)
RE: [Proposal] Add accumulated statistics

From: Adrien NAYRAT [mailto:adrien.nayrat@anayrat.info]

FIY, wait events have been added in PoWA by using pg_wait_sampling
extension :
https://rjuju.github.io/postgresql/2018/07/09/wait-events-support-for-
powa.html

pg_wait_sampling sample the wait events in shared memory and PoWA store
them.

Great. Also FYI, Amazon RDS/Aurora Performance Insights already provides a load profiling feature based on the wait events collected from pg_stat_activity, which samples once a second. An intuitive 5 minute video for introduction is here:

Using Amazon RDS Performance Insights
https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PerfInsights.html

Although I cannot see the graphics as I'm almost blind, the explanation sounds like it mimics Oracle Enterprise Manager. Cool.

Aren't you thinking of incorporating your work into PostgreSQL as a contrib like pg_stat_statements?

Regards
Takayuki Tsunakawa

#35Robert Haas
robertmhaas@gmail.com
In reply to: Yotsunaga, Naoki (#30)
Re: [Proposal] Add accumulated statistics

On Thu, Dec 20, 2018 at 8:48 PM Yotsunaga, Naoki
<yotsunaga.naoki@jp.fujitsu.com> wrote:

If so, is not that the number of wait events is useful information?

My theory is that the number of wait events is NOT useful information,
or at least not nearly as useful as the results of a sampling approach.
The data that LWLOCK_STATS produce are downright misleading -- they
lead you to think that the bottlenecks are in different places than
they really are, because the locks that produce the most waiting can
be 5th or 10th in terms of the number of wait events.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#36Yotsunaga, Naoki
yotsunaga.naoki@jp.fujitsu.com
In reply to: Robert Haas (#35)
RE: [Proposal] Add accumulated statistics

On Thu, Jan 10, 2019 at 8:42 PM, Robert Haas wrote:

Thanks for comments.

or at least not nearly as useful the results of a sampling approach.

I agree with your opinion.
Because it can't be asserted that a wait event is a bottleneck just because the number of wait events is large.
The same thing is mentioned in the Oracle documentation.
It also suggests that it is important to acquire the wait time for that purpose.
----
https://docs.oracle.com/en/database/oracle/oracle-database/18/tgdba/measuring-database-performance.html#GUID-811E9E65-C64A-4028-A90E-102BBFF6E68F
5.2.3 Using Wait Events without Timed Statistics
----

The data that LWLOCK_STATS produce are downright misleading

Is that so?
Then I do not understand the need for that feature.

---
Naoki Yotsunaga

#37Tsunakawa, Takayuki
tsunakawa.takay@jp.fujitsu.com
In reply to: Robert Haas (#35)
RE: [Proposal] Add accumulated statistics

From: Robert Haas [mailto:robertmhaas@gmail.com]

My theory is that the number of wait events is NOT useful information,
or at least not nearly as useful the results of a sampling approach.
The data that LWLOCK_STATS produce are downright misleading -- they
lead you to think that the bottlenecks are in different places than
they really are, because the locks that produce the most waiting can
be 5th or 10th in terms of the number of wait events.

I understood you're saying that the number of waits alone does not necessarily indicate the bottleneck, because a wait with fewer counts but longer time can take a large portion of the entire SQL execution time. So, wait time is also useful. I think that's why Oracle describes and MySQL provides precise count and time without sampling.

Hasn't LWLOCK_STATS been helpful for PG developers? IIRC, it was used to pinpoint bottlenecks and evaluate patches to improve shared buffers, WAL buffers, ProcArray, etc.

Regards
Takayuki Tsunakawa

#38Pavel Stehule
pavel.stehule@gmail.com
In reply to: Tsunakawa, Takayuki (#37)
Re: [Proposal] Add accumulated statistics

On Fri, Jan 11, 2019 at 2:10 AM, Tsunakawa, Takayuki <
tsunakawa.takay@jp.fujitsu.com> wrote:

From: Robert Haas [mailto:robertmhaas@gmail.com]

My theory is that the number of wait events is NOT useful information,
or at least not nearly as useful the results of a sampling approach.
The data that LWLOCK_STATS produce are downright misleading -- they
lead you to think that the bottlenecks are in different places than
they really are, because the locks that produce the most waiting can
be 5th or 10th in terms of the number of wait events.

I understood you're saying that the number of waits alone does not
necessarily indicate the bottleneck, because a wait with fewer counts but
longer time can take a large portion of the entire SQL execution time. So,
wait time is also useful. I think that's why Oracle describes and MySQL
provides precise count and time without sampling.

the cumulated lock statistics maybe don't help with debugging, but they are a
very good indicator of database health (in production usage).

Regards

Pavel

#39Tsunakawa, Takayuki
tsunakawa.takay@jp.fujitsu.com
In reply to: Pavel Stehule (#38)
RE: [Proposal] Add accumulated statistics

From: Pavel Stehule [mailto:pavel.stehule@gmail.com]

the cumulated lock statistics maybe doesn't help with debugging - but it
is very good indicator of database (in production usage) health.

I think it will help with both. But I don't think the sampling will be as helpful as the precise lock statistics accumulation, because the sampling doesn't tell us exactly how effective our improvements to the PostgreSQL code are. I remember PG developers used LWLOCK_STATS to see how many lwlock waits (or what ratio of them) decreased by applying patches.

We can use the cumulated lock stats like:

1. SELECT * FROM pg_session_waits;
2. Run a benchmark.
3. SELECT * FROM pg_session_waits;
4. Calculate the difference between 1 and 3.

Or, reset the wait stats before the benchmark run and just use the stats as-is.

I'd like to know why you thought the cumulated wait stats aren't helpful for debugging.

Regards
Takayuki Tsunakawa

#40imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: Tsunakawa, Takayuki (#39)
RE: [Proposal] Add accumulated statistics

Hi,

On Tue, Jan 15, 2019 at 1:14 AM, Tsunakawa, Takayuki wrote:

[ ... absent for a long time ]

I read the discussions of this thread.

If we want wait event info, we can currently sample pg_stat_activity and get a pseudo total duration for each wait event.
(I understand wait event sampling does *not* represent the wait event count but the wait event duration.)

It is said that, in most cases, sampling is enough to find the cause of an issue. On the other hand, it is said that sampling misses wait events that don't spend much time in the process. Can we say we don't need such wait events, which seem apparently unimportant?

Also, although we can know the pseudo total duration from sampling, we can't know how many times each wait event occurred or how long it takes on average each time. I think it would be better to also know the average wait event time to investigate the cause of an issue.

For example, the counts, total duration, and average time of each wait event after executing pgbench, obtained from a server with my patch applied, are like below.

transaction type: <builtin: TPC-B (sort of)>
scaling factor: 100
query mode: simple
number of clients: 8
number of threads: 8
duration: 60 s
number of transactions actually processed: 132259
latency average = 3.630 ms
tps = 2203.909727 (including connections establishing)
tps = 2204.248691 (excluding connections establishing)

wait_event_type | wait_event | calls | times | times/calls
-----------------+----------------------+--------+-----------+------------
Client | ClientRead | 925732 | 115270950 | 124.52
IO | DataFileRead | 220357 | 3219893 | 14.61
LWLock | WALWriteLock | 195560 | 192684987 | 985.30
IO | DataFileWrite | 156208 | 4440091 | 28.42
IO | WALWrite | 51035 | 2925808 | 57.33
IO | WALSync | 50812 | 49626240 | 976.66
Lock | transactionid | 6179 | 9905727 | 1603.13
LWLock | buffer_content | 5453 | 295123 | 54.12
IO | DataFileExtend | 2367 | 90430 | 38.20
LWLock | wal_insert | 2053 | 33620 | 16.38
LWLock | WALBufMappingLock | 1301 | 23761 | 18.26
LWLock | CLogControlLock | 514 | 3959 | 7.70
LWLock | buffer_mapping | 162 | 1313 | 8.10
LWLock | XidGenLock | 153 | 2399 | 15.68
LWLock | lock_manager | 122 | 5051 | 41.40
Lock | tuple | 105 | 114632 | 1091.73
LWLock | ProcArrayLock | 99 | 1828 | 18.46
LWLock | CheckpointerCommLock | 33 | 854 | 25.88
Lock | extend | 24 | 20493 | 853.88
IO | RelationMapRead | 22 | 71 | 3.23
LWLock | buffer_io | 11 | 7398 | 672.55
IO | SLRUWrite | 4 | 249 | 62.25
IO | SLRUSync | 4 | 0 | 0.00
IPC | ClogGroupUpdate | 3 | 223 | 74.33
LWLock | SyncRepLock | 1 | 1 | 1.00

We can see that WALWriteLock has the longest total duration (times), and ClientRead is the next one. We might also get this information from sampling. We can also see that WALWriteLock's long total duration comes from its high average time, while ClientRead's total duration comes from its many calls. We can also see that transactionid, tuple, WALWriteLock, WALSync, and extend have longer average times than the others. If we want to improve the performance even just a bit, such info is useful, I think.

The patch which was proposed in this thread was not accepted because it puts overhead on the server/backend.
The functions that measure the wait event time, gettimeofday and clock_gettime, take time to execute, and they are on a frequently called path (pgstat_report_wait_start/end), so it is supposed that a lot of overhead will be introduced.
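As a minimal stand-alone sketch (just an illustration here, not part of the posted patch), something like the following can estimate what one clock_gettime() "wait start"/"wait end" pair costs on a given machine; the iteration count and output format are arbitrary choices for the example.

/*
 * timing_cost.c - rough estimate of the per-wait-event overhead that a
 * pair of clock_gettime() calls would add on the reporting path.
 * Build with: cc -O2 timing_cost.c -o timing_cost
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

int
main(void)
{
    const long  iterations = 10000000;  /* 10 million simulated wait events */
    struct timespec start, stop, t1, t2;
    uint64_t    sink = 0;
    double      elapsed_ns;
    long        i;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (i = 0; i < iterations; i++)
    {
        /* one timestamp at wait start, one at wait end */
        clock_gettime(CLOCK_MONOTONIC, &t1);
        clock_gettime(CLOCK_MONOTONIC, &t2);
        /* consume the results so the calls are not optimized away */
        sink += (uint64_t) (t2.tv_nsec - t1.tv_nsec);
    }
    clock_gettime(CLOCK_MONOTONIC, &stop);

    elapsed_ns = (stop.tv_sec - start.tv_sec) * 1e9 +
                 (stop.tv_nsec - start.tv_nsec);
    printf("avg cost per start/end pair: %.1f ns (sink=%llu)\n",
           elapsed_ns / iterations, (unsigned long long) sink);
    return 0;
}

On typical Linux systems clock_gettime(CLOCK_MONOTONIC) goes through the vDSO, so a pair usually costs tens of nanoseconds, but on a path executed for every single wait event that can still add up, which is exactly the concern here.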

Even if we only count the wait events, there would be overhead, which is pointed out in another thread [1].

The overhead induced by getting wait event info has been discussed since old times, but I couldn't find actual measurement results, so I want to measure the overhead.

[1]: /messages/by-id/CA+Tgmobf1NJD+_DfQG5qccG5YFSnxk3CgC2mh0-UHabznCQtYA@mail.gmail.com

--
Yoshikazu Imai

#41imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: imai.yoshikazu@fujitsu.com (#40)
2 attachment(s)
RE: [Proposal] Add accumulated statistics

On Wed, Oct 30, 2019 at 5:51 AM, imai.yoshikazu@fujitsu.com wrote:

The overhead which is induced by getting wait event info was discussed from old times, but I couldn't find the actual
measuring results, so I want to measure its overhead.

And here is the patch which counts the wait events and measures the wait event time. It is currently more of a POC and has several things to be improved.

You can get wait event info by executing "select * from pg_stat_waitaccum;" and reset its counters by "select pg_stat_reset_shared('waitaccum');".

I tried to reduce the overhead of counting the wait events. Since it is difficult to reduce the overhead of measuring wait event times, I made time measurement configurable, like track_io_timing.

Other DBs use a lighter function than gettimeofday or clock_gettime: the CPU cycle counter, rdtsc.
So I also created a patch which uses rdtsc for measuring wait event times.
There have been some investigations of rdtsc before[1]. If we want to use rdtsc, we need more investigation into which platforms can use it, how to prevent time from going backwards with rdtsc, and so on.
Here, I just wanted to see its overhead, so I didn't take care of such things in this patch.
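For reference, a minimal sketch of what reading the cycle counter looks like on x86-64 with GCC/Clang is below. It is only an illustration of the general technique, not the attached patch, and it deliberately ignores the issues mentioned above (invariant-TSC detection, per-core skew, instruction reordering) that a real implementation would have to handle.

#include <stdint.h>
#include <stdio.h>

/* Read the CPU timestamp counter (x86-64, GCC/Clang inline assembly). */
static inline uint64_t
read_tsc(void)
{
    uint32_t    lo, hi;

    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t) hi << 32) | lo;
}

int
main(void)
{
    uint64_t    before = read_tsc();
    uint64_t    after = read_tsc();

    /* rdtsc returns raw cycles; converting to time needs the TSC frequency. */
    printf("back-to-back rdtsc delta: %llu cycles\n",
           (unsigned long long) (after - before));
    return 0;
}

A single rdtsc read is typically cheaper than a clock_gettime() call, which is why it is attractive here despite the portability caveats.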

I measured its performance with pgbench on a VM which has a 4-core CPU and 8 GB of memory.
I tested against below four cases.

master: master (2fc2a88)

only counts: 0001 patched, only counts wait events
(with track_wait_timing is off)
counts/time: 0001 patched, counts wait event and measure its time by gettimeofday or clock_gettime
(with track_wait_timing is on)
counts/time(rdtsc): 0001 + 0002 patched, counts wait event and measure its time by rdtsc
(with track_wait_timing is on)

I executed the pgbench scripts below, initializing the database with scale 1 or 100 and executing pgbench in standard mode or select-only mode.

[standard mode]
for i in `seq 1 15`
do
pgbench -i -s (1 or 100) -q
pgbench -c 8 -j 8 -n -T 60
done

[select only mode]
pgbench -i -s (1 or 100) -q
for i in `seq 1 10`
do
pgbench -c 8 -j 8 -n -S -T 60
done

The result was below.

[standard, scale 1]
version | TPS | diff(%)
master | 813.82 | 0
only counts | 797.48 | 2.01
counts/time | 833.16 | -2.38
counts/time(rdtsc) | 876.29 | -7.68

[standard, scale 100]
version | TPS | diff(%)
master | 2170.34 | 0
only counts | 2125.37 | 2.07
counts/time | 2147.8 | 1.04
counts/time(rdtsc) | 2187.37 | -0.785

[select-only, scale 1]
version | TPS | diff(%)
master | 28487.6 | 0
only counts | 28481.1 | 0.023
counts/time | 28364.7 | 0.431
counts/time(rdtsc) | 28462.6 | 0.088

[select-only, scale 100]
version | TPS | diff(%)
master | 25767.89 | 0
only counts | 26068.65 | -1.167
counts/time | 25567.69 | 0.777
counts/time(rdtsc) | 25525.26 | 0.942

An example of wait event info after executing pgbench was below.

[standard, scale 100]
number of transactions actually processed: 129844
latency average = 3.697 ms
tps = 2163.667427 (including connections establishing)
tps = 2163.918398 (excluding connections establishing)

wait_event_type | wait_event | calls | times
-----------------+----------------------+--------+-----------
Client | ClientRead | 908807 | 114473878
IO | DataFileRead | 216025 | 2867211
LWLock | WALWriteLock | 191977 | 195192237
IO | DataFileWrite | 154540 | 3406232
IO | WALWrite | 49932 | 2728543
IO | WALSync | 49737 | 49649308
Lock | transactionid | 6209 | 8999545
LWLock | buffer_content | 5337 | 288951
IO | DataFileExtend | 2346 | 90375
LWLock | wal_insert | 2013 | 25141
LWLock | WALBufMappingLock | 630 | 14680
LWLock | CLogControlLock | 454 | 2414
LWLock | buffer_mapping | 170 | 852
LWLock | XidGenLock | 146 | 3268
LWLock | lock_manager | 141 | 5209
Lock | tuple | 112 | 120163
LWLock | ProcArrayLock | 97 | 495
Lock | extend | 42 | 26875
IO | RelationMapRead | 22 | 57
LWLock | CheckpointerCommLock | 18 | 1217
IO | DataFilePrefetch | 18 | 24
IPC | ClogGroupUpdate | 9 | 1867
LWLock | SyncRepLock | 3 | 116
IO | DataFileTruncate | 1 | 116

[select-only, scale 1]
number of transactions actually processed: 1682642
latency average = 0.285 ms
tps = 28043.407989 (including connections establishing)
tps = 28048.158085 (excluding connections establishing)
wait_event_type | wait_event | calls | times
-----------------+-----------------+---------+-----------
Client | ClientRead | 1682661 | 287999638
IO | RelationMapRead | 22 | 54
LWLock | lock_manager | 2 | 2087

[select-only, scale 100]
number of transactions actually processed: 1513536
latency average = 0.317 ms
tps = 25223.558569 (including connections establishing)
tps = 25228.820644 (excluding connections establishing)
wait_event_type | wait_event | calls | times
-----------------+-----------------+---------+-----------
IO | DataFileRead | 2524682 | 14579531
Client | ClientRead | 1513558 | 283968554
LWLock | buffer_mapping | 1225 | 6392
IO | RelationMapRead | 22 | 46
LWLock | buffer_io | 11 | 876
LWLock | lock_manager | 6 | 507

I wanted to measure and compare the performance where wait events occur many times, but unfortunately TPS was not stable and differed in each standard-mode test (where wait events occur many times), so I couldn't get a consistent performance tendency between the versions.

I need to find a more suitable test for clarifying its performance; if there are any good tests, please let me know. Also, any tests are very welcome.

--
Yoshikazu Imai

Attachments:

0001-Adding-the-pg_stat_waitaccum-view-which-shows-counts-v1.patchapplication/octet-stream; name=0001-Adding-the-pg_stat_waitaccum-view-which-shows-counts-v1.patchDownload
From 758cdc0d52550e6930470a8cf6936269b714539c Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Wed, 30 Oct 2019 05:36:50 +0000
Subject: [PATCH 1/2] Adding the pg_stat_waitaccum view which shows counts and
 duration of each wait event. Each backend/background process counts and
 measures the time of wait events in every pgstat_report_wait_start and
 pgstat_report_wait_end. They store that info in local variables and
 send it to the Statistics Collector, from which we can retrieve it.

For reducing overhead, I implemented a static hash instead of
a dynamic hash. I also implemented track_wait_timing, which
determines whether wait event duration is collected or not.

On Windows, this feature might not work correctly, because
it currently initializes local variables in pg_stat_init, which is not
passed to forked processes on Windows.
---
 src/backend/catalog/system_views.sql          |   8 +
 src/backend/postmaster/pgstat.c               | 344 ++++++++++++++++++++++++++
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/adt/pgstatfuncs.c           |  79 ++++++
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          | 123 ++++++++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/test/regress/expected/rules.out           |   5 +
 11 files changed, 597 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 9fe4a47..1ca0bbe 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -946,6 +946,14 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
+CREATE VIEW pg_stat_waitaccum AS
+    SELECT
+		S.wait_event_type AS wait_event_type,
+		S.wait_event AS wait_event,
+		S.calls AS calls,
+		S.times AS times
+	FROM pg_stat_get_waitaccum(NULL) AS S;
+
 CREATE VIEW pg_stat_progress_vacuum AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 011076c..5a07a81 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -124,6 +124,7 @@
  */
 bool		pgstat_track_activities = false;
 bool		pgstat_track_counts = false;
+bool		pgstat_track_wait_timing = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
 int			pgstat_track_activity_query_size = 1024;
 
@@ -154,6 +155,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -256,6 +261,7 @@ static int	localNumBackends = 0;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -285,6 +291,8 @@ static pid_t pgstat_forkexec(void);
 #endif
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
 static void pgstat_exit(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 static void pgstat_sighup_handler(SIGNAL_ARGS);
@@ -294,8 +302,11 @@ static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 												 Oid tableoid, bool create);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
+static void pgstat_write_waitaccum_statsfile(FILE *fpout);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
+static bool pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+											FILE *fpin, const char *statfile);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
@@ -331,6 +342,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
@@ -338,6 +350,27 @@ static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *current;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	current = hash->buckets[bucket];
+
+	while (current != NULL)
+	{
+		if (current->key == key)
+			return current->entry;
+
+		current = current->next;
+	}
+
+	return NULL;
+}
+
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -609,6 +642,8 @@ retry2:
 
 	pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+	pgstat_init_waitaccum_hash(&wa_hash);
+
 	return;
 
 startup_failed:
@@ -631,6 +666,75 @@ startup_failed:
 	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *prev;
+	WAEntry *new;
+	int bucket = key % WA_BUCKET_SIZE;
+	
+	prev = hash->buckets[bucket];
+
+	while (prev != NULL && prev->next != NULL)
+		prev = prev->next;
+	
+	new = &hash->entries[hash->entry_num++];
+	new->key = key;
+	new->entry = MemoryContextAllocZero(TopMemoryContext, (sizeof(PgStat_WaitAccumEntry)));
+
+	if (prev != NULL)
+		prev->next = new;
+	else
+		hash->buckets[bucket] = new;
+
+	return new->entry;
+}
+
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+
+	entry = pgstat_add_wa_entry(hash, wait_event_info);
+	entry->wait_event_info = wait_event_info;
+}
+
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+	uint32 i;
+	int last_tranche_id;
+
+	*hash = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+
+	last_tranche_id = LWLockGetLastTrancheId();
+	for (i = PG_WAIT_LWLOCK + 1; i <= (PG_WAIT_LWLOCK | last_tranche_id); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = (PG_WAIT_LOCK | LOCKTAG_RELATION); i <= (PG_WAIT_LOCK | LOCKTAG_LAST_TYPE); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_BUFFER_PIN; i <= PG_WAIT_BUFFER_PIN; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_ACTIVITY; i <= PG_WAIT_ACTIVITY_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_CLIENT; i <= PG_WAIT_CLIENT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	//do extension stuff
+
+	for (i = PG_WAIT_IPC; i <= PG_WAIT_IPC_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_TIMEOUT; i <= PG_WAIT_TIMEOUT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IO; i <= PG_WAIT_IO_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -911,6 +1015,9 @@ pgstat_report_stat(bool force)
 
 	/* Now, send function statistics */
 	pgstat_send_funcstats();
+
+	/* Send wait accumulative statistics */
+	pgstat_send_waitaccum();
 }
 
 /*
@@ -1341,6 +1448,8 @@ pgstat_reset_shared_counters(const char *target)
 		msg.m_resettarget = RESET_ARCHIVER;
 	else if (strcmp(target, "bgwriter") == 0)
 		msg.m_resettarget = RESET_BGWRITER;
+	else if (strcmp(target, "waitaccum") == 0)
+		msg.m_resettarget = RESET_WAITACCUM;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2625,6 +2734,22 @@ pgstat_fetch_global(void)
 	return &globalStats;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	a pointer to the wait accum statistics struct.
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+	backend_read_statsfile();
+
+	return &waitAccumStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -4414,6 +4539,53 @@ pgstat_send_bgwriter(void)
 	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *
+ * ----------
+ */
+void
+pgstat_send_waitaccum()
+{
+	PgStat_MsgWaitAccum msg;
+	PgStat_WaitAccumEntry *entry;
+	int i;
+
+	if (wa_hash == NULL)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+	msg.m_nentries = 0;
+
+	for (i = 0; i < wa_hash->entry_num; i++)
+	{
+		entry = wa_hash->entries[i].entry;
+
+		/* Send only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Prepare and send the message
+		 */
+		memcpy(&msg.m_entry[msg.m_nentries], entry, sizeof(PgStat_WaitAccumEntry));
+		if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+		{
+			pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+						msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+			msg.m_nentries = 0;
+		}
+
+		/* Clear wait events information. */
+		entry->calls = 0;
+		INSTR_TIME_SET_ZERO(entry->times);
+	}
+
+	if (msg.m_nentries > 0)
+		pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+					msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -4606,6 +4778,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
 					break;
 
+				case PGSTAT_MTYPE_WAITACCUM:
+					pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+					break;
+
 				case PGSTAT_MTYPE_FUNCSTAT:
 					pgstat_recv_funcstat(&msg.msg_funcstat, len);
 					break;
@@ -4901,6 +5077,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
+	pgstat_write_waitaccum_statsfile(fpout);
+
 	/*
 	 * Walk through the database table.
 	 */
@@ -5106,6 +5284,43 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 }
 
 /* ----------
+ * pgstat_write_waitaccum_statsfile() -
+ *		Write the waitAccumStats to the stat file.
+ *
+ * ----------
+ */
+static void
+pgstat_write_waitaccum_statsfile(FILE *fpout)
+{
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			rc;
+	int			i;
+
+	/*
+	 * Walk through the waitaccum hash.
+	 */
+	for (i = 0; i < hash->entry_num; i++)
+	{
+		entry = hash->entries[i].entry;
+
+		/* Write only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Write out the wait accum entry. Only entries for wait events
+		 * that have actually occurred are written.
+		 */
+		fputc('D', fpout);
+		rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
+	}	
+
+	fputc('E', fpout);
+}
+
+/* ----------
  * pgstat_read_statsfiles() -
  *
  *	Reads in some existing statistics collector files and returns the
@@ -5158,6 +5373,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	memset(&globalStats, 0, sizeof(globalStats));
 	memset(&archiverStats, 0, sizeof(archiverStats));
+	waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext, sizeof(WAHash));
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
@@ -5228,6 +5444,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 		goto done;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&waitAccumStats, fpin, statfile))
+		goto done;
+
 	/*
 	 * We found an existing collector stats file. Read it and put all the
 	 * hashtable entries into place.
@@ -5526,10 +5745,13 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 	PgStat_StatDBEntry dbentry;
 	PgStat_GlobalStats myGlobalStats;
 	PgStat_ArchiverStats myArchiverStats;
+	PgStat_WaitAccumStats myWaitAccumStats;
 	FILE	   *fpin;
 	int32		format_id;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
+	myWaitAccumStats.hash = MemoryContextAllocZero(CurrentMemoryContext, sizeof(WAHash));
+
 	/*
 	 * Try to open the stats file.  As above, anything but ENOENT is worthy of
 	 * complaining about.
@@ -5580,6 +5802,9 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 		return false;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&myWaitAccumStats, fpin, statfile))
+		return false;
+
 	/* By default, we're going to return the timestamp of the global file. */
 	*ts = myGlobalStats.stats_timestamp;
 
@@ -5633,6 +5858,75 @@ done:
 	return true;
 }
 
+/* ----------
+ * pgstat_read_waitaccum_statsfile() -
+ *
+ *	Reads the waitaccum stats from the file.
+ *	If an error happens when reading file, return false. Otherwise return true.
+ *
+ * ----------
+ */
+static bool
+pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+								FILE *fpin, const char *statfile)
+{
+	PgStat_WaitAccumEntry *entry;
+	PgStat_WaitAccumEntry buf;
+	WAHash *hash = stats->hash;
+
+	/*
+	 * Read and put all the hashtable entries into place.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
+				/*
+				 * 'D'  A PgStat_WaitAccumEntry struct describing a wait
+				 * event entry follows.
+				 */
+			case 'D':
+				if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+									 != sizeof(PgStat_WaitAccumEntry))
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				entry = pgstat_get_wa_entry(hash, buf.wait_event_info);
+
+				if (entry)
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				/*
+				 * Add to the wait accum hash
+				 */
+				entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+				memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+
+				break;
+
+			case 'E':
+				return true;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				return false;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -6142,7 +6436,20 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_WAITACCUM)
+	{
+		PgStat_WaitAccumEntry *entry;
+		WAHash *hash = waitAccumStats.hash;
+		int i;
+
+		for (i = 0; i < hash->entry_num; i++)
+		{
+			entry = hash->entries[i].entry;
 
+			entry->calls = 0;
+			INSTR_TIME_SET_ZERO(entry->times);
+		}
+	}
 	/*
 	 * Presumably the sender of this message validated the target, don't
 	 * complain here if it's not valid
@@ -6322,6 +6629,43 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *	Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+	PgStat_WaitAccumEntry *m_entry = &(msg->m_entry[0]);
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			i;
+
+	/*
+	 * Process all wait accum entries in the message.
+	 */
+	for (i = 0; i < msg->m_nentries; i++, m_entry++)
+	{
+		entry = pgstat_get_wa_entry(hash, m_entry->wait_event_info);
+
+		if (!entry)
+		{
+			entry = pgstat_add_wa_entry(hash, m_entry->wait_event_info);
+			memcpy(entry, m_entry, sizeof(PgStat_WaitAccumEntry));
+		}
+		else
+		{
+			/*
+			 * Otherwise add the values to the existing entry.
+			 */
+			entry->calls += m_entry->calls;
+			INSTR_TIME_ADD(entry->times, m_entry->times);
+		}
+	}
+}
+
+/* ----------
  * pgstat_recv_recoveryconflict() -
  *
  *	Process a RECOVERYCONFLICT message.
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index fb0bf44..397d455 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -594,6 +594,25 @@ LWLockNewTrancheId(void)
 }
 
 /*
+ * Get a last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
+/*
  * Register a tranche ID in the lookup table for the current process.  This
  * routine will save a pointer to the tranche name passed as an argument,
  * so the name should be allocated in a backend-lifetime context
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 05240bf..8864fe1 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1970,3 +1970,82 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS)
 	PG_RETURN_DATUM(HeapTupleGetDatum(
 									  heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	PgStat_WaitAccumStats *waitaccum_stats;
+	PgStat_WaitAccumEntry *entry;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Get the accumulated wait event statistics */
+	waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+	{
+		entry = waitaccum_stats->hash->entries[i].entry;
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 31a5ef0..1fb088e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1407,6 +1407,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for wait events."),
+			NULL
+		},
+		&pgstat_track_wait_timing,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0fc23e3..52f511c 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -556,6 +556,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wait_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 58ea5b9..487bc65 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5152,6 +5152,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,sslcompression,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '2228',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fe076d8..32c4b5f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -59,6 +59,7 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_ANALYZE,
 	PGSTAT_MTYPE_ARCHIVER,
 	PGSTAT_MTYPE_BGWRITER,
+	PGSTAT_MTYPE_WAITACCUM,
 	PGSTAT_MTYPE_FUNCSTAT,
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -119,7 +120,8 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER
+	RESET_BGWRITER,
+	RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
@@ -423,6 +425,33 @@ typedef struct PgStat_MsgBgWriter
 } PgStat_MsgBgWriter;
 
 /* ----------
+ * PgStat_WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+	uint32			wait_event_info;
+	PgStat_Counter	calls;
+	instr_time		times;
+} PgStat_WaitAccumEntry;
+
+/* ----------
+ * PgStat_MsgWaitAccum	Sent by backend/background's process to update statistics.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES	\
+	((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+	 / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+	PgStat_MsgHdr m_hdr;
+
+	int m_nentries;
+	PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
+
+/* ----------
  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
  * ----------
  */
@@ -564,6 +593,7 @@ typedef union PgStat_Msg
 	PgStat_MsgAnalyze msg_analyze;
 	PgStat_MsgArchiver msg_archiver;
 	PgStat_MsgBgWriter msg_bgwriter;
+	PgStat_MsgWaitAccum msg_waitaccum;
 	PgStat_MsgFuncstat msg_funcstat;
 	PgStat_MsgFuncpurge msg_funcpurge;
 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -581,7 +611,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9E
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -711,6 +741,30 @@ typedef struct PgStat_GlobalStats
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
+typedef struct WAEntry
+{
+	int key;
+	PgStat_WaitAccumEntry *entry;
+	struct WAEntry *next;
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461
+
+typedef struct WAHash
+{
+	WAEntry entries[WA_BUCKET_SIZE];
+	WAEntry *buckets[WA_BUCKET_SIZE];
+	int entry_num;
+} WAHash;
+
+/*
+ * WaitAccum statistics kept in the stats collector
+ */
+typedef struct PgStat_WaitAccumStats
+{
+	WAHash *hash;
+} PgStat_WaitAccumStats;
+
 
 /* ----------
  * Backend types
@@ -787,6 +841,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -808,6 +864,8 @@ typedef enum
 	WAIT_EVENT_GSS_OPEN_SERVER,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_GSS_OPEN_SERVER
+
 /* ----------
  * Wait Events - IPC
  *
@@ -856,6 +914,8 @@ typedef enum
 	WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_SYNC_REP
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -869,6 +929,8 @@ typedef enum
 	WAIT_EVENT_RECOVERY_APPLY_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_RECOVERY_APPLY_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -947,6 +1009,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_WAL_WRITE
+
 /* ----------
  * Command type for progress reporting purposes
  * ----------
@@ -1203,6 +1267,8 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
+extern WAHash *wa_hash;
+extern instr_time waitStart;
 
 /* ----------
  * GUC parameters
@@ -1210,6 +1276,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern bool pgstat_track_activities;
 extern bool pgstat_track_counts;
+extern bool pgstat_track_wait_timing;
 extern int	pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
@@ -1227,6 +1294,7 @@ extern PgStat_MsgBgWriter BgWriterStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
 /* ----------
  * Functions called from postmaster
  * ----------
@@ -1314,6 +1382,50 @@ extern char *pgstat_clip_activity(const char *raw_activity);
  * initialized.
  * ----------
  */
+
+static inline void
+pgstat_report_waitaccum_start()
+{
+	if (wa_hash == NULL)
+		return; 
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(waitStart);
+	}
+}
+
+static inline void
+pgstat_report_waitaccum_end(uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+	instr_time  diff;
+
+	if (wa_hash == NULL)
+		return; 
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(diff);
+		INSTR_TIME_SUBTRACT(diff, waitStart);
+	}
+
+	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
+
+	if (!entry)
+	{
+		printf("wait_event_info: %u.\n", wait_event_info);
+		fflush(stdout);
+		return;
+	}
+
+	entry->calls++;
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+}
+
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
 {
@@ -1327,6 +1439,8 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	proc->wait_event_info = wait_event_info;
+
+	pgstat_report_waitaccum_start();
 }
 
 /* ----------
@@ -1346,6 +1460,8 @@ pgstat_report_wait_end(void)
 	if (!pgstat_track_activities || !proc)
 		return;
 
+	pgstat_report_waitaccum_end(proc->wait_event_info);
+
 	/*
 	 * Since this is a four-byte field which is always read and written as
 	 * four-bytes, updates are atomic.
@@ -1353,6 +1469,7 @@ pgstat_report_wait_end(void)
 	proc->wait_event_info = 0;
 }
 
+
 /* nontransactional event counts are simple enough to inline */
 
 #define pgstat_count_heap_scan(rel)									\
@@ -1420,6 +1537,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1434,5 +1552,6 @@ extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index f627dfe..5c69e08 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -184,6 +184,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 281e1db..4d03d80 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 210e9cd..da9c288 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2053,6 +2053,11 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
1.8.3.1

0002-POC-Changed-measuring-method-of-wait-event-timed-v2.patchapplication/octet-stream; name=0002-POC-Changed-measuring-method-of-wait-event-timed-v2.patchDownload
From 5c3421d9634101fda4646d27681e4c9d14bca176 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Wed, 30 Oct 2019 05:38:11 +0000
Subject: [PATCH 2/2] [POC] Changed measuring method of wait event time from
 INSTR_TIME (which uses gettimeofday or clock_gettime) to rdtsc. This might
 reduce the overhead of the measurement itself.

Support for things like converting clock cycles to actual time or error
handling is not currently implemented.
---
 src/backend/postmaster/pgstat.c      |  8 ++++----
 src/backend/utils/adt/pgstatfuncs.c  |  2 +-
 src/include/pgstat.h                 | 14 +++++++-------
 src/include/portability/instr_time.h | 23 +++++++++++++++++++++++
 4 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 5a07a81..b677042 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -157,7 +157,7 @@ static bool pgStatRunningInCollector = false;
 
 WAHash *wa_hash;
 
-instr_time waitStart;
+uint64 waitStart;
 
 /*
  * Structures in which backends store per-table info that's waiting to be
@@ -4578,7 +4578,7 @@ pgstat_send_waitaccum()
 
 		/* Clear wait events information. */
 		entry->calls = 0;
-		INSTR_TIME_SET_ZERO(entry->times);
+		entry->times = 0;
 	}
 
 	if (msg.m_nentries > 0)
@@ -6447,7 +6447,7 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 			entry = hash->entries[i].entry;
 
 			entry->calls = 0;
-			INSTR_TIME_SET_ZERO(entry->times);
+			entry->times = 0;
 		}
 	}
 	/*
@@ -6660,7 +6660,7 @@ pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
 			 * Otherwise add the values to the existing entry.
 			 */
 			entry->calls += m_entry->calls;
-			INSTR_TIME_ADD(entry->times, m_entry->times);
+			entry->times += m_entry->times;
 		}
 	}
 }
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 8864fe1..5ea4140 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -2039,7 +2039,7 @@ pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
 
 		values[2] = Int64GetDatum(entry->calls);
 
-		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+		values[3] = UInt64GetDatum(entry->times);
 
 		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
 	}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 32c4b5f..2515893 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -432,7 +432,7 @@ typedef struct PgStat_WaitAccumEntry
 {
 	uint32			wait_event_info;
 	PgStat_Counter	calls;
-	instr_time		times;
+	uint64			times;
 } PgStat_WaitAccumEntry;
 
 /* ----------
@@ -1268,7 +1268,7 @@ typedef struct PgStat_FunctionCallUsage
 } PgStat_FunctionCallUsage;
 
 extern WAHash *wa_hash;
-extern instr_time waitStart;
+extern uint64 waitStart;
 
 /* ----------
  * GUC parameters
@@ -1391,7 +1391,7 @@ pgstat_report_waitaccum_start()
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(waitStart);
+		waitStart = rdtsc();
 	}
 }
 
@@ -1399,15 +1399,15 @@ static inline void
 pgstat_report_waitaccum_end(uint32 wait_event_info)
 {
 	PgStat_WaitAccumEntry *entry;
-	instr_time  diff;
+	uint64		diff = 0;
 
 	if (wa_hash == NULL)
 		return; 
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(diff);
-		INSTR_TIME_SUBTRACT(diff, waitStart);
+		diff = rdtsc();
+		diff -= waitStart;
 	}
 
 	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
@@ -1422,7 +1422,7 @@ pgstat_report_waitaccum_end(uint32 wait_event_info)
 	entry->calls++;
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_ADD(entry->times, diff);
+		entry->times += diff;
 	}
 }
 
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 0f5c161..668fa63 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -57,6 +57,10 @@
 
 #ifndef WIN32
 
+#if defined(__x86_64__) || defined(__i386__)
+#include <x86intrin.h>
+#endif
+
 #ifdef HAVE_CLOCK_GETTIME
 
 /* Use clock_gettime() */
@@ -209,6 +213,8 @@ typedef struct timeval instr_time;
 
 #else							/* WIN32 */
 
+#include <intrin.h>
+
 /* Use QueryPerformanceCounter() */
 
 typedef LARGE_INTEGER instr_time;
@@ -254,3 +260,20 @@ GetTimerFrequency(void)
 	(INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
 
 #endif							/* INSTR_TIME_H */
+
+
+#ifndef RDTSC_H_
+#define RDTSC_H_
+
+static inline uint64 rdtsc() {
+	uint64 result;
+#if defined(__x86_64__) || defined(__i386__) || defined(WIN32)
+	result = __rdtsc();
+#else
+	result = 0;
+#endif
+
+	return result;
+}
+
+#endif
-- 
1.8.3.1

#42Pavel Stehule
pavel.stehule@gmail.com
In reply to: Tsunakawa, Takayuki (#39)
Re: [Proposal] Add accumulated statistics

On Tue, Jan 15, 2019 at 2:14 AM Tsunakawa, Takayuki <
tsunakawa.takay@jp.fujitsu.com> wrote:

From: Pavel Stehule [mailto:pavel.stehule@gmail.com]

the cumulated lock statistics maybe doesn't help with debugging - but it
is very good indicator of database (in production usage) health.

I think it will help both. But I don't think the sampling will be as
helpful as the precise lock statistics accumulation, because the sampling
doesn't tell us exactly how effective our improvements to the PostgreSQL
code are. I remember PG developers used LOCK_STATS to see how many (or
what ratio of) lwlock waits decreased after applying patches.

We can use the cumulated lock stats like:

1. SELECT * FROM pg_session_waits;
2. Run a benchmark.
3. SELECT * FROM pg_session_waits;
4. Calculate the difference between 1 and 3.

Or, reset the wait stats before the benchmark run and just use the stats
as-is.
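
Concretely, the delta in step 4 can be computed with something like the
following sketch; it assumes the pg_stat_waitaccum view from the patches in
this thread (the pg_session_waits name above is just illustrative):

-- step 1: snapshot the accumulated wait stats before the benchmark
CREATE TEMP TABLE wa_before AS SELECT * FROM pg_stat_waitaccum;

-- step 2: run the benchmark

-- steps 3 and 4: per-event difference against the snapshot
SELECT w.wait_event_type, w.wait_event,
       w.calls - b.calls AS calls_delta,
       w.times - b.times AS times_delta
  FROM pg_stat_waitaccum w
  JOIN wa_before b USING (wait_event_type, wait_event)
 ORDER BY times_delta DESC;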

I'd like to know why you thought the cumulated wait stats isn't helpful
for debugging.

I don't remember my exact thoughts, and maybe I worded it badly - usually
lock times are very small, and very unstable if you look at too detailed a
level. But if you use values aggregated over some longer time window, then
those values can be stable and very interesting. Moreover, lock time usually
correlates with database (application) health.

Like you, I don't think sampled values are very helpful.
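
For comparison, the sampling approach discussed here usually just means
polling pg_stat_activity at some interval, for example:

SELECT wait_event_type, wait_event, count(*)
  FROM pg_stat_activity
 WHERE wait_event IS NOT NULL
 GROUP BY wait_event_type, wait_event;

which only shows what backends are waiting on at that instant, so short but
frequent waits are easy to miss.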

Regards

Pavel


Regards
Takayuki Tsunakawa

#43Michael Paquier
michael@paquier.xyz
In reply to: imai.yoshikazu@fujitsu.com (#41)
Re: [Proposal] Add accumulated statistics

On Wed, Oct 30, 2019 at 05:55:28AM +0000, imai.yoshikazu@fujitsu.com wrote:

And here is the patch which counts the wait events and measures the wait event time. It is currently a POC and has several things to be improved.

Please note the patch tester complains about the latest patch:
pgstatfuncs.c: In function ‘pg_stat_get_waitaccum’:
pgstatfuncs.c:2018:3: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement]
Datum values[PG_STAT_GET_WAITACCUM_COLS];

I am moving it to next CF, marking it as waiting on author.
--
Michael

#44imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: imai.yoshikazu@fujitsu.com (#41)
2 attachment(s)
RE: [Proposal] Add accumulated statistics

On Sun, Dec 1, 2019 at 1:10 AM, Michael Paquier wrote:

On Wed, Oct 30, 2019 at 05:55:28AM +0000, imai(dot)yoshikazu(at)fujitsu(dot)com wrote:

And here is the patch which counts the wait events and measures the wait event time. It is currently a POC and has several things to be improved.

Please note the patch tester complains about the latest patch:
pgstatfuncs.c: In function ‘pg_stat_get_waitaccum’:
pgstatfuncs.c:2018:3: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement]
Datum values[PG_STAT_GET_WAITACCUM_COLS];

I am moving it to next CF, marking it as waiting on author.

Sorry for the late reply.

Unfortunately, I couldn't retrieve your original mail even when I used "Resend email"
on the pgsql-hackers archive, so I am replying to my old mail instead.

I attach the patch in which I fixed the warning.
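
For reference, the warning came from declaring the values/nulls arrays after
the "entry = ..." assignment inside the per-entry loop of
pg_stat_get_waitaccum(); the fix is simply to move the declarations to the
top of the block, roughly like this:

	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
	{
		/* declarations first, to satisfy -Wdeclaration-after-statement */
		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];

		MemSet(values, 0, sizeof(values));
		MemSet(nulls, 0, sizeof(nulls));

		/* the assignment that previously came before the declarations */
		entry = waitaccum_stats->hash->entries[i].entry;

		/* ... rest of the loop body unchanged ... */
	}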

--
Yoshikazu Imai

Attachments:

0001-Adding-the-pg_stat_waitaccum-view-which-shows-counts-v3.patchapplication/octet-stream; name=0001-Adding-the-pg_stat_waitaccum-view-which-shows-counts-v3.patchDownload
From 0694b70d681186ad10af2837560bac8ab1cba8b2 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Thu, 19 Dec 2019 04:28:28 +0000
Subject: [PATCH 1/2] Adding the pg_stat_waitaccum view, which shows counts and
 durations of each wait event. Each backend/background process counts and
 measures the time of wait events in every pgstat_report_wait_start and
 pgstat_report_wait_end. They store this info in local variables and send it
 to the statistics collector, from which we can then retrieve it.

To reduce overhead, I implemented a simple static hash instead of a
dynamic hash. I also implemented track_wait_timing, which
determines whether wait event durations are collected or not.

On Windows, this feature might not work correctly, because
it currently initializes local variables in pg_stat_init, which are not
passed to forked processes on Windows.
---
 src/backend/catalog/system_views.sql          |   8 +
 src/backend/postmaster/pgstat.c               | 344 ++++++++++++++++++++++++++
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/adt/pgstatfuncs.c           |  80 ++++++
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          | 123 ++++++++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/test/regress/expected/rules.out           |   5 +
 11 files changed, 598 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index f7800f0..976ad98 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -951,6 +951,14 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
+CREATE VIEW pg_stat_waitaccum AS
+    SELECT
+		S.wait_event_type AS wait_event_type,
+		S.wait_event AS wait_event,
+		S.calls AS calls,
+		S.times AS times
+	FROM pg_stat_get_waitaccum(NULL) AS S;
+
 CREATE VIEW pg_stat_progress_vacuum AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index e931512..1454e77 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -123,6 +123,7 @@
  */
 bool		pgstat_track_activities = false;
 bool		pgstat_track_counts = false;
+bool		pgstat_track_wait_timing = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
 int			pgstat_track_activity_query_size = 1024;
 
@@ -153,6 +154,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -255,6 +260,7 @@ static int	localNumBackends = 0;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -280,6 +286,8 @@ static pid_t pgstat_forkexec(void);
 #endif
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
@@ -287,8 +295,11 @@ static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 												 Oid tableoid, bool create);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
+static void pgstat_write_waitaccum_statsfile(FILE *fpout);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
+static bool pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+											FILE *fpin, const char *statfile);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
@@ -324,6 +335,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
@@ -331,6 +343,27 @@ static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *current;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	current = hash->buckets[bucket];
+
+	while (current != NULL)
+	{
+		if (current->key == key)
+			return current->entry;
+
+		current = current->next;
+	}
+
+	return NULL;
+}
+
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -602,6 +635,8 @@ retry2:
 
 	pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+	pgstat_init_waitaccum_hash(&wa_hash);
+
 	return;
 
 startup_failed:
@@ -624,6 +659,75 @@ startup_failed:
 	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *prev;
+	WAEntry *new;
+	int bucket = key % WA_BUCKET_SIZE;
+	
+	prev = hash->buckets[bucket];
+
+	while (prev != NULL && prev->next != NULL)
+		prev = prev->next;
+	
+	new = &hash->entries[hash->entry_num++];
+	new->key = key;
+	new->entry = MemoryContextAllocZero(TopMemoryContext, (sizeof(PgStat_WaitAccumEntry)));
+
+	if (prev != NULL)
+		prev->next = new;
+	else
+		hash->buckets[bucket] = new;
+
+	return new->entry;
+}
+
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+
+	entry = pgstat_add_wa_entry(hash, wait_event_info);
+	entry->wait_event_info = wait_event_info;
+}
+
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+	uint32 i;
+	int last_tranche_id;
+
+	*hash = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+
+	last_tranche_id = LWLockGetLastTrancheId();
+	for (i = PG_WAIT_LWLOCK + 1; i <= (PG_WAIT_LWLOCK | last_tranche_id); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = (PG_WAIT_LOCK | LOCKTAG_RELATION); i <= (PG_WAIT_LOCK | LOCKTAG_LAST_TYPE); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_BUFFER_PIN; i <= PG_WAIT_BUFFER_PIN; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_ACTIVITY; i <= PG_WAIT_ACTIVITY_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_CLIENT; i <= PG_WAIT_CLIENT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	/* TODO: also cover wait events registered by extensions */
+
+	for (i = PG_WAIT_IPC; i <= PG_WAIT_IPC_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_TIMEOUT; i <= PG_WAIT_TIMEOUT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IO; i <= PG_WAIT_IO_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -904,6 +1008,9 @@ pgstat_report_stat(bool force)
 
 	/* Now, send function statistics */
 	pgstat_send_funcstats();
+
+	/* Send wait accumulative statistics */
+	pgstat_send_waitaccum();
 }
 
 /*
@@ -1334,6 +1441,8 @@ pgstat_reset_shared_counters(const char *target)
 		msg.m_resettarget = RESET_ARCHIVER;
 	else if (strcmp(target, "bgwriter") == 0)
 		msg.m_resettarget = RESET_BGWRITER;
+	else if (strcmp(target, "waitaccum") == 0)
+		msg.m_resettarget = RESET_WAITACCUM;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2618,6 +2727,22 @@ pgstat_fetch_global(void)
 	return &globalStats;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	a pointer to the wait accum statistics struct.
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+	backend_read_statsfile();
+
+	return &waitAccumStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -4407,6 +4532,53 @@ pgstat_send_bgwriter(void)
 	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *	Send the accumulated wait event statistics to the collector.
+ * ----------
+ */
+void
+pgstat_send_waitaccum()
+{
+	PgStat_MsgWaitAccum msg;
+	PgStat_WaitAccumEntry *entry;
+	int i;
+
+	if (wa_hash == NULL)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+	msg.m_nentries = 0;
+
+	for (i = 0; i < wa_hash->entry_num; i++)
+	{
+		entry = wa_hash->entries[i].entry;
+
+		/* Send only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Prepare and send the message
+		 */
+		memcpy(&msg.m_entry[msg.m_nentries], entry, sizeof(PgStat_WaitAccumEntry));
+		if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+		{
+			pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+						msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+			msg.m_nentries = 0;
+		}
+
+		/* Clear wait events information. */
+		entry->calls = 0;
+		INSTR_TIME_SET_ZERO(entry->times);
+	}
+
+	if (msg.m_nentries > 0)
+		pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+					msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -4599,6 +4771,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
 					break;
 
+				case PGSTAT_MTYPE_WAITACCUM:
+					pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+					break;
+
 				case PGSTAT_MTYPE_FUNCSTAT:
 					pgstat_recv_funcstat(&msg.msg_funcstat, len);
 					break;
@@ -4869,6 +5045,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
+	pgstat_write_waitaccum_statsfile(fpout);
+
 	/*
 	 * Walk through the database table.
 	 */
@@ -5074,6 +5252,43 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 }
 
 /* ----------
+ * pgstat_write_waitaccum_statsfile() -
+ *		Write the waitAccumStats to the stat file.
+ *
+ * ----------
+ */
+static void
+pgstat_write_waitaccum_statsfile(FILE *fpout)
+{
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			rc;
+	int			i;
+
+	/*
+	 * Walk through the waitaccum hash.
+	 */
+	for (i = 0; i < hash->entry_num; i++)
+	{
+		entry = hash->entries[i].entry;
+
+		/* Write only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Write out the wait event entry.  It contains no pointers, so it
+		 * can simply be written out as-is.
+		 */
+		fputc('D', fpout);
+		rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
+	}	
+
+	fputc('E', fpout);
+}
+
+/* ----------
  * pgstat_read_statsfiles() -
  *
  *	Reads in some existing statistics collector files and returns the
@@ -5126,6 +5341,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	memset(&globalStats, 0, sizeof(globalStats));
 	memset(&archiverStats, 0, sizeof(archiverStats));
+	waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext, sizeof(WAHash));
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
@@ -5196,6 +5412,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 		goto done;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&waitAccumStats, fpin, statfile))
+		goto done;
+
 	/*
 	 * We found an existing collector stats file. Read it and put all the
 	 * hashtable entries into place.
@@ -5494,10 +5713,13 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 	PgStat_StatDBEntry dbentry;
 	PgStat_GlobalStats myGlobalStats;
 	PgStat_ArchiverStats myArchiverStats;
+	PgStat_WaitAccumStats myWaitAccumStats;
 	FILE	   *fpin;
 	int32		format_id;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
+	myWaitAccumStats.hash = MemoryContextAllocZero(CurrentMemoryContext, sizeof(WAHash));
+
 	/*
 	 * Try to open the stats file.  As above, anything but ENOENT is worthy of
 	 * complaining about.
@@ -5548,6 +5770,9 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 		return false;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&myWaitAccumStats, fpin, statfile))
+		return false;
+
 	/* By default, we're going to return the timestamp of the global file. */
 	*ts = myGlobalStats.stats_timestamp;
 
@@ -5601,6 +5826,75 @@ done:
 	return true;
 }
 
+/* ----------
+ * pgstat_read_waitaccum_statsfile() -
+ *
+ *	Reads the waitaccum stats from the file.
+ *	If an error happens while reading the file, return false. Otherwise return true.
+ *
+ * ----------
+ */
+static bool
+pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+								FILE *fpin, const char *statfile)
+{
+	PgStat_WaitAccumEntry *entry;
+	PgStat_WaitAccumEntry buf;
+	WAHash *hash = stats->hash;
+
+	/*
+	 * Read and put all the hashtable entries into place.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
+				/*
+				 * 'D'	A PgStat_WaitAccumEntry struct describing a wait
+				 * event entry follows.
+				 */
+			case 'D':
+				if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+									 != sizeof(PgStat_WaitAccumEntry))
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				entry = pgstat_get_wa_entry(hash, buf.wait_event_info);
+
+				if (entry)
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				/*
+				 * Add to the wait event hash
+				 */
+				entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+				memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+
+				break;
+
+			case 'E':
+				return true;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				return false;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -6110,7 +6404,20 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_WAITACCUM)
+	{
+		PgStat_WaitAccumEntry *entry;
+		WAHash *hash = waitAccumStats.hash;
+		int i;
+
+		for (i = 0; i < hash->entry_num; i++)
+		{
+			entry = hash->entries[i].entry;
 
+			entry->calls = 0;
+			INSTR_TIME_SET_ZERO(entry->times);
+		}
+	}
 	/*
 	 * Presumably the sender of this message validated the target, don't
 	 * complain here if it's not valid
@@ -6290,6 +6597,43 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *	Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+	PgStat_WaitAccumEntry *m_entry = &(msg->m_entry[0]);
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			i;
+
+	/*
+	 * Process all wait event entries in the message.
+	 */
+	for (i = 0; i < msg->m_nentries; i++, m_entry++)
+	{
+		entry = pgstat_get_wa_entry(hash, m_entry->wait_event_info);
+
+		if (!entry)
+		{
+			entry = pgstat_add_wa_entry(hash, m_entry->wait_event_info);
+			memcpy(entry, m_entry, sizeof(PgStat_WaitAccumEntry));
+		}
+		else
+		{
+			/*
+			 * Otherwise add the values to the existing entry.
+			 */
+			entry->calls += m_entry->calls;
+			INSTR_TIME_ADD(entry->times, m_entry->times);
+		}
+	}
+}
+
+/* ----------
  * pgstat_recv_recoveryconflict() -
  *
  *	Process a RECOVERYCONFLICT message.
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 18e3843..8f5b0ba 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -594,6 +594,25 @@ LWLockNewTrancheId(void)
 }
 
 /*
+ * Get the last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
+/*
  * Register a tranche ID in the lookup table for the current process.  This
  * routine will save a pointer to the tranche name passed as an argument,
  * so the name should be allocated in a backend-lifetime context
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 05240bf..b408db3 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1970,3 +1970,83 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS)
 	PG_RETURN_DATUM(HeapTupleGetDatum(
 									  heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	PgStat_WaitAccumStats *waitaccum_stats;
+	PgStat_WaitAccumEntry *entry;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Get the accumulated wait event statistics */
+	waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+	{
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		entry = waitaccum_stats->hash->entries[i].entry;
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8d951ce..ebae427 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1424,6 +1424,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for wait events."),
+			NULL
+		},
+		&pgstat_track_wait_timing,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 087190c..070c213 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -569,6 +569,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wait_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index ac8f64b..9d7c2e8 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5152,6 +5152,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,sslcompression,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '2228',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index fe076d8..32c4b5f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -59,6 +59,7 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_ANALYZE,
 	PGSTAT_MTYPE_ARCHIVER,
 	PGSTAT_MTYPE_BGWRITER,
+	PGSTAT_MTYPE_WAITACCUM,
 	PGSTAT_MTYPE_FUNCSTAT,
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -119,7 +120,8 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER
+	RESET_BGWRITER,
+	RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
@@ -423,6 +425,33 @@ typedef struct PgStat_MsgBgWriter
 } PgStat_MsgBgWriter;
 
 /* ----------
+ * PgStat_WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+	uint32			wait_event_info;
+	PgStat_Counter	calls;
+	instr_time		times;
+} PgStat_WaitAccumEntry;
+
+/* ----------
+ * PgStat_MsgWaitAccum	Sent by backend/background's process to update statistics.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES	\
+	((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+	 / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+	PgStat_MsgHdr m_hdr;
+
+	int m_nentries;
+	PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
+
+/* ----------
  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
  * ----------
  */
@@ -564,6 +593,7 @@ typedef union PgStat_Msg
 	PgStat_MsgAnalyze msg_analyze;
 	PgStat_MsgArchiver msg_archiver;
 	PgStat_MsgBgWriter msg_bgwriter;
+	PgStat_MsgWaitAccum msg_waitaccum;
 	PgStat_MsgFuncstat msg_funcstat;
 	PgStat_MsgFuncpurge msg_funcpurge;
 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -581,7 +611,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9E
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -711,6 +741,30 @@ typedef struct PgStat_GlobalStats
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
+typedef struct WAEntry
+{
+	int key;
+	PgStat_WaitAccumEntry *entry;
+	struct WAEntry *next;
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461
+
+typedef struct WAHash
+{
+	WAEntry entries[WA_BUCKET_SIZE];
+	WAEntry *buckets[WA_BUCKET_SIZE];
+	int entry_num;
+} WAHash;
+
+/*
+ * WaitAccum statistics kept in the stats collector
+ */
+typedef struct PgStat_WaitAccumStats
+{
+	WAHash *hash;
+} PgStat_WaitAccumStats;
+
 
 /* ----------
  * Backend types
@@ -787,6 +841,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -808,6 +864,8 @@ typedef enum
 	WAIT_EVENT_GSS_OPEN_SERVER,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_GSS_OPEN_SERVER
+
 /* ----------
  * Wait Events - IPC
  *
@@ -856,6 +914,8 @@ typedef enum
 	WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_SYNC_REP
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -869,6 +929,8 @@ typedef enum
 	WAIT_EVENT_RECOVERY_APPLY_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_RECOVERY_APPLY_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -947,6 +1009,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_WAL_WRITE
+
 /* ----------
  * Command type for progress reporting purposes
  * ----------
@@ -1203,6 +1267,8 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
+extern WAHash *wa_hash;
+extern instr_time waitStart;
 
 /* ----------
  * GUC parameters
@@ -1210,6 +1276,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern bool pgstat_track_activities;
 extern bool pgstat_track_counts;
+extern bool pgstat_track_wait_timing;
 extern int	pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
@@ -1227,6 +1294,7 @@ extern PgStat_MsgBgWriter BgWriterStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
 /* ----------
  * Functions called from postmaster
  * ----------
@@ -1314,6 +1382,50 @@ extern char *pgstat_clip_activity(const char *raw_activity);
  * initialized.
  * ----------
  */
+
+static inline void
+pgstat_report_waitaccum_start()
+{
+	if (wa_hash == NULL)
+		return; 
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(waitStart);
+	}
+}
+
+static inline void
+pgstat_report_waitaccum_end(uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+	instr_time  diff;
+
+	if (wa_hash == NULL)
+		return; 
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(diff);
+		INSTR_TIME_SUBTRACT(diff, waitStart);
+	}
+
+	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
+
+	if (!entry)
+	{
+		printf("wait_event_info: %u.\n", wait_event_info);
+		fflush(stdout);
+		return;
+	}
+
+	entry->calls++;
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+}
+
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
 {
@@ -1327,6 +1439,8 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	proc->wait_event_info = wait_event_info;
+
+	pgstat_report_waitaccum_start();
 }
 
 /* ----------
@@ -1346,6 +1460,8 @@ pgstat_report_wait_end(void)
 	if (!pgstat_track_activities || !proc)
 		return;
 
+	pgstat_report_waitaccum_end(proc->wait_event_info);
+
 	/*
 	 * Since this is a four-byte field which is always read and written as
 	 * four-bytes, updates are atomic.
@@ -1353,6 +1469,7 @@ pgstat_report_wait_end(void)
 	proc->wait_event_info = 0;
 }
 
+
 /* nontransactional event counts are simple enough to inline */
 
 #define pgstat_count_heap_scan(rel)									\
@@ -1420,6 +1537,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1434,5 +1552,6 @@ extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index f9450da..cf2e953 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -184,6 +184,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 281e1db..4d03d80 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 80a0782..b524fae 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2058,6 +2058,11 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
1.8.3.1

0002-POC-Changed-measuring-method-of-wait-event-timed-fro-v3.patchapplication/octet-stream; name=0002-POC-Changed-measuring-method-of-wait-event-timed-fro-v3.patchDownload
From 00cac6dce8b8a533e1fad33035a41aafc79d657c Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Thu, 19 Dec 2019 04:47:16 +0000
Subject: [PATCH 2/2] [POC] Changed measuring method of wait event time from
 INSTR_TIME (which uses gettimeofday or clock_gettime) to rdtsc. This might
 reduce the overhead of the measurement itself.

Support for things like converting clock cycles to actual time or error
handling is not currently implemented.
---
 src/backend/postmaster/pgstat.c      |  8 ++++----
 src/backend/utils/adt/pgstatfuncs.c  |  2 +-
 src/include/pgstat.h                 | 14 +++++++-------
 src/include/portability/instr_time.h | 23 +++++++++++++++++++++++
 4 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 1454e77..f9d72e7 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -156,7 +156,7 @@ static bool pgStatRunningInCollector = false;
 
 WAHash *wa_hash;
 
-instr_time waitStart;
+uint64 waitStart;
 
 /*
  * Structures in which backends store per-table info that's waiting to be
@@ -4571,7 +4571,7 @@ pgstat_send_waitaccum()
 
 		/* Clear wait events information. */
 		entry->calls = 0;
-		INSTR_TIME_SET_ZERO(entry->times);
+		entry->times = 0;
 	}
 
 	if (msg.m_nentries > 0)
@@ -6415,7 +6415,7 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 			entry = hash->entries[i].entry;
 
 			entry->calls = 0;
-			INSTR_TIME_SET_ZERO(entry->times);
+			entry->times = 0;
 		}
 	}
 	/*
@@ -6628,7 +6628,7 @@ pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
 			 * Otherwise add the values to the existing entry.
 			 */
 			entry->calls += m_entry->calls;
-			INSTR_TIME_ADD(entry->times, m_entry->times);
+			entry->times += m_entry->times;
 		}
 	}
 }
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index b408db3..f759c7d 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -2040,7 +2040,7 @@ pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
 
 		values[2] = Int64GetDatum(entry->calls);
 
-		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+		values[3] = UInt64GetDatum(entry->times);
 
 		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
 	}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 32c4b5f..2515893 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -432,7 +432,7 @@ typedef struct PgStat_WaitAccumEntry
 {
 	uint32			wait_event_info;
 	PgStat_Counter	calls;
-	instr_time		times;
+	uint64			times;
 } PgStat_WaitAccumEntry;
 
 /* ----------
@@ -1268,7 +1268,7 @@ typedef struct PgStat_FunctionCallUsage
 } PgStat_FunctionCallUsage;
 
 extern WAHash *wa_hash;
-extern instr_time waitStart;
+extern uint64 waitStart;
 
 /* ----------
  * GUC parameters
@@ -1391,7 +1391,7 @@ pgstat_report_waitaccum_start()
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(waitStart);
+		waitStart = rdtsc();
 	}
 }
 
@@ -1399,15 +1399,15 @@ static inline void
 pgstat_report_waitaccum_end(uint32 wait_event_info)
 {
 	PgStat_WaitAccumEntry *entry;
-	instr_time  diff;
+	uint64		diff = 0;
 
 	if (wa_hash == NULL)
 		return; 
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(diff);
-		INSTR_TIME_SUBTRACT(diff, waitStart);
+		diff = rdtsc();
+		diff -= waitStart;
 	}
 
 	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
@@ -1422,7 +1422,7 @@ pgstat_report_waitaccum_end(uint32 wait_event_info)
 	entry->calls++;
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_ADD(entry->times, diff);
+		entry->times += diff;
 	}
 }
 
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 0f5c161..668fa63 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -57,6 +57,10 @@
 
 #ifndef WIN32
 
+#if defined(__x86_64__) || defined(__i386__)
+#include <x86intrin.h>
+#endif
+
 #ifdef HAVE_CLOCK_GETTIME
 
 /* Use clock_gettime() */
@@ -209,6 +213,8 @@ typedef struct timeval instr_time;
 
 #else							/* WIN32 */
 
+#include <intrin.h>
+
 /* Use QueryPerformanceCounter() */
 
 typedef LARGE_INTEGER instr_time;
@@ -254,3 +260,20 @@ GetTimerFrequency(void)
 	(INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
 
 #endif							/* INSTR_TIME_H */
+
+
+#ifndef RDTSC_H_
+#define RDTSC_H_
+
+static inline uint64 rdtsc() {
+	uint64 result;
+#if defined(__x86_64__) || defined(__i386__) || defined(WIN32)
+	result = __rdtsc();
+#else
+	result = 0;
+#endif
+
+	return result;
+}
+
+#endif
-- 
1.8.3.1
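
As the commit message notes, converting TSC ticks back to wall-clock time is
not implemented in this POC. A minimal sketch of how that conversion could
look, assuming a hypothetical one-shot calibration against INSTR_TIME (the
helper names below do not exist in the patch), might be:

/* Hypothetical sketch: calibrate rdtsc() against the wall clock once,
 * then convert accumulated ticks to microseconds when reporting. */
static double ticks_per_usec = 0;

static void
calibrate_tsc(void)
{
	instr_time	start, duration;
	uint64		tsc_start, tsc_end;

	INSTR_TIME_SET_CURRENT(start);
	tsc_start = rdtsc();
	pg_usleep(100 * 1000);		/* sample the TSC over ~100 ms */
	tsc_end = rdtsc();
	INSTR_TIME_SET_CURRENT(duration);
	INSTR_TIME_SUBTRACT(duration, start);

	ticks_per_usec = (double) (tsc_end - tsc_start) /
		(double) INSTR_TIME_GET_MICROSEC(duration);
}

static uint64
ticks_to_usec(uint64 ticks)
{
	/* fall back to 0 if calibration has not run yet */
	return ticks_per_usec > 0 ? (uint64) (ticks / ticks_per_usec) : 0;
}

Calibrating once keeps the rdtsc() reads cheap on the hot path, at the cost
of ignoring frequency scaling and TSC skew across sockets.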

#45Pavel Stehule
pavel.stehule@gmail.com
In reply to: imai.yoshikazu@fujitsu.com (#44)
Re: [Proposal] Add accumulated statistics for wait event

The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: not tested
Documentation: tested, passed

I like this patch because I used similar functionality some years ago, very successfully. The implementation is fairly simple, and the method used should produce valid results.

The potential problem is the performance impact. Very early tests show an impact of about 3% in the worst case, but I'll try to repeat these tests.

There is some trailing whitespace and there are some useless tabs.

The new status of this patch is: Waiting on Author

#46Imai Yoshikazu
yoshikazu_i443@live.jp
In reply to: Pavel Stehule (#45)
2 attachment(s)
Re: [Proposal] Add accumulated statistics for wait event

On 2020/01/13 4:11, Pavel Stehule wrote:

The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: not tested
Documentation: tested, passed

I like this patch because I used similar functionality some years ago, very successfully. The implementation is fairly simple, and the method used should produce valid results.

Thanks for your review!

The potential problem is the performance impact. Very early tests show an impact of about 3% in the worst case, but I'll try to repeat these tests.

Yes, the performance impact is the main concern. I want to know how it
affects performance across various test cases and environments.

There is some trailing whitespace and there are some useless tabs.

The new status of this patch is: Waiting on Author

I attach v4 patches, which remove that trailing whitespace and those
useless tabs.

--
Yoshikazu Imai

Attachments:

0001-Add-pg_stat_waitaccum-view-v4.patchtext/plain; name=0001-Add-pg_stat_waitaccum-view-v4.patchDownload
From b009b1f8f6be47ae61b5e4538e2730d721ee60db Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Wed, 15 Jan 2020 09:13:19 +0000
Subject: [PATCH v4 1/2] Add pg_stat_waitaccum view.

pg_stat_waitaccum shows the count and duration of each wait event.
Each backend/background process counts and measures the time of wait
events in every pgstat_report_wait_start and pgstat_report_wait_end.
They store that info in local variables and send it to the Statistics
Collector, from which it can be retrieved.

To reduce overhead, I implemented a simple static hash instead of a
dynamic hash. I also implemented track_wait_timing, which determines
whether wait event durations are collected or not.

On Windows, this feature might not work correctly, because it
currently initializes local variables in pg_stat_init, which is not
passed to forked processes on Windows.
---
 src/backend/catalog/system_views.sql          |   8 +
 src/backend/postmaster/pgstat.c               | 344 ++++++++++++++++++++++++++
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/adt/pgstatfuncs.c           |  80 ++++++
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          | 123 ++++++++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/test/regress/expected/rules.out           |   5 +
 11 files changed, 598 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 773edf8..80f2caa 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -957,6 +957,14 @@ CREATE VIEW pg_stat_bgwriter AS
         pg_stat_get_buf_alloc() AS buffers_alloc,
         pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
 
+CREATE VIEW pg_stat_waitaccum AS
+    SELECT
+		S.wait_event_type AS wait_event_type,
+		S.wait_event AS wait_event,
+		S.calls AS calls,
+		S.times AS times
+	FROM pg_stat_get_waitaccum(NULL) AS S;
+
 CREATE VIEW pg_stat_progress_vacuum AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 51c486b..08e10ad 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -123,6 +123,7 @@
  */
 bool		pgstat_track_activities = false;
 bool		pgstat_track_counts = false;
+bool		pgstat_track_wait_timing = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
 int			pgstat_track_activity_query_size = 1024;
 
@@ -153,6 +154,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -255,6 +260,7 @@ static int	localNumBackends = 0;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -280,6 +286,8 @@ static pid_t pgstat_forkexec(void);
 #endif
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
@@ -287,8 +295,11 @@ static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 												 Oid tableoid, bool create);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
+static void pgstat_write_waitaccum_statsfile(FILE *fpout);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
+static bool pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+											FILE *fpin, const char *statfile);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
@@ -324,6 +335,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
@@ -331,6 +343,27 @@ static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *current;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	current = hash->buckets[bucket];
+
+	while (current != NULL)
+	{
+		if (current->key == key)
+			return current->entry;
+
+		current = current->next;
+	}
+
+	return NULL;
+}
+
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -602,6 +635,8 @@ retry2:
 
 	pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+	pgstat_init_waitaccum_hash(&wa_hash);
+
 	return;
 
 startup_failed:
@@ -624,6 +659,75 @@ startup_failed:
 	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *prev;
+	WAEntry *new;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	prev = hash->buckets[bucket];
+
+	while (prev != NULL && prev->next != NULL)
+		prev = prev->next;
+
+	new = &hash->entries[hash->entry_num++];
+	new->key = key;
+	new->entry = MemoryContextAllocZero(TopMemoryContext, (sizeof(PgStat_WaitAccumEntry)));
+
+	if (prev != NULL)
+		prev->next = new;
+	else
+		hash->buckets[bucket] = new;
+
+	return new->entry;
+}
+
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+
+	entry = pgstat_add_wa_entry(hash, wait_event_info);
+	entry->wait_event_info = wait_event_info;
+}
+
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+	uint32 i;
+	int last_tranche_id;
+
+	*hash = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+
+	last_tranche_id = LWLockGetLastTrancheId();
+	for (i = PG_WAIT_LWLOCK + 1; i <= (PG_WAIT_LWLOCK | last_tranche_id); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = (PG_WAIT_LOCK | LOCKTAG_RELATION); i <= (PG_WAIT_LOCK | LOCKTAG_LAST_TYPE); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_BUFFER_PIN; i <= PG_WAIT_BUFFER_PIN; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_ACTIVITY; i <= PG_WAIT_ACTIVITY_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_CLIENT; i <= PG_WAIT_CLIENT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	//do extension stuff
+
+	for (i = PG_WAIT_IPC; i <= PG_WAIT_IPC_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_TIMEOUT; i <= PG_WAIT_TIMEOUT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IO; i <= PG_WAIT_IO_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -904,6 +1008,9 @@ pgstat_report_stat(bool force)
 
 	/* Now, send function statistics */
 	pgstat_send_funcstats();
+
+	/* Send wait accumulative statistics */
+	pgstat_send_waitaccum();
 }
 
 /*
@@ -1334,6 +1441,8 @@ pgstat_reset_shared_counters(const char *target)
 		msg.m_resettarget = RESET_ARCHIVER;
 	else if (strcmp(target, "bgwriter") == 0)
 		msg.m_resettarget = RESET_BGWRITER;
+	else if (strcmp(target, "waitaccum") == 0)
+		msg.m_resettarget = RESET_WAITACCUM;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2618,6 +2727,22 @@ pgstat_fetch_global(void)
 	return &globalStats;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	a pointer to the wait accum statistics struct.
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+	backend_read_statsfile();
+
+	return &waitAccumStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -4410,6 +4535,53 @@ pgstat_send_bgwriter(void)
 	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *
+ * ----------
+ */
+void
+pgstat_send_waitaccum()
+{
+	PgStat_MsgWaitAccum msg;
+	PgStat_WaitAccumEntry *entry;
+	int i;
+
+	if (wa_hash == NULL)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+	msg.m_nentries = 0;
+
+	for (i = 0; i < wa_hash->entry_num; i++)
+	{
+		entry = wa_hash->entries[i].entry;
+
+		/* Send only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Prepare and send the message
+		 */
+		memcpy(&msg.m_entry[msg.m_nentries], entry, sizeof(PgStat_WaitAccumEntry));
+		if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+		{
+			pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+						msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+			msg.m_nentries = 0;
+		}
+
+		/* Clear wait events information. */
+		entry->calls = 0;
+		INSTR_TIME_SET_ZERO(entry->times);
+	}
+
+	if (msg.m_nentries > 0)
+		pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+					msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -4602,6 +4774,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
 					break;
 
+				case PGSTAT_MTYPE_WAITACCUM:
+					pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+					break;
+
 				case PGSTAT_MTYPE_FUNCSTAT:
 					pgstat_recv_funcstat(&msg.msg_funcstat, len);
 					break;
@@ -4872,6 +5048,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
+	pgstat_write_waitaccum_statsfile(fpout);
+
 	/*
 	 * Walk through the database table.
 	 */
@@ -5077,6 +5255,43 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 }
 
 /* ----------
+ * pgstat_write_waitaccum_statsfile() -
+ *		Write the waitAccumStats to the stat file.
+ *
+ * ----------
+ */
+static void
+pgstat_write_waitaccum_statsfile(FILE *fpout)
+{
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			rc;
+	int			i;
+
+	/*
+	 * Walk through the waitaccum hash.
+	 */
+	for (i = 0; i < hash->entry_num; i++)
+	{
+		entry = hash->entries[i].entry;
+
+		/* Write only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Write out the DB entry. We don't write the tables or functions
+		 * pointers, since they're of no use to any other process.
+		 */
+		fputc('D', fpout);
+		rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
+	}
+
+	fputc('E', fpout);
+}
+
+/* ----------
  * pgstat_read_statsfiles() -
  *
  *	Reads in some existing statistics collector files and returns the
@@ -5129,6 +5344,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	memset(&globalStats, 0, sizeof(globalStats));
 	memset(&archiverStats, 0, sizeof(archiverStats));
+	waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext, sizeof(WAHash));
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
@@ -5199,6 +5415,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 		goto done;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&waitAccumStats, fpin, statfile))
+		goto done;
+
 	/*
 	 * We found an existing collector stats file. Read it and put all the
 	 * hashtable entries into place.
@@ -5497,10 +5716,13 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 	PgStat_StatDBEntry dbentry;
 	PgStat_GlobalStats myGlobalStats;
 	PgStat_ArchiverStats myArchiverStats;
+	PgStat_WaitAccumStats myWaitAccumStats;
 	FILE	   *fpin;
 	int32		format_id;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
+	myWaitAccumStats.hash = MemoryContextAllocZero(CurrentMemoryContext, sizeof(WAHash));
+
 	/*
 	 * Try to open the stats file.  As above, anything but ENOENT is worthy of
 	 * complaining about.
@@ -5551,6 +5773,9 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 		return false;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&myWaitAccumStats, fpin, statfile))
+		return false;
+
 	/* By default, we're going to return the timestamp of the global file. */
 	*ts = myGlobalStats.stats_timestamp;
 
@@ -5604,6 +5829,75 @@ done:
 	return true;
 }
 
+/* ----------
+ * pgstat_read_waitaccum_statsfile() -
+ *
+ *	Reads the waitaccum stats from the file.
+ *	If an error happens when reading file, return false. Otherwise return true.
+ *
+ * ----------
+ */
+static bool
+pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+								FILE *fpin, const char *statfile)
+{
+	PgStat_WaitAccumEntry *entry;
+	PgStat_WaitAccumEntry buf;
+	WAHash *hash = stats->hash;
+
+	/*
+	 * Read and put all the hashtable entries into place.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
+				/*
+				 * 'D'	A PgStat_WaitAccumEntry struct describing a database
+				 * follows.
+				 */
+			case 'D':
+				if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+									 != sizeof(PgStat_WaitAccumEntry))
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				entry = pgstat_get_wa_entry(hash, buf.wait_event_info);
+
+				if (entry)
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				/*
+				 * Add to the DB hash
+				 */
+				entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+				memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+
+				break;
+
+			case 'E':
+				return true;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				return false;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -6113,7 +6407,20 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_WAITACCUM)
+	{
+		PgStat_WaitAccumEntry *entry;
+		WAHash *hash = waitAccumStats.hash;
+		int i;
+
+		for (i = 0; i < hash->entry_num; i++)
+		{
+			entry = hash->entries[i].entry;
 
+			entry->calls = 0;
+			INSTR_TIME_SET_ZERO(entry->times);
+		}
+	}
 	/*
 	 * Presumably the sender of this message validated the target, don't
 	 * complain here if it's not valid
@@ -6293,6 +6600,43 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *	Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+	PgStat_WaitAccumEntry *m_entry = &(msg->m_entry[0]);
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			i;
+
+	/*
+	 * Process all function entries in the message.
+	 */
+	for (i = 0; i < msg->m_nentries; i++, m_entry++)
+	{
+		entry = pgstat_get_wa_entry(hash, m_entry->wait_event_info);
+
+		if (!entry)
+		{
+			entry = pgstat_add_wa_entry(hash, m_entry->wait_event_info);
+			memcpy(entry, m_entry, sizeof(PgStat_WaitAccumEntry));
+		}
+		else
+		{
+			/*
+			 * Otherwise add the values to the existing entry.
+			 */
+			entry->calls += m_entry->calls;
+			INSTR_TIME_ADD(entry->times, m_entry->times);
+		}
+	}
+}
+
+/* ----------
  * pgstat_recv_recoveryconflict() -
  *
  *	Process a RECOVERYCONFLICT message.
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index d07ce60..6f4eb19 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -594,6 +594,25 @@ LWLockNewTrancheId(void)
 }
 
 /*
+ * Get a last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
+/*
  * Register a tranche ID in the lookup table for the current process.  This
  * routine will save a pointer to the tranche name passed as an argument,
  * so the name should be allocated in a backend-lifetime context
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 3dbf604..bed7d01 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1974,3 +1974,83 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS)
 	PG_RETURN_DATUM(HeapTupleGetDatum(
 									  heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	PgStat_WaitAccumStats *waitaccum_stats;
+	PgStat_WaitAccumEntry *entry;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Get statistics about the waitaccum process */
+	waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+	{
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		entry = waitaccum_stats->hash->entries[i].entry;
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index e5f8a13..2924472 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1424,6 +1424,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for wait events."),
+			NULL
+		},
+		&pgstat_track_wait_timing,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e1048c0..3a99182 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -570,6 +570,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wait_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 427faa3..4e5a502 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5159,6 +5159,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,sslcompression,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '2228',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index e5a5d02..f90bb44 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -59,6 +59,7 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_ANALYZE,
 	PGSTAT_MTYPE_ARCHIVER,
 	PGSTAT_MTYPE_BGWRITER,
+	PGSTAT_MTYPE_WAITACCUM,
 	PGSTAT_MTYPE_FUNCSTAT,
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -119,7 +120,8 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER
+	RESET_BGWRITER,
+	RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
@@ -423,6 +425,33 @@ typedef struct PgStat_MsgBgWriter
 } PgStat_MsgBgWriter;
 
 /* ----------
+ * PgStat_WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+	uint32			wait_event_info;
+	PgStat_Counter	calls;
+	instr_time		times;
+} PgStat_WaitAccumEntry;
+
+/* ----------
+ * PgStat_MsgWaitAccum	Sent by backend/background's process to update statistics.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES	\
+	((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+	 / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+	PgStat_MsgHdr m_hdr;
+
+	int m_nentries;
+	PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
+
+/* ----------
  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
  * ----------
  */
@@ -564,6 +593,7 @@ typedef union PgStat_Msg
 	PgStat_MsgAnalyze msg_analyze;
 	PgStat_MsgArchiver msg_archiver;
 	PgStat_MsgBgWriter msg_bgwriter;
+	PgStat_MsgWaitAccum msg_waitaccum;
 	PgStat_MsgFuncstat msg_funcstat;
 	PgStat_MsgFuncpurge msg_funcpurge;
 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -581,7 +611,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9E
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -711,6 +741,30 @@ typedef struct PgStat_GlobalStats
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
+typedef struct WAEntry
+{
+	int key;
+	PgStat_WaitAccumEntry *entry;
+	struct WAEntry *next;
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461
+
+typedef struct WAHash
+{
+	WAEntry entries[WA_BUCKET_SIZE];
+	WAEntry *buckets[WA_BUCKET_SIZE];
+	int entry_num;
+} WAHash;
+
+/*
+ * WaitAccum statistics kept in the stats collector
+ */
+typedef struct PgStat_WaitAccumStats
+{
+	WAHash *hash;
+} PgStat_WaitAccumStats;
+
 
 /* ----------
  * Backend types
@@ -787,6 +841,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -808,6 +864,8 @@ typedef enum
 	WAIT_EVENT_GSS_OPEN_SERVER,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_GSS_OPEN_SERVER
+
 /* ----------
  * Wait Events - IPC
  *
@@ -856,6 +914,8 @@ typedef enum
 	WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_SYNC_REP
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -869,6 +929,8 @@ typedef enum
 	WAIT_EVENT_RECOVERY_APPLY_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_RECOVERY_APPLY_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -948,6 +1010,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_WAL_WRITE
+
 /* ----------
  * Command type for progress reporting purposes
  * ----------
@@ -1204,6 +1268,8 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
+extern WAHash *wa_hash;
+extern instr_time waitStart;
 
 /* ----------
  * GUC parameters
@@ -1211,6 +1277,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern bool pgstat_track_activities;
 extern bool pgstat_track_counts;
+extern bool pgstat_track_wait_timing;
 extern int	pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
@@ -1228,6 +1295,7 @@ extern PgStat_MsgBgWriter BgWriterStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
 /* ----------
  * Functions called from postmaster
  * ----------
@@ -1315,6 +1383,50 @@ extern char *pgstat_clip_activity(const char *raw_activity);
  * initialized.
  * ----------
  */
+
+static inline void
+pgstat_report_waitaccum_start()
+{
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(waitStart);
+	}
+}
+
+static inline void
+pgstat_report_waitaccum_end(uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+	instr_time  diff;
+
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(diff);
+		INSTR_TIME_SUBTRACT(diff, waitStart);
+	}
+
+	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
+
+	if (!entry)
+	{
+		printf("wait_event_info: %u.\n", wait_event_info);
+		fflush(stdout);
+		return;
+	}
+
+	entry->calls++;
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+}
+
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
 {
@@ -1328,6 +1440,8 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	proc->wait_event_info = wait_event_info;
+
+	pgstat_report_waitaccum_start();
 }
 
 /* ----------
@@ -1347,6 +1461,8 @@ pgstat_report_wait_end(void)
 	if (!pgstat_track_activities || !proc)
 		return;
 
+	pgstat_report_waitaccum_end(proc->wait_event_info);
+
 	/*
 	 * Since this is a four-byte field which is always read and written as
 	 * four-bytes, updates are atomic.
@@ -1354,6 +1470,7 @@ pgstat_report_wait_end(void)
 	proc->wait_event_info = 0;
 }
 
+
 /* nontransactional event counts are simple enough to inline */
 
 #define pgstat_count_heap_scan(rel)									\
@@ -1421,6 +1538,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1435,5 +1553,6 @@ extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8fda8e4..2149c96 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -184,6 +184,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 5b407e6..bd47ccb 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 62eaf90..82566d0 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2063,6 +2063,11 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
1.8.3.1

0002-POC-Change-measuring-method-of-wait-event-time-fr-v4.patchtext/plain; name=0002-POC-Change-measuring-method-of-wait-event-time-fr-v4.patchDownload
From 4b9ac96fbf66222ae5fca60e5eed02209b42d1c8 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Wed, 15 Jan 2020 12:42:58 +0000
Subject: [PATCH v4 2/2] [POC] Change measuring method of wait event time from
 INSTR_TIME to rdtsc.

This patch changes the measuring method of wait event time from INSTR_TIME
(which uses gettimeofday or clock_gettime) to rdtsc. This might reduce the
measurement overhead.

Support for converting clock cycles to actual time and for error handling is
not currently implemented.
---
 src/include/pgstat.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index f90bb44..58fa1f7 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -432,7 +432,7 @@ typedef struct PgStat_WaitAccumEntry
 {
 	uint32			wait_event_info;
 	PgStat_Counter	calls;
-	instr_time		times;
+	uint64			times;
 } PgStat_WaitAccumEntry;
 
 /* ----------
@@ -1269,7 +1269,7 @@ typedef struct PgStat_FunctionCallUsage
 } PgStat_FunctionCallUsage;
 
 extern WAHash *wa_hash;
-extern instr_time waitStart;
+extern uint64 waitStart;
 
 /* ----------
  * GUC parameters
@@ -1392,7 +1392,7 @@ pgstat_report_waitaccum_start()
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(waitStart);
+		waitStart = rdtsc();
 	}
 }
 
@@ -1400,15 +1400,15 @@ static inline void
 pgstat_report_waitaccum_end(uint32 wait_event_info)
 {
 	PgStat_WaitAccumEntry *entry;
-	instr_time  diff;
+	uint64		diff = 0;
 
 	if (wa_hash == NULL)
 		return;
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(diff);
-		INSTR_TIME_SUBTRACT(diff, waitStart);
+		diff = rdtsc();
+		diff -= waitStart;
 	}
 
 	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
@@ -1423,7 +1423,7 @@ pgstat_report_waitaccum_end(uint32 wait_event_info)
 	entry->calls++;
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_ADD(entry->times, diff);
+		entry->times += diff;
 	}
 }
 
-- 
1.8.3.1

#47Pavel Stehule
pavel.stehule@gmail.com
In reply to: Imai Yoshikazu (#46)
1 attachment(s)
Re: [Proposal] Add accumulated statistics for wait event

Hi

On Wed, 15 Jan 2020 at 14:15, Imai Yoshikazu <yoshikazu_i443@live.jp>
wrote:

On 2020/01/13 4:11, Pavel Stehule wrote:

The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: not tested
Documentation: tested, passed

I like this patch because I used similar functionality some years ago,
very successfully. The implementation is fairly simple, and the method
used should produce valid results.

Thanks for your review!

The potential problem is the performance impact. Very early tests show an
impact of about 3% in the worst case, but I'll try to repeat these tests.

Yes, the performance impact is the main concern. I want to know how it
affects performance across various test cases and environments.

There is some trailing whitespace and there are some useless tabs.

The new status of this patch is: Waiting on Author

I attach v4 patches, which remove that trailing whitespace and those
useless tabs.

Today I ran 120 five-minute pgbench tests to measure the impact of this
patch. The results are attached.

The test script:

PSQL="/usr/local/pgsql/bin/psql"
PGBENCH="/usr/local/pgsql/bin/pgbench"
export PGDATABASE=postgres

echo "******* START *******" > ~/result.txt

for i in 1 5 10 50 100
do
echo "scale factor $i" >> ~/result.txt

$PSQL -c "create database bench$i"
$PGBENCH -i -s $i "bench$i"

for c in 1 5 10 50
do
$PGBENCH -c $c -T 300 "bench$i" >> ~/result.txt
done

$PSQL -c "vacuum full" "bench$i"
$PSQL -c "vacuum analyze" "bench$i"

for c in 1 5 10 50
do
$PGBENCH -S -c $c -T 300 "bench$i" >> ~/result.txt
done

$PSQL -c "drop database bench$i"
done

Tested on a computer with 4 CPUs and 8 GB RAM; configuration: shared_buffers
2GB, work_mem 20MB.

The result is interesting: when I run pgbench in R/W mode, I get +/- 1%
changes in performance, regardless of whether time tracking is active
(tested on Linux). In this mode the new code is not on the critical path.

More interesting results come from the read-only tests, where some larger
differences are visible.

For scale 5 and 50 users, enabling time tracking increased performance by
about 12% (I got the same result for scale/users 10/50); in the other
direction, the patched build without time tracking decreased performance by
about 10% for 50/50 and 100/5.

It looks like, for scales higher than 5 and 50 users, the results are not
very stable (for read-only tests; I repeated the tests), and the overhead
there is about 10%, from 60K tps down to 55K tps. Maybe I hit a hardware
limit (it is running with 4 CPUs).

Thanks to Tomas Vondra and 2ndQuadrant for the testing hardware.

Regards

Pavel
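
A snapshot like the one below can presumably be obtained from the new view
with a query along these lines (an assumption; the exact statement used is
not shown in the message):

SELECT wait_event_type, wait_event, calls, times
  FROM pg_stat_waitaccum
 ORDER BY times DESC;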

wait_event_type | wait_event | calls | times
-----------------+-----------------------+------------+--------------
Client | ClientRead | 1489681408 | 221616362961
Lock | transactionid | 103113369 | 71918794185
LWLock | WALWriteLock | 104781468 | 20865855903
Lock | tuple | 21323744 | 15800875242
IO | WALSync | 50862170 | 8666988491
LWLock | lock_manager | 18415423 | 575308266
IO | WALWrite | 51482764 | 205775892
LWLock | buffer_content | 15385387 | 168446128
LWLock | wal_insert | 1502092 | 90019731
IPC | ProcArrayGroupUpdate | 178238 | 46527364
LWLock | ProcArrayLock | 587356 | 13298246
IO | DataFileExtend | 2715557 | 11615216
IPC | ClogGroupUpdate | 54319 | 10622013
IO | DataFileRead | 5805298 | 9596545
IO | SLRURead | 9518930 | 7166217
LWLock | CLogControlLock | 746759 | 6783602


Attachments:

pgbench.odsapplication/vnd.oasis.opendocument.spreadsheet; name=pgbench.odsDownload
#48Tomas Vondra
tomas.vondra@2ndquadrant.com
In reply to: Pavel Stehule (#47)
Re: [Proposal] Add accumulated statistics for wait event

This patch was in WoA, but that was wrong I think - we got a patch on
January 15, followed by a benchmark by Pavel Stehule, so I think it
should still be in "needs review". So I've updated it and moved it to
the next CF.

regards

--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#49Pavel Stehule
pavel.stehule@gmail.com
In reply to: Tomas Vondra (#48)
Re: [Proposal] Add accumulated statistics for wait event

On Sat, 1 Feb 2020 at 12:34, Tomas Vondra <tomas.vondra@2ndquadrant.com>
wrote:

This patch was in WoA, but that was wrong I think - we got a patch on
January 15, followed by a benchmark by Pavel Stehule, so I think it
should still be in "needs review". So I've updated it and moved it to
the next CF.

Currently, this patch needs a rebase.

Pavel


#50imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: Pavel Stehule (#47)
2 attachment(s)
RE: [Proposal] Add accumulated statistics for wait event

On Sat, Feb 1, 2020 at 5:50 AM, Pavel Stehule wrote:

Today I ran 120 five-minute pgbench tests to measure the impact of this patch. The results are attached.

...

Thanks to Tomas Vondra and 2ndQuadrant for the testing hardware.

Thank you for doing a lot of these benchmarks!

The result is interesting: when I run pgbench in R/W mode, I get +/- 1% changes in performance, regardless of whether
time tracking is active (tested on Linux). In this mode the new code is not on the critical path.

It seems the performance difference is big in the case of the read-only tests. The reason is that the write time is long
relative to the processing time of the logic I added in the patch.

It looks like, for scales higher than 5 and 50 users, the results are not very stable (for read-only tests; I repeated the
tests), and the overhead there is about 10%, from 60K tps down to 55K tps. Maybe I hit a hardware limit (it is running with 4 CPUs).

Yes, I suspect some other bottleneck may have been hit, which makes the results unstable. However, it may be better to
investigate what actually happened and why performance increased/decreased by over 10%. I will inspect it.

Also, I attach v5 patches, which are updated to correspond with other committed patches.

--
Yoshikazu Imai

Attachments:

0001-Add-pg_stat_waitaccum-view-v5.patchapplication/octet-stream; name=0001-Add-pg_stat_waitaccum-view-v5.patchDownload
From 919ca45c748eff6f707233b62c735d8e173a1b86 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Wed, 15 Jan 2020 09:13:19 +0000
Subject: [PATCH v4 1/2] Add pg_stat_waitaccum view.

pg_stat_waitaccum shows the count and duration of each wait event.
Each backend/background process counts and measures the time of wait
events in every pgstat_report_wait_start and pgstat_report_wait_end.
They store that info in local variables and send it to the Statistics
Collector, from which it can be retrieved.

To reduce overhead, I implemented a simple static hash instead of a
dynamic hash. I also implemented track_wait_timing, which determines
whether wait event durations are collected or not.

On Windows, this feature might not work correctly, because it
currently initializes local variables in pg_stat_init, which is not
passed to forked processes on Windows.
---
 src/backend/catalog/system_views.sql          |   8 +
 src/backend/postmaster/pgstat.c               | 344 ++++++++++++++++++++++++++
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/adt/pgstatfuncs.c           |  80 ++++++
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          | 123 ++++++++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/test/regress/expected/rules.out           |   5 +
 11 files changed, 598 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index f681aaf..bc4faad 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -980,6 +980,14 @@ CREATE VIEW pg_stat_progress_analyze AS
     FROM pg_stat_get_progress_info('ANALYZE') AS S
         LEFT JOIN pg_database D ON S.datid = D.oid;
 
+CREATE VIEW pg_stat_waitaccum AS
+    SELECT
+		S.wait_event_type AS wait_event_type,
+		S.wait_event AS wait_event,
+		S.calls AS calls,
+		S.times AS times
+	FROM pg_stat_get_waitaccum(NULL) AS S;
+
 CREATE VIEW pg_stat_progress_vacuum AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 7169509..ceddfda 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -123,6 +123,7 @@
  */
 bool		pgstat_track_activities = false;
 bool		pgstat_track_counts = false;
+bool		pgstat_track_wait_timing = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
 int			pgstat_track_activity_query_size = 1024;
 
@@ -153,6 +154,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -255,6 +260,7 @@ static int	localNumBackends = 0;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -280,6 +286,8 @@ static pid_t pgstat_forkexec(void);
 #endif
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
@@ -287,8 +295,11 @@ static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 												 Oid tableoid, bool create);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
+static void pgstat_write_waitaccum_statsfile(FILE *fpout);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
+static bool pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+											FILE *fpin, const char *statfile);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
@@ -324,6 +335,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
@@ -331,6 +343,27 @@ static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *current;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	current = hash->buckets[bucket];
+
+	while (current != NULL)
+	{
+		if (current->key == key)
+			return current->entry;
+
+		current = current->next;
+	}
+
+	return NULL;
+}
+
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -602,6 +635,8 @@ retry2:
 
 	pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+	pgstat_init_waitaccum_hash(&wa_hash);
+
 	return;
 
 startup_failed:
@@ -624,6 +659,75 @@ startup_failed:
 	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *prev;
+	WAEntry *new;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	prev = hash->buckets[bucket];
+
+	while (prev != NULL && prev->next != NULL)
+		prev = prev->next;
+
+	new = &hash->entries[hash->entry_num++];
+	new->key = key;
+	new->entry = MemoryContextAllocZero(TopMemoryContext, (sizeof(PgStat_WaitAccumEntry)));
+
+	if (prev != NULL)
+		prev->next = new;
+	else
+		hash->buckets[bucket] = new;
+
+	return new->entry;
+}
+
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+
+	entry = pgstat_add_wa_entry(hash, wait_event_info);
+	entry->wait_event_info = wait_event_info;
+}
+
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+	uint32 i;
+	int last_tranche_id;
+
+	*hash = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+
+	last_tranche_id = LWLockGetLastTrancheId();
+	for (i = PG_WAIT_LWLOCK + 1; i <= (PG_WAIT_LWLOCK | last_tranche_id); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = (PG_WAIT_LOCK | LOCKTAG_RELATION); i <= (PG_WAIT_LOCK | LOCKTAG_LAST_TYPE); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_BUFFER_PIN; i <= PG_WAIT_BUFFER_PIN; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_ACTIVITY; i <= PG_WAIT_ACTIVITY_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_CLIENT; i <= PG_WAIT_CLIENT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	//do extension stuff
+
+	for (i = PG_WAIT_IPC; i <= PG_WAIT_IPC_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_TIMEOUT; i <= PG_WAIT_TIMEOUT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IO; i <= PG_WAIT_IO_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -904,6 +1008,9 @@ pgstat_report_stat(bool force)
 
 	/* Now, send function statistics */
 	pgstat_send_funcstats();
+
+	/* Send wait accumulative statistics */
+	pgstat_send_waitaccum();
 }
 
 /*
@@ -1334,6 +1441,8 @@ pgstat_reset_shared_counters(const char *target)
 		msg.m_resettarget = RESET_ARCHIVER;
 	else if (strcmp(target, "bgwriter") == 0)
 		msg.m_resettarget = RESET_BGWRITER;
+	else if (strcmp(target, "waitaccum") == 0)
+		msg.m_resettarget = RESET_WAITACCUM;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2618,6 +2727,22 @@ pgstat_fetch_global(void)
 	return &globalStats;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	a pointer to the wait accum statistics struct.
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+	backend_read_statsfile();
+
+	return &waitAccumStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -4410,6 +4535,53 @@ pgstat_send_bgwriter(void)
 	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *
+ * ----------
+ */
+void
+pgstat_send_waitaccum()
+{
+	PgStat_MsgWaitAccum msg;
+	PgStat_WaitAccumEntry *entry;
+	int i;
+
+	if (wa_hash == NULL)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+	msg.m_nentries = 0;
+
+	for (i = 0; i < wa_hash->entry_num; i++)
+	{
+		entry = wa_hash->entries[i].entry;
+
+		/* Send only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Prepare and send the message
+		 */
+		memcpy(&msg.m_entry[msg.m_nentries], entry, sizeof(PgStat_WaitAccumEntry));
+		if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+		{
+			pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+						msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+			msg.m_nentries = 0;
+		}
+
+		/* Clear wait events information. */
+		entry->calls = 0;
+		INSTR_TIME_SET_ZERO(entry->times);
+	}
+
+	if (msg.m_nentries > 0)
+		pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+					msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -4600,6 +4772,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
 					break;
 
+				case PGSTAT_MTYPE_WAITACCUM:
+					pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+					break;
+
 				case PGSTAT_MTYPE_FUNCSTAT:
 					pgstat_recv_funcstat(&msg.msg_funcstat, len);
 					break;
@@ -4868,6 +5044,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
+	pgstat_write_waitaccum_statsfile(fpout);
+
 	/*
 	 * Walk through the database table.
 	 */
@@ -5073,6 +5251,43 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 }
 
 /* ----------
+ * pgstat_write_waitaccum_statsfile() -
+ *		Write the waitAccumStats to the stat file.
+ *
+ * ----------
+ */
+static void
+pgstat_write_waitaccum_statsfile(FILE *fpout)
+{
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			rc;
+	int			i;
+
+	/*
+	 * Walk through the waitaccum hash.
+	 */
+	for (i = 0; i < hash->entry_num; i++)
+	{
+		entry = hash->entries[i].entry;
+
+		/* Write only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Write out the DB entry. We don't write the tables or functions
+		 * pointers, since they're of no use to any other process.
+		 */
+		fputc('D', fpout);
+		rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
+	}
+
+	fputc('E', fpout);
+}
+
+/* ----------
  * pgstat_read_statsfiles() -
  *
  *	Reads in some existing statistics collector files and returns the
@@ -5125,6 +5340,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	memset(&globalStats, 0, sizeof(globalStats));
 	memset(&archiverStats, 0, sizeof(archiverStats));
+	waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext, sizeof(WAHash));
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
@@ -5195,6 +5411,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 		goto done;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&waitAccumStats, fpin, statfile))
+		goto done;
+
 	/*
 	 * We found an existing collector stats file. Read it and put all the
 	 * hashtable entries into place.
@@ -5493,10 +5712,13 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 	PgStat_StatDBEntry dbentry;
 	PgStat_GlobalStats myGlobalStats;
 	PgStat_ArchiverStats myArchiverStats;
+	PgStat_WaitAccumStats myWaitAccumStats;
 	FILE	   *fpin;
 	int32		format_id;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
+	myWaitAccumStats.hash = MemoryContextAllocZero(CurrentMemoryContext, sizeof(WAHash));
+
 	/*
 	 * Try to open the stats file.  As above, anything but ENOENT is worthy of
 	 * complaining about.
@@ -5547,6 +5769,9 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 		return false;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&myWaitAccumStats, fpin, statfile))
+		return false;
+
 	/* By default, we're going to return the timestamp of the global file. */
 	*ts = myGlobalStats.stats_timestamp;
 
@@ -5600,6 +5825,75 @@ done:
 	return true;
 }
 
+/* ----------
+ * pgstat_read_waitaccum_statsfile() -
+ *
+ *	Reads the waitaccum stats from the file.
+ *	If an error happens when reading file, return false. Otherwise return true.
+ *
+ * ----------
+ */
+static bool
+pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+								FILE *fpin, const char *statfile)
+{
+	PgStat_WaitAccumEntry *entry;
+	PgStat_WaitAccumEntry buf;
+	WAHash *hash = stats->hash;
+
+	/*
+	 * Read and put all the hashtable entries into place.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
+				/*
+				 * 'D'	A PgStat_WaitAccumEntry struct describing a wait
+				 * event follows.
+				 */
+			case 'D':
+				if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+									 != sizeof(PgStat_WaitAccumEntry))
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				entry = pgstat_get_wa_entry(hash, buf.wait_event_info);
+
+				if (entry)
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				/*
+				 * Add to the waitaccum hash
+				 */
+				entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+				memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+
+				break;
+
+			case 'E':
+				return true;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				return false;
+		}
+	}
+
+	return true;
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -6109,7 +6403,20 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_WAITACCUM)
+	{
+		PgStat_WaitAccumEntry *entry;
+		WAHash *hash = waitAccumStats.hash;
+		int i;
+
+		for (i = 0; i < hash->entry_num; i++)
+		{
+			entry = hash->entries[i].entry;
 
+			entry->calls = 0;
+			INSTR_TIME_SET_ZERO(entry->times);
+		}
+	}
 	/*
 	 * Presumably the sender of this message validated the target, don't
 	 * complain here if it's not valid
@@ -6289,6 +6596,43 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *	Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+	PgStat_WaitAccumEntry *m_entry = &(msg->m_entry[0]);
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			i;
+
+	/*
+	 * Process all wait event entries in the message.
+	 */
+	for (i = 0; i < msg->m_nentries; i++, m_entry++)
+	{
+		entry = pgstat_get_wa_entry(hash, m_entry->wait_event_info);
+
+		if (!entry)
+		{
+			entry = pgstat_add_wa_entry(hash, m_entry->wait_event_info);
+			memcpy(entry, m_entry, sizeof(PgStat_WaitAccumEntry));
+		}
+		else
+		{
+			/*
+			 * Otherwise add the values to the existing entry.
+			 */
+			entry->calls += m_entry->calls;
+			INSTR_TIME_ADD(entry->times, m_entry->times);
+		}
+	}
+}
+
+/* ----------
  * pgstat_recv_recoveryconflict() -
  *
  *	Process a RECOVERYCONFLICT message.
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 4c14e51..229993e 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -595,6 +595,25 @@ LWLockNewTrancheId(void)
 }
 
 /*
+ * Get the last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
+/*
  * Register a tranche ID in the lookup table for the current process.  This
  * routine will save a pointer to the tranche name passed as an argument,
  * so the name should be allocated in a backend-lifetime context
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 7e6a3c1..de0e1bf 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1982,3 +1982,83 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS)
 	/* Returns the record as Datum */
 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	PgStat_WaitAccumStats *waitaccum_stats;
+	PgStat_WaitAccumEntry *entry;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Fetch the accumulated wait event statistics */
+	waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+	{
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		entry = waitaccum_stats->hash->entries[i].entry;
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8228e1f..8b07a97 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1483,6 +1483,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for wait events."),
+			NULL
+		},
+		&pgstat_track_wait_timing,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e1048c0..3a99182 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -570,6 +570,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wait_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 226c904..a56baf3 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5179,6 +5179,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,sslcompression,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc,leader_pid}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '2228',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index aecb601..2f48282 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -59,6 +59,7 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_ANALYZE,
 	PGSTAT_MTYPE_ARCHIVER,
 	PGSTAT_MTYPE_BGWRITER,
+	PGSTAT_MTYPE_WAITACCUM,
 	PGSTAT_MTYPE_FUNCSTAT,
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -119,7 +120,8 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER
+	RESET_BGWRITER,
+	RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
@@ -423,6 +425,33 @@ typedef struct PgStat_MsgBgWriter
 } PgStat_MsgBgWriter;
 
 /* ----------
+ * PgStat_WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+	uint32			wait_event_info;
+	PgStat_Counter	calls;
+	instr_time		times;
+} PgStat_WaitAccumEntry;
+
+/* ----------
+ * PgStat_MsgWaitAccum	Sent by backend/background's process to update statistics.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES	\
+	((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+	 / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+	PgStat_MsgHdr m_hdr;
+
+	int m_nentries;
+	PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
+
+/* ----------
  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
  * ----------
  */
@@ -564,6 +593,7 @@ typedef union PgStat_Msg
 	PgStat_MsgAnalyze msg_analyze;
 	PgStat_MsgArchiver msg_archiver;
 	PgStat_MsgBgWriter msg_bgwriter;
+	PgStat_MsgWaitAccum msg_waitaccum;
 	PgStat_MsgFuncstat msg_funcstat;
 	PgStat_MsgFuncpurge msg_funcpurge;
 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -581,7 +611,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9E
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -711,6 +741,30 @@ typedef struct PgStat_GlobalStats
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
+typedef struct WAEntry
+{
+	int key;
+	PgStat_WaitAccumEntry *entry;
+	struct WAEntry *next;
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461
+
+typedef struct WAHash
+{
+	WAEntry entries[WA_BUCKET_SIZE];
+	WAEntry *buckets[WA_BUCKET_SIZE];
+	int entry_num;
+} WAHash;
+
+/*
+ * WaitAccum statistics kept in the stats collector
+ */
+typedef struct PgStat_WaitAccumStats
+{
+	WAHash *hash;
+} PgStat_WaitAccumStats;
+
 
 /* ----------
  * Backend types
@@ -787,6 +841,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -808,6 +864,8 @@ typedef enum
 	WAIT_EVENT_GSS_OPEN_SERVER,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_GSS_OPEN_SERVER
+
 /* ----------
  * Wait Events - IPC
  *
@@ -856,6 +914,8 @@ typedef enum
 	WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_SYNC_REP
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -869,6 +929,8 @@ typedef enum
 	WAIT_EVENT_RECOVERY_APPLY_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_RECOVERY_APPLY_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -948,6 +1010,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_WAL_WRITE
+
 /* ----------
  * Command type for progress reporting purposes
  * ----------
@@ -1205,6 +1269,8 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
+extern WAHash *wa_hash;
+extern instr_time waitStart;
 
 /* ----------
  * GUC parameters
@@ -1212,6 +1278,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern PGDLLIMPORT bool pgstat_track_activities;
 extern PGDLLIMPORT bool pgstat_track_counts;
+extern PGDLLIMPORT bool pgstat_track_wait_timing;
 extern PGDLLIMPORT int pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
@@ -1229,6 +1296,7 @@ extern PgStat_MsgBgWriter BgWriterStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
 /* ----------
  * Functions called from postmaster
  * ----------
@@ -1316,6 +1384,50 @@ extern char *pgstat_clip_activity(const char *raw_activity);
  * initialized.
  * ----------
  */
+
+static inline void
+pgstat_report_waitaccum_start()
+{
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(waitStart);
+	}
+}
+
+static inline void
+pgstat_report_waitaccum_end(uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+	instr_time  diff;
+
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(diff);
+		INSTR_TIME_SUBTRACT(diff, waitStart);
+	}
+
+	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
+
+	if (!entry)
+	{
+		printf("wait_event_info: %u.\n", wait_event_info);
+		fflush(stdout);
+		return;
+	}
+
+	entry->calls++;
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+}
+
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
 {
@@ -1329,6 +1441,8 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	proc->wait_event_info = wait_event_info;
+
+	pgstat_report_waitaccum_start();
 }
 
 /* ----------
@@ -1348,6 +1462,8 @@ pgstat_report_wait_end(void)
 	if (!pgstat_track_activities || !proc)
 		return;
 
+	pgstat_report_waitaccum_end(proc->wait_event_info);
+
 	/*
 	 * Since this is a four-byte field which is always read and written as
 	 * four-bytes, updates are atomic.
@@ -1355,6 +1471,7 @@ pgstat_report_wait_end(void)
 	proc->wait_event_info = 0;
 }
 
+
 /* nontransactional event counts are simple enough to inline */
 
 #define pgstat_count_heap_scan(rel)									\
@@ -1422,6 +1539,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1436,5 +1554,6 @@ extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8fda8e4..2149c96 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -184,6 +184,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index d217801..751f77a 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 634f825..2b98c4b 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2087,6 +2087,11 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
1.8.3.1

0002-POC-Change-measuring-method-of-wait-event-time-fr-v5.patchapplication/octet-stream; name=0002-POC-Change-measuring-method-of-wait-event-time-fr-v5.patchDownload
From 4f89046235b95c5154a6ce38f6774573163e3ff0 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Wed, 15 Jan 2020 12:42:58 +0000
Subject: [PATCH v4 2/2] [POC] Change measuring method of wait event time from
 INSTR_TIME to rdtsc.

This patch changes the measuring method of wait event time from INSTR_TIME
(which uses gettimeofday or clock_gettime) to rdtsc. This might reduce the
overhead of the measurement itself.

Support for things like converting clock cycles to actual time, or error
handling, is not currently implemented.
---
 src/include/pgstat.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 2f48282..4547bdb 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -432,7 +432,7 @@ typedef struct PgStat_WaitAccumEntry
 {
 	uint32			wait_event_info;
 	PgStat_Counter	calls;
-	instr_time		times;
+	uint64			times;
 } PgStat_WaitAccumEntry;
 
 /* ----------
@@ -1270,7 +1270,7 @@ typedef struct PgStat_FunctionCallUsage
 } PgStat_FunctionCallUsage;
 
 extern WAHash *wa_hash;
-extern instr_time waitStart;
+extern uint64 waitStart;
 
 /* ----------
  * GUC parameters
@@ -1393,7 +1393,7 @@ pgstat_report_waitaccum_start()
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(waitStart);
+		waitStart = rdtsc();
 	}
 }
 
@@ -1401,15 +1401,15 @@ static inline void
 pgstat_report_waitaccum_end(uint32 wait_event_info)
 {
 	PgStat_WaitAccumEntry *entry;
-	instr_time  diff;
+	uint64		diff = 0;
 
 	if (wa_hash == NULL)
 		return;
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(diff);
-		INSTR_TIME_SUBTRACT(diff, waitStart);
+		diff = rdtsc();
+		diff -= waitStart;
 	}
 
 	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
@@ -1424,7 +1424,7 @@ pgstat_report_waitaccum_end(uint32 wait_event_info)
 	entry->calls++;
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_ADD(entry->times, diff);
+		entry->times += diff;
 	}
 }
 
-- 
1.8.3.1

#51Craig Ringer
craig@2ndquadrant.com
In reply to: imai.yoshikazu@fujitsu.com (#50)
Re: [Proposal] Add accumulated statistics for wait event

On Wed, 12 Feb 2020 at 12:36, imai.yoshikazu@fujitsu.com
<imai.yoshikazu@fujitsu.com> wrote:

It seems performance difference is big in case of read only tests. The reason is that write time is relatively longer than the
processing time of the logic I added in the patch.

That's going to be a pretty difficult performance hit to justify.

Can we buffer collected wait events locally and spit the buffer to the
stats collector at convenient moments? We can use a limited buffer
size with an overflow flag, so we degrade the results rather than
falling over or forcing excessive stats reporting at inappropriate
times.
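
For illustration, a minimal sketch of such a per-backend buffer (the names
below are hypothetical, not taken from the posted patches; PostgreSQL's
c.h typedefs uint32/uint64/bool are assumed):

/* Hypothetical fixed-size local buffer, flushed only at convenient moments. */
#define WA_LOCAL_BUF_SIZE 128

typedef struct WaitEventSample
{
	uint32		wait_event_info;
	uint64		calls;
	uint64		total_time;		/* accumulated wait time */
} WaitEventSample;

static WaitEventSample wa_local_buf[WA_LOCAL_BUF_SIZE];
static int	wa_local_used = 0;
static bool wa_local_overflowed = false;	/* results degraded, not lost */

static void
wa_local_record(uint32 wait_event_info, uint64 elapsed)
{
	int			i;

	/* Linear search is acceptable for a small, hot buffer. */
	for (i = 0; i < wa_local_used; i++)
	{
		if (wa_local_buf[i].wait_event_info == wait_event_info)
		{
			wa_local_buf[i].calls++;
			wa_local_buf[i].total_time += elapsed;
			return;
		}
	}

	if (wa_local_used < WA_LOCAL_BUF_SIZE)
	{
		wa_local_buf[wa_local_used].wait_event_info = wait_event_info;
		wa_local_buf[wa_local_used].calls = 1;
		wa_local_buf[wa_local_used].total_time = elapsed;
		wa_local_used++;
	}
	else
		wa_local_overflowed = true;	/* degrade rather than report right now */
}

The flush itself would then run only at convenient moments (for example from
pgstat_report_stat()) and would include the overflow flag, so readers of the
statistics know the numbers are a lower bound.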

I suggest that this is also a good opportunity to add some more
tracepoints to PostgreSQL. The wait events facilities are not very
traceable right now. Exposing some better TRACE_POSTGRESQL_
tracepoints (SDTs) via probes.d would help users collect better
information using external tools like perf, bpftrace and systemtap.
That way we have a zero-overhead-when-unused option that can also be
used to aggregate the information per-query, per-user, etc.
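
As a rough sketch of what that could look like (the probe names below are
hypothetical and do not exist today; the mechanism is the same one the
existing probes in src/backend/utils/probes.d use to generate the
TRACE_POSTGRESQL_* macros, which compile to nothing unless built with
--enable-dtrace):

/*
 * Hypothetical additions to src/backend/utils/probes.d:
 *
 *     probe wait__event__start(unsigned int);
 *     probe wait__event__done(unsigned int);
 *
 * These would generate TRACE_POSTGRESQL_WAIT_EVENT_START() and
 * TRACE_POSTGRESQL_WAIT_EVENT_DONE(), which could bracket the existing
 * wait reporting, e.g.:
 */
static inline void
pgstat_report_wait_start(uint32 wait_event_info)
{
	volatile PGPROC *proc = MyProc;

	if (!pgstat_track_activities || !proc)
		return;

	proc->wait_event_info = wait_event_info;
	TRACE_POSTGRESQL_WAIT_EVENT_START(wait_event_info);
}

perf, bpftrace or systemtap can then attach to those markers and aggregate
the firings per query, per user, and so on, entirely outside the server.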

(I really need to add a bunch more tracepoints to make this easier...)

--
Craig Ringer http://www.2ndQuadrant.com/
2ndQuadrant - PostgreSQL Solutions for the Enterprise

#52imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: Craig Ringer (#51)
RE: [Proposal] Add accumulated statistics for wait event

On Wed, Feb 12, 2020 at 5:42 AM, Craig Ringer wrote:

It seems performance difference is big in case of read only tests. The reason is that write time is relatively longer than the
processing time of the logic I added in the patch.

That's going to be a pretty difficult performance hit to justify.

Can we buffer collected wait events locally and spit the buffer to the
stats collector at convenient moments? We can use a limited buffer
size with an overflow flag, so we degrade the results rather than
falling over or forcing excessive stats reporting at inappropriate
times.

IIUC, currently each backend collects wait events locally. When a
backend goes idle (telling the frontend that it is ready for query), it
reports its wait event statistics to the stats collector. The interval
between reports is at least PGSTAT_STAT_INTERVAL (default 500ms). Each
backend also reports when it exits.
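
(For reference, the throttling I am describing is roughly this check at the
top of pgstat_report_stat() in pgstat.c; this is a paraphrase of the existing
code, not something added by the patches:)

void
pgstat_report_stat(bool force)
{
	static TimestampTz last_report = 0;
	TimestampTz now;

	/*
	 * Don't send anything unless at least PGSTAT_STAT_INTERVAL msec have
	 * passed since the last report, or the caller forces it.
	 */
	now = GetCurrentTransactionStopTimestamp();
	if (!force &&
		!TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
		return;
	last_report = now;

	/*
	 * ... table and function stats (and, with the 0001 patch, the wait
	 * event stats via pgstat_send_waitaccum()) are collected and sent
	 * here ...
	 */
}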

So if we run the read-only test with 50 clients, each of the 50 backends
reports wait event statistics to the stats collector roughly every 500ms.
If that is what causes the performance degradation, we can improve
performance by letting backends report their statistics, for example,
only at backend exit.

(I think I can easily test this by building postgres with
PGSTAT_STAT_INTERVAL set to a value >> 500ms.)

I suggest that this is also a good opportunity to add some more
tracepoints to PostgreSQL. The wait events facilities are not very
traceable right now.

Does that mean we would add TRACE_POSTGRESQL_ tracepoints to every
pgstat_report_wait_start/end?

That way we have a zero-overhead-when-unused option that can also be
used to aggregate the information per-query, per-user, etc.

I see. That way we have no overhead when DTrace is not enabled, and what
we can measure is more customizable.

I am also curious what the overhead would be if we implemented wait event
statistics with DTrace scripts, though I can't really estimate it because
I haven't used DTrace.

--
Yoshikazu Imai

#53imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: imai.yoshikazu@fujitsu.com (#52)
2 attachment(s)
RE: [Proposal] Add accumulated statistics for wait event

On Fri, Feb 14, 2020 at 11:59 AM, 王胜利 wrote:

I am glad to know you are working on the PG accumulated statistics feature,
and I am interested in it.
I see the two patch files you made; can you let me know which branch of the
PG code they are based on?

When I use this: https://github.com/postgres/postgres/commits/master, and apply these patches, I get some errors.

Thanks to Wang's mail, I noticed my 0002 patch has been wrong since v3.

Here, I attach the corrected patches.

Also, I will begin doing some benchmarks with a higher scale factor and a
larger number of users, and will try to change the stats reporting
implementation so that it does not affect performance; I could not start
this earlier because of other tasks.

--
Yoshikazu Imai

Attachments:

0001-Add-pg_stat_waitaccum-view-v6.patchapplication/octet-stream; name=0001-Add-pg_stat_waitaccum-view-v6.patchDownload
From 5bd0684dbb24dd5c76b8d35d1da33b247513a8b0 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Tue, 25 Feb 2020 07:24:01 +0000
Subject: [PATCH v6 1/2] Add pg_stat_waitaccum view.

pg_stat_waitaccum shows the count and accumulated duration of each wait
event. Each backend/background process counts wait events and measures
their time in every pgstat_report_wait_start and pgstat_report_wait_end,
stores that information in local variables, and sends it to the
statistics collector, from which it can then be read.

To reduce overhead, I implemented a simple fixed-size hash instead of a
dynamic hash. I also implemented track_wait_timing, which determines
whether wait event durations are collected.

On Windows, this feature might not work correctly, because it currently
initializes the local variables in pgstat_init(), and those variables are
not inherited by child processes on Windows.
---
 src/backend/catalog/system_views.sql          |   8 +
 src/backend/postmaster/pgstat.c               | 344 ++++++++++++++++++++++++++
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/adt/pgstatfuncs.c           |  80 ++++++
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          | 123 ++++++++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/test/regress/expected/rules.out           |   5 +
 11 files changed, 598 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index f681aaf..bc4faad 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -980,6 +980,14 @@ CREATE VIEW pg_stat_progress_analyze AS
     FROM pg_stat_get_progress_info('ANALYZE') AS S
         LEFT JOIN pg_database D ON S.datid = D.oid;
 
+CREATE VIEW pg_stat_waitaccum AS
+    SELECT
+		S.wait_event_type AS wait_event_type,
+		S.wait_event AS wait_event,
+		S.calls AS calls,
+		S.times AS times
+	FROM pg_stat_get_waitaccum(NULL) AS S;
+
 CREATE VIEW pg_stat_progress_vacuum AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 59dc4f3..5c2f125 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -123,6 +123,7 @@
  */
 bool		pgstat_track_activities = false;
 bool		pgstat_track_counts = false;
+bool		pgstat_track_wait_timing = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
 int			pgstat_track_activity_query_size = 1024;
 
@@ -153,6 +154,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -255,6 +260,7 @@ static int	localNumBackends = 0;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -280,6 +286,8 @@ static pid_t pgstat_forkexec(void);
 #endif
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
@@ -287,8 +295,11 @@ static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 												 Oid tableoid, bool create);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
+static void pgstat_write_waitaccum_statsfile(FILE *fpout);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
+static bool pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+											FILE *fpin, const char *statfile);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
@@ -324,6 +335,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
@@ -331,6 +343,27 @@ static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *current;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	current = hash->buckets[bucket];
+
+	while (current != NULL)
+	{
+		if (current->key == key)
+			return current->entry;
+
+		current = current->next;
+	}
+
+	return NULL;
+}
+
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -602,6 +635,8 @@ retry2:
 
 	pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+	pgstat_init_waitaccum_hash(&wa_hash);
+
 	return;
 
 startup_failed:
@@ -624,6 +659,75 @@ startup_failed:
 	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *prev;
+	WAEntry *new;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	prev = hash->buckets[bucket];
+
+	while (prev != NULL && prev->next != NULL)
+		prev = prev->next;
+
+	new = &hash->entries[hash->entry_num++];
+	new->key = key;
+	new->entry = MemoryContextAllocZero(TopMemoryContext, (sizeof(PgStat_WaitAccumEntry)));
+
+	if (prev != NULL)
+		prev->next = new;
+	else
+		hash->buckets[bucket] = new;
+
+	return new->entry;
+}
+
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+
+	entry = pgstat_add_wa_entry(hash, wait_event_info);
+	entry->wait_event_info = wait_event_info;
+}
+
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+	uint32 i;
+	int last_tranche_id;
+
+	*hash = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+
+	last_tranche_id = LWLockGetLastTrancheId();
+	for (i = PG_WAIT_LWLOCK + 1; i <= (PG_WAIT_LWLOCK | last_tranche_id); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = (PG_WAIT_LOCK | LOCKTAG_RELATION); i <= (PG_WAIT_LOCK | LOCKTAG_LAST_TYPE); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_BUFFER_PIN; i <= PG_WAIT_BUFFER_PIN; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_ACTIVITY; i <= PG_WAIT_ACTIVITY_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_CLIENT; i <= PG_WAIT_CLIENT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	/* TODO: add entries for wait events under PG_WAIT_EXTENSION */
+
+	for (i = PG_WAIT_IPC; i <= PG_WAIT_IPC_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_TIMEOUT; i <= PG_WAIT_TIMEOUT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IO; i <= PG_WAIT_IO_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -904,6 +1008,9 @@ pgstat_report_stat(bool force)
 
 	/* Now, send function statistics */
 	pgstat_send_funcstats();
+
+	/* Send wait accumulative statistics */
+	pgstat_send_waitaccum();
 }
 
 /*
@@ -1334,6 +1441,8 @@ pgstat_reset_shared_counters(const char *target)
 		msg.m_resettarget = RESET_ARCHIVER;
 	else if (strcmp(target, "bgwriter") == 0)
 		msg.m_resettarget = RESET_BGWRITER;
+	else if (strcmp(target, "waitaccum") == 0)
+		msg.m_resettarget = RESET_WAITACCUM;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2618,6 +2727,22 @@ pgstat_fetch_global(void)
 	return &globalStats;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	a pointer to the wait accum statistics struct.
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+	backend_read_statsfile();
+
+	return &waitAccumStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -4410,6 +4535,53 @@ pgstat_send_bgwriter(void)
 	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *
+ * ----------
+ */
+void
+pgstat_send_waitaccum()
+{
+	PgStat_MsgWaitAccum msg;
+	PgStat_WaitAccumEntry *entry;
+	int i;
+
+	if (wa_hash == NULL)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+	msg.m_nentries = 0;
+
+	for (i = 0; i < wa_hash->entry_num; i++)
+	{
+		entry = wa_hash->entries[i].entry;
+
+		/* Send only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Prepare and send the message
+		 */
+		memcpy(&msg.m_entry[msg.m_nentries], entry, sizeof(PgStat_WaitAccumEntry));
+		if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+		{
+			pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+						msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+			msg.m_nentries = 0;
+		}
+
+		/* Clear wait events information. */
+		entry->calls = 0;
+		INSTR_TIME_SET_ZERO(entry->times);
+	}
+
+	if (msg.m_nentries > 0)
+		pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+					msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -4600,6 +4772,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
 					break;
 
+				case PGSTAT_MTYPE_WAITACCUM:
+					pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+					break;
+
 				case PGSTAT_MTYPE_FUNCSTAT:
 					pgstat_recv_funcstat(&msg.msg_funcstat, len);
 					break;
@@ -4868,6 +5044,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
 	(void) rc;					/* we'll check for error with ferror */
 
+	pgstat_write_waitaccum_statsfile(fpout);
+
 	/*
 	 * Walk through the database table.
 	 */
@@ -5073,6 +5251,43 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 }
 
 /* ----------
+ * pgstat_write_waitaccum_statsfile() -
+ *		Write the waitAccumStats to the stat file.
+ *
+ * ----------
+ */
+static void
+pgstat_write_waitaccum_statsfile(FILE *fpout)
+{
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			rc;
+	int			i;
+
+	/*
+	 * Walk through the waitaccum hash.
+	 */
+	for (i = 0; i < hash->entry_num; i++)
+	{
+		entry = hash->entries[i].entry;
+
+		/* Write only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Write out the wait event entry.  The entry contains no pointers,
+		 * so it can be written to the stats file as-is.
+		 */
+		fputc('D', fpout);
+		rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
+	}
+
+	fputc('E', fpout);
+}
+
+/* ----------
  * pgstat_read_statsfiles() -
  *
  *	Reads in some existing statistics collector files and returns the
@@ -5125,6 +5340,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	 */
 	memset(&globalStats, 0, sizeof(globalStats));
 	memset(&archiverStats, 0, sizeof(archiverStats));
+	waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext, sizeof(WAHash));
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
@@ -5195,6 +5411,9 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 		goto done;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&waitAccumStats, fpin, statfile))
+		goto done;
+
 	/*
 	 * We found an existing collector stats file. Read it and put all the
 	 * hashtable entries into place.
@@ -5493,10 +5712,13 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 	PgStat_StatDBEntry dbentry;
 	PgStat_GlobalStats myGlobalStats;
 	PgStat_ArchiverStats myArchiverStats;
+	PgStat_WaitAccumStats myWaitAccumStats;
 	FILE	   *fpin;
 	int32		format_id;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
+	myWaitAccumStats.hash = MemoryContextAllocZero(CurrentMemoryContext, sizeof(WAHash));
+
 	/*
 	 * Try to open the stats file.  As above, anything but ENOENT is worthy of
 	 * complaining about.
@@ -5547,6 +5769,9 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 		return false;
 	}
 
+	if(!pgstat_read_waitaccum_statsfile(&myWaitAccumStats, fpin, statfile))
+		return false;
+
 	/* By default, we're going to return the timestamp of the global file. */
 	*ts = myGlobalStats.stats_timestamp;
 
@@ -5600,6 +5825,75 @@ done:
 	return true;
 }
 
+/* ----------
+ * pgstat_read_waitaccum_statsfile() -
+ *
+ *	Reads the waitaccum stats from the file.
+ *	If an error happens when reading file, return false. Otherwise return true.
+ *
+ * ----------
+ */
+static bool
+pgstat_read_waitaccum_statsfile(PgStat_WaitAccumStats *stats,
+								FILE *fpin, const char *statfile)
+{
+	PgStat_WaitAccumEntry *entry;
+	PgStat_WaitAccumEntry buf;
+	WAHash *hash = stats->hash;
+
+	/*
+	 * Read and put all the hashtable entries into place.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
+				/*
+				 * 'D'	A PgStat_WaitAccumEntry struct describing a wait
+				 * event follows.
+				 */
+			case 'D':
+				if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+									 != sizeof(PgStat_WaitAccumEntry))
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				entry = pgstat_get_wa_entry(hash, buf.wait_event_info);
+
+				if (entry)
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					return false;
+				}
+
+				/*
+				 * Add to the waitaccum hash
+				 */
+				entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+				memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+
+				break;
+
+			case 'E':
+				return true;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				return false;
+		}
+	}
+
+	return true;
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -6109,7 +6403,20 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 		memset(&archiverStats, 0, sizeof(archiverStats));
 		archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_WAITACCUM)
+	{
+		PgStat_WaitAccumEntry *entry;
+		WAHash *hash = waitAccumStats.hash;
+		int i;
+
+		for (i = 0; i < hash->entry_num; i++)
+		{
+			entry = hash->entries[i].entry;
 
+			entry->calls = 0;
+			INSTR_TIME_SET_ZERO(entry->times);
+		}
+	}
 	/*
 	 * Presumably the sender of this message validated the target, don't
 	 * complain here if it's not valid
@@ -6289,6 +6596,43 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *	Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+	PgStat_WaitAccumEntry *m_entry = &(msg->m_entry[0]);
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			i;
+
+	/*
+	 * Process all wait event entries in the message.
+	 */
+	for (i = 0; i < msg->m_nentries; i++, m_entry++)
+	{
+		entry = pgstat_get_wa_entry(hash, m_entry->wait_event_info);
+
+		if (!entry)
+		{
+			entry = pgstat_add_wa_entry(hash, m_entry->wait_event_info);
+			memcpy(entry, m_entry, sizeof(PgStat_WaitAccumEntry));
+		}
+		else
+		{
+			/*
+			 * Otherwise add the values to the existing entry.
+			 */
+			entry->calls += m_entry->calls;
+			INSTR_TIME_ADD(entry->times, m_entry->times);
+		}
+	}
+}
+
+/* ----------
  * pgstat_recv_recoveryconflict() -
  *
  *	Process a RECOVERYCONFLICT message.
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 4c14e51..229993e 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -595,6 +595,25 @@ LWLockNewTrancheId(void)
 }
 
 /*
+ * Get the last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
+/*
  * Register a tranche ID in the lookup table for the current process.  This
  * routine will save a pointer to the tranche name passed as an argument,
  * so the name should be allocated in a backend-lifetime context
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 7e6a3c1..de0e1bf 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1982,3 +1982,83 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS)
 	/* Returns the record as Datum */
 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	PgStat_WaitAccumStats *waitaccum_stats;
+	PgStat_WaitAccumEntry *entry;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Fetch the accumulated wait event statistics */
+	waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+	{
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		entry = waitaccum_stats->hash->entries[i].entry;
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8228e1f..8b07a97 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1483,6 +1483,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for wait events."),
+			NULL
+		},
+		&pgstat_track_wait_timing,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e1048c0..3a99182 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -570,6 +570,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wait_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index eb3c1a8..7b1b3d9 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5179,6 +5179,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,sslcompression,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc,leader_pid}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '2228',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 3a65a51..e5dbcb4 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -59,6 +59,7 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_ANALYZE,
 	PGSTAT_MTYPE_ARCHIVER,
 	PGSTAT_MTYPE_BGWRITER,
+	PGSTAT_MTYPE_WAITACCUM,
 	PGSTAT_MTYPE_FUNCSTAT,
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -119,7 +120,8 @@ typedef struct PgStat_TableCounts
 typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
-	RESET_BGWRITER
+	RESET_BGWRITER,
+	RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
@@ -423,6 +425,33 @@ typedef struct PgStat_MsgBgWriter
 } PgStat_MsgBgWriter;
 
 /* ----------
+ * PgStat_WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+	uint32			wait_event_info;
+	PgStat_Counter	calls;
+	instr_time		times;
+} PgStat_WaitAccumEntry;
+
+/* ----------
+ * PgStat_MsgWaitAccum	Sent by backend/background's process to update statistics.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES	\
+	((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+	 / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+	PgStat_MsgHdr m_hdr;
+
+	int m_nentries;
+	PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
+
+/* ----------
  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
  * ----------
  */
@@ -564,6 +593,7 @@ typedef union PgStat_Msg
 	PgStat_MsgAnalyze msg_analyze;
 	PgStat_MsgArchiver msg_archiver;
 	PgStat_MsgBgWriter msg_bgwriter;
+	PgStat_MsgWaitAccum msg_waitaccum;
 	PgStat_MsgFuncstat msg_funcstat;
 	PgStat_MsgFuncpurge msg_funcpurge;
 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -581,7 +611,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9D
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9E
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -711,6 +741,30 @@ typedef struct PgStat_GlobalStats
 	TimestampTz stat_reset_timestamp;
 } PgStat_GlobalStats;
 
+typedef struct WAEntry
+{
+	int key;
+	PgStat_WaitAccumEntry *entry;
+	struct WAEntry *next;
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461
+
+typedef struct WAHash
+{
+	WAEntry entries[WA_BUCKET_SIZE];
+	WAEntry *buckets[WA_BUCKET_SIZE];
+	int entry_num;
+} WAHash;
+
+/*
+ * WaitAccum statistics kept in the stats collector
+ */
+typedef struct PgStat_WaitAccumStats
+{
+	WAHash *hash;
+} PgStat_WaitAccumStats;
+
 
 /* ----------
  * Backend types
@@ -787,6 +841,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -808,6 +864,8 @@ typedef enum
 	WAIT_EVENT_WAL_SENDER_WRITE_DATA,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_GSS_OPEN_SERVER
+
 /* ----------
  * Wait Events - IPC
  *
@@ -856,6 +914,8 @@ typedef enum
 	WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_SYNC_REP
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -869,6 +929,8 @@ typedef enum
 	WAIT_EVENT_RECOVERY_APPLY_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_RECOVERY_APPLY_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -948,6 +1010,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_WAL_WRITE
+
 /* ----------
  * Command type for progress reporting purposes
  * ----------
@@ -1205,6 +1269,8 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
+extern WAHash *wa_hash;
+extern instr_time waitStart;
 
 /* ----------
  * GUC parameters
@@ -1212,6 +1278,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern PGDLLIMPORT bool pgstat_track_activities;
 extern PGDLLIMPORT bool pgstat_track_counts;
+extern PGDLLIMPORT bool pgstat_track_wait_timing;
 extern PGDLLIMPORT int pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
 extern char *pgstat_stat_directory;
@@ -1229,6 +1296,7 @@ extern PgStat_MsgBgWriter BgWriterStats;
 extern PgStat_Counter pgStatBlockReadTime;
 extern PgStat_Counter pgStatBlockWriteTime;
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
 /* ----------
  * Functions called from postmaster
  * ----------
@@ -1316,6 +1384,50 @@ extern char *pgstat_clip_activity(const char *raw_activity);
  * initialized.
  * ----------
  */
+
+static inline void
+pgstat_report_waitaccum_start()
+{
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(waitStart);
+	}
+}
+
+static inline void
+pgstat_report_waitaccum_end(uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+	instr_time  diff;
+
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(diff);
+		INSTR_TIME_SUBTRACT(diff, waitStart);
+	}
+
+	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
+
+	if (!entry)
+	{
+		printf("wait_event_info: %u.\n", wait_event_info);
+		fflush(stdout);
+		return;
+	}
+
+	entry->calls++;
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+}
+
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
 {
@@ -1329,6 +1441,8 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	proc->wait_event_info = wait_event_info;
+
+	pgstat_report_waitaccum_start();
 }
 
 /* ----------
@@ -1348,6 +1462,8 @@ pgstat_report_wait_end(void)
 	if (!pgstat_track_activities || !proc)
 		return;
 
+	pgstat_report_waitaccum_end(proc->wait_event_info);
+
 	/*
 	 * Since this is a four-byte field which is always read and written as
 	 * four-bytes, updates are atomic.
@@ -1355,6 +1471,7 @@ pgstat_report_wait_end(void)
 	proc->wait_event_info = 0;
 }
 
+
 /* nontransactional event counts are simple enough to inline */
 
 #define pgstat_count_heap_scan(rel)									\
@@ -1422,6 +1539,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1436,5 +1554,6 @@ extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern int	pgstat_fetch_stat_numbackends(void);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8fda8e4..2149c96 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -184,6 +184,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index d217801..751f77a 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 634f825..2b98c4b 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2087,6 +2087,11 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
1.8.3.1

0002-POC-Change-measuring-method-of-wait-event-time-fr-v6.patchapplication/octet-stream; name=0002-POC-Change-measuring-method-of-wait-event-time-fr-v6.patchDownload
From cba515bd4d23e0b860f8b17a92932a3d7b3ab675 Mon Sep 17 00:00:00 2001
From: "imai.yoshikazu" <imai.yoshikazu@jp.fujitsu.com>
Date: Tue, 25 Feb 2020 07:25:26 +0000
Subject: [PATCH v6 2/2] [POC] Change measuring method of wait event time from
 INSTR_TIME to rdtsc.

This patch changes the measuring method of wait event time from INSTR_TIME
(which uses gettimeofday or clock_gettime) to rdtsc. This might reduce the
overhead of the measurement itself.

Support for things like converting clock cycles to actual time, or error
handling, is not currently implemented.
---
 src/backend/postmaster/pgstat.c      |  8 ++++----
 src/backend/utils/adt/pgstatfuncs.c  |  2 +-
 src/include/pgstat.h                 | 14 +++++++-------
 src/include/portability/instr_time.h | 21 +++++++++++++++++++++
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 5c2f125..19c0017 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -156,7 +156,7 @@ static bool pgStatRunningInCollector = false;
 
 WAHash *wa_hash;
 
-instr_time waitStart;
+uint64 waitStart;
 
 /*
  * Structures in which backends store per-table info that's waiting to be
@@ -4574,7 +4574,7 @@ pgstat_send_waitaccum()
 
 		/* Clear wait events information. */
 		entry->calls = 0;
-		INSTR_TIME_SET_ZERO(entry->times);
+		entry->times = 0;
 	}
 
 	if (msg.m_nentries > 0)
@@ -6414,7 +6414,7 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 			entry = hash->entries[i].entry;
 
 			entry->calls = 0;
-			INSTR_TIME_SET_ZERO(entry->times);
+			entry->times = 0;
 		}
 	}
 	/*
@@ -6627,7 +6627,7 @@ pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
 			 * Otherwise add the values to the existing entry.
 			 */
 			entry->calls += m_entry->calls;
-			INSTR_TIME_ADD(entry->times, m_entry->times);
+			entry->times += m_entry->times;
 		}
 	}
 }
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index de0e1bf..9408c4b 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -2052,7 +2052,7 @@ pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
 
 		values[2] = Int64GetDatum(entry->calls);
 
-		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+		values[3] = UInt64GetDatum(entry->times);
 
 		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
 	}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index e5dbcb4..962144d 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -432,7 +432,7 @@ typedef struct PgStat_WaitAccumEntry
 {
 	uint32			wait_event_info;
 	PgStat_Counter	calls;
-	instr_time		times;
+	uint64			times;
 } PgStat_WaitAccumEntry;
 
 /* ----------
@@ -1270,7 +1270,7 @@ typedef struct PgStat_FunctionCallUsage
 } PgStat_FunctionCallUsage;
 
 extern WAHash *wa_hash;
-extern instr_time waitStart;
+extern uint64 waitStart;
 
 /* ----------
  * GUC parameters
@@ -1393,7 +1393,7 @@ pgstat_report_waitaccum_start()
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(waitStart);
+		waitStart = rdtsc();
 	}
 }
 
@@ -1401,15 +1401,15 @@ static inline void
 pgstat_report_waitaccum_end(uint32 wait_event_info)
 {
 	PgStat_WaitAccumEntry *entry;
-	instr_time  diff;
+	uint64		diff = 0;
 
 	if (wa_hash == NULL)
 		return;
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(diff);
-		INSTR_TIME_SUBTRACT(diff, waitStart);
+		diff = rdtsc();
+		diff -= waitStart;
 	}
 
 	entry = pgstat_get_wa_entry(wa_hash, wait_event_info);
@@ -1424,7 +1424,7 @@ pgstat_report_waitaccum_end(uint32 wait_event_info)
 	entry->calls++;
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_ADD(entry->times, diff);
+		entry->times += diff;
 	}
 }
 
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index d645932..e3929c3 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -57,6 +57,10 @@
 
 #ifndef WIN32
 
+#if defined(__x86_64__) || defined(__i386__)
+#include <x86intrin.h>
+#endif
+
 #ifdef HAVE_CLOCK_GETTIME
 
 /* Use clock_gettime() */
@@ -209,6 +213,8 @@ typedef struct timeval instr_time;
 
 #else							/* WIN32 */
 
+#include <intrin.h>
+
 /* Use QueryPerformanceCounter() */
 
 typedef LARGE_INTEGER instr_time;
@@ -254,3 +260,18 @@ GetTimerFrequency(void)
 	(INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
 
 #endif							/* INSTR_TIME_H */
+
+#ifndef RDTSC_H_
+#define RDTSC_H_
+
+static inline uint64 rdtsc() {
+	uint64 result;
+#if defined(__x86_64__) || defined(__i386__) || defined(WIN32)
+	result = __rdtsc();
+#else
+	result = 0;
+#endif
+	return result;
+}
+
+#endif
-- 
1.8.3.1

#54王胜利
attic79@126.com
In reply to: imai.yoshikazu@fujitsu.com (#53)
Re: [Proposal] Add accumulated statistics for wait event

Thank you for this update! I will try it.

Best regards,
Victor wang

王胜利 (Victor Wang)
Email: attic79@126.com
(Signature customized via NetEase Mail Master)

On 02/25/2020 15:53, imai.yoshikazu@fujitsu.com wrote:
On Fri, Feb 14, 2020 at 11:59 AM, 王胜利 wrote:

I am glad to know you are working on PG accumulated statistics feature, and I am interested on it.
I see these two patch file you made, can you let me know which branch of PG code based?

when I use this: https://github.com/postgres/postgres/commits/master, and apply these patches, report some error.

Thanks for Wang's mail, I noticed my 0002 patch was wrong from v3.

Here, I attach correct patches.

Also I will begin to do some benchmarks with a higher scale and a higher number of
users, and try to change the stats reporting implementation so that it does not affect
performance, which I haven't been able to start because of other tasks.

--
Yoshikazu Imai

#55Kyotaro Horiguchi
horikyota.ntt@gmail.com
In reply to: imai.yoshikazu@fujitsu.com (#53)
Re: [Proposal] Add accumulated statistics for wait event

Hello. I had a brief look at this and have some comments on it.

At Tue, 25 Feb 2020 07:53:26 +0000, "imai.yoshikazu@fujitsu.com" <imai.yoshikazu@fujitsu.com> wrote in

Thanks for Wang's mail, I noticed my 0002 patch was wrong from v3.

Here, I attach correct patches.

Also I will begin to do some benchmarks with a higher scale and a higher number of
users, and try to change the stats reporting implementation so that it does not affect
performance, which I haven't been able to start because of other tasks.

It uses its own hash implementation. Aside from the appropriateness of
having another implementation of an existing tool, in the first place a hash
works well for a wide, sparse and uncertain set of keys. But these keys are in
rather a dense and narrow set with certain and fixed members. It
registers more than 200 entries but the bucket size is 461. That may be
needed to avoid collisions, but it seems a bit too wasteful.

It seems to try to avoid doing needless work when the feature is not
activated by checking "if (wa_hash==NULL)", but the hash is created
and filled with all possible entries regardless of whether
track_wait_timing is on or off. As a result,
pgstat_report_waitaccum_end calls pgstat_get_wa_entry tremendously
frequently. This should be the reason for the recent benchmark result.
I'm not sure such a frequency of hash searching is acceptable even when
the feature is turned on.

I think we need a smarter mapping scheme of events to entries.

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

#56imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: Kyotaro Horiguchi (#55)
RE: [Proposal] Add accumulated statistics for wait event

On Wed, Feb 26, 2020 at 1:39 AM, Kyotaro Horiguchi wrote:

Hello. I had a brief look at this and have some comments on it.

Hi, Horiguchi-san. Thank you for looking at this!

It uses its own hash implementation. Aside from the appropriateness of
having another implementation of an existing tool, in the first place a hash
works well for a wide, sparse and uncertain set of keys. But these keys are in
rather a dense and narrow set with certain and fixed members. It
registers more than 200 entries but the bucket size is 461. That may be
needed to avoid collisions, but it seems a bit too wasteful.

Yes, wait events are grouped and wait event IDs are defined as sequential
numbers, starting from a specified number for each group (like 0x01000000U), so
the keys are in a dense and narrow set.

=====
#define PG_WAIT_LWLOCK 0x01000000U
#define PG_WAIT_LOCK 0x03000000U
#define PG_WAIT_BUFFER_PIN 0x04000000U
#define PG_WAIT_ACTIVITY 0x05000000U
...

typedef enum
{
WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY,
WAIT_EVENT_AUTOVACUUM_MAIN,
...
WAIT_EVENT_WAL_WRITER_MAIN
} WaitEventActivity;
=====

The number 461 is the lowest number which avoids collisions in the hash for the
current wait event set. As you pointed out, there are a few too many unused
entries.

If we prepare an array for each wait class, sized to just fit the number of
wait events in that class, we can store wait event statistics into those
arrays with no waste.
Also, in the hash case we calculate the hash number by "(wait event ID) % (bucket size)",
while in the array case we calculate the array index by "(wait event ID) - (wait event class id)".
The latter is cheaper than the former.

Currently I implement the wait event statistics store as a hash because of its
ease of implementation, but I think it would be good to implement it as arrays
for the above reasons. One concern is that this puts a restriction on wait
events: each one must be defined sequentially within its wait class so that
there are no unused entries in the array.
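
For illustration only (this sketch is not in the attached patches; the struct
and function names are just made up for this mail), the per-class array lookup
described above could look roughly like this:

=====
/*
 * Hypothetical sketch: one array per wait event class, indexed by
 * "(wait event ID) - (class base)" instead of a hash lookup.
 */
typedef struct WaitClassStats
{
	uint32		class_base;		/* e.g. PG_WAIT_ACTIVITY */
	int			nevents;		/* number of wait events in this class */
	PgStat_WaitAccumEntry *entries;	/* array of length nevents */
} WaitClassStats;

static inline PgStat_WaitAccumEntry *
waitclass_get_entry(WaitClassStats *cls, uint32 wait_event_info)
{
	uint32		idx = wait_event_info - cls->class_base;

	if (idx >= (uint32) cls->nevents)
		return NULL;			/* event not covered by this class */
	return &cls->entries[idx];
}
=====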

It seems to try to avoid doing needless work when the feature is not
activated by checking "if (wa_hash==NULL)", but the hash is created
and filled with all possible entries regardless of whether
track_wait_timing is on or off.

This might be bad implementation but there are the cases "wa_hash = NULL"
where pgstat_init() is not called like when executing "initdb". I insert
that check for avoiding unexpected crash in those cases.

I also noticed debug codes exist around that code...I will modify it.

As a result,
pgstat_report_waitaccum_end calls pgstat_get_wa_entry tremendously
frequently. This should be the reason for the recent benchmark result.
I'm not sure such a frequency of hash searching is acceptable even when
the feature is turned on.

The track_wait_timing parameter determines whether we collect wait time.
Regardless of that parameter, we always increment the wait count every time a
wait event happens. I think calling pgstat_get_wa_entry frequently is not
critical to performance. From Pavel's benchmark results, if the
track_wait_timing parameter is off, there is a +/-1.0% performance difference
between patched and unpatched, which is just considered a measurement error.

Thanks
--
Yoshikazu Imai

#57Atsushi Torikoshi
atorik@gmail.com
In reply to: imai.yoshikazu@fujitsu.com (#56)
Re: [Proposal] Add accumulated statistics for wait event

Hi Imai-san,

I feel your 'pg_stat_waitaccum' will help us investigate bottlenecks.
So I'd like to do some benchmarks, but unfortunately the latest v6 patch
no longer applies to HEAD.

Is it possible to share the latest patches?
If not, I'll make v6 applicable to HEAD.

Regards,

--
Atsushi Torikoshi


#58Daniel Gustafsson
daniel@yesql.se
In reply to: imai.yoshikazu@fujitsu.com (#56)
Re: [Proposal] Add accumulated statistics for wait event

Hi,

This patch fails to apply to HEAD, please submit a rebased version. I've
marked this as Waiting on Author.

cheers ./daniel

#59imai.yoshikazu@fujitsu.com
imai.yoshikazu@fujitsu.com
In reply to: Daniel Gustafsson (#58)
RE: [Proposal] Add accumulated statistics for wait event

This patch fails to apply to HEAD, please submit a rebased version. I've
marked this as Waiting on Author.

Sorry for my absence. Unfortunately I couldn't find time to work on this patch in this CF.
I believe I will be back in the next CF to work on this patch and also review other patches.

---
Yoshikazu IMAI

#60Daniel Gustafsson
daniel@yesql.se
In reply to: imai.yoshikazu@fujitsu.com (#59)
Re: [Proposal] Add accumulated statistics for wait event

On 31 Jul 2020, at 07:23, imai.yoshikazu@fujitsu.com wrote:

This patch fails to apply to HEAD, please submit a rebased version. I've
marked this as Waiting on Author.

Sorry for my absence. Unfortunately I couldn't find time to work on this patch in this CF.
I believe I will be back in the next CF to work on this patch and also review other patches.

No worries, it happens. Since the thread has stalled and there is no updated
patch I've marked this entry Returned with Feedback. Please feel free to
re-open a new CF entry if you return to this patch.

cheers ./daniel

#61Jehan-Guillaume de Rorthais
jgdr@dalibo.com
In reply to: Daniel Gustafsson (#60)
3 attachment(s)
Re: [Proposal] Add accumulated statistics for wait event

Hi All,

I faced a few times a situation where a long running query is actually
including the time the backend is waiting for the frontend to fetch all the
rows (see [1]Last time I had such situation was few weeks ago. A customer was reporting a query being randomly slow, running bellow 100ms most of the time and sometime hitting 28s. Long story short, the number of row was the same (10-15k), but the result set size was 9x bigger (1MB vs 9MB). As the same query was running fine from psql, I suspected the frontend was somehow saturated. Tcpdump helped me to compute that the throughput fall to 256kB/s after the first 2MB of data transfert with a very narrow TCP window. I explained to the customer their app probably doesn't pull the rows fast enough and that some buffers were probably saturated on the frontend side, waiting for the app and slowing down the whole transfert. Devels fixed the problem by moving away two fields transformations (unaccent) from their loop fetching the rows. for details). See a sample code fe-time.c and its comments in
attachment to reproduce this behavior.

There's no simple way today to pinpoint the problem in production without
advanced interactive auditing and/or system tools. After studying the
problem, I believe it boils down to tracking the wait event ClientWrite, so I
ended up on this thread.

You might catch some mismatching times thanks to auto_explain as well. Using
the fe-time.c demo code with the following command:

PGDATABASE=postgres PGHOST=::1 time ./fe-time 100

The frontend time is 10s, the query time reported is 3228.631ms, but the last
row was produced after 20.672ms:

LOG: duration: 3228.631 ms plan:
Query Text: SELECT * FROM pgbench_accounts
Seq Scan on pgbench_accounts (time=0.005..20.672 rows=100000 loops=1)

(Note that in contrast with localhost, through the unix socket the
backend-reported query time is always really close to 10s.)
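
If you don't have the attachment at hand, here is a minimal sketch of the idea
behind fe-time.c. This is not the attached file itself, just an illustration
with a simplified query, delay handling and error checking: the client fetches
rows in single-row mode and sleeps between rows, so the backend ends up blocked
on ClientWrite once the socket buffers fill up.

/*
 * Sketch of a deliberately slow libpq client (not the attached fe-time.c).
 * It pulls rows one by one and sleeps between them, which eventually makes
 * the backend block in send() and report the ClientWrite wait event.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <libpq-fe.h>

int
main(int argc, char **argv)
{
    int         delay_us = (argc > 1) ? atoi(argv[1]) : 100;
    PGconn     *conn = PQconnectdb("");     /* uses PG* environment variables */
    PGresult   *res;

    if (PQstatus(conn) != CONNECTION_OK)
    {
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
        return 1;
    }

    PQsendQuery(conn, "SELECT * FROM pgbench_accounts");
    PQsetSingleRowMode(conn);

    while ((res = PQgetResult(conn)) != NULL)
    {
        if (PQresultStatus(res) == PGRES_SINGLE_TUPLE)
            usleep(delay_us);   /* simulate a frontend slow to consume rows */
        PQclear(res);
    }

    PQfinish(conn);
    return 0;
}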

I rebased the existing patch (see attachment) to look at the ClientWrite
wait event for this exact query:

# SELECT wait_event, calls, times
FROM pg_stat_get_waitaccum(NULL)
WHERE wait_event = 'ClientWrite';

  wait_event | calls |  times
-------------+-------+---------
 ClientWrite |     4 | 3132266

The "time" is expressed as µs in the patch, so 3132.266ms of the total
3228.631ms is spent sending the result to the frontend. I'm not sure where are
the missing 75ms.

The pg_wait_sampling extension might help, but it requires a production restart
to install before it can be enabled. Whether the solution is sampling or
cumulative, an in-core and hot-switchable solution would be much more
convenient. But anyway, looking at pg_wait_sampling, we have a clear suspect
as well for the latter query run:

# SELECT event, count
FROM pg_wait_sampling_profile
WHERE queryid=4045741516911800313;

    event    | count
-------------+-------
 ClientWrite |   309

The default profile period of pg_wait_sampling being 10ms, we can roughly
estimate the ClientWrite time at around 309 x 10ms = 3090ms. Note that this is
close enough, because we know the 3132266µs were accumulated among only 4 large
wait events.

Finishing below.

On Mon, 3 Aug 2020 00:00:40 +0200
Daniel Gustafsson <daniel@yesql.se> wrote:

On 31 Jul 2020, at 07:23, imai.yoshikazu@fujitsu.com wrote:

This patch fails to apply to HEAD, please submit a rebased version. I've
marked this as Waiting on Author.

Please find attached a rebase of both patches. I did some small editing
along the way. I didn't benchmark them.

I'm not sure this patch is the best approach though. Take it as motivation
to keep this discussion going. As I wrote, whether the solution is sampling
or cumulative, an in-core and hot-switchable solution would be much more
convenient. The fact is that this patch was already available and ready to
support the discussion.

Collecting and summing all wait events from all backends in the same place
makes it impossible to track wait events precisely for a specific backend,
especially on a busy system where its numbers can quickly be buried by all the
other activity around.

I wonder if wait events should only be accumulated on the backend side, making
it possible to enable/disable them on the fly and to collect reports, e.g. in
the logs or in some output. Most of the code from these patches could be
recycled into a simpler patch implementing this.
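
To make the idea a bit more concrete, here is a rough, purely hypothetical
sketch; none of these names exist in the attached patches. Each backend would
keep its counters in local memory, guarded by a flag that could be toggled at
runtime (e.g. through a GUC), and dump them into the server log on demand:

/*
 * Hypothetical sketch of backend-local accumulation, not taken from the
 * attached patches.  Array size, mapping and names are illustrative only.
 */
typedef struct LocalWaitStat
{
    uint32      wait_event_info;
    uint64      calls;
    uint64      time_us;
} LocalWaitStat;

#define N_LOCAL_WAIT_STATS 256                  /* illustrative size */

static LocalWaitStat local_wait_stats[N_LOCAL_WAIT_STATS];
static bool accumulate_wait_events = false;     /* hypothetical on-the-fly GUC */

static inline void
accumulate_local_wait(uint32 wait_event_info, uint64 elapsed_us)
{
    if (!accumulate_wait_events)
        return;                 /* toggled off: do nothing */

    /* illustrative mapping from event to slot, details omitted */
    LocalWaitStat *s = &local_wait_stats[wait_event_info % N_LOCAL_WAIT_STATS];

    s->wait_event_info = wait_event_info;
    s->calls++;
    s->time_us += elapsed_us;
}

static void
report_local_wait_stats(void)
{
    for (int i = 0; i < N_LOCAL_WAIT_STATS; i++)
    {
        LocalWaitStat *s = &local_wait_stats[i];

        if (s->calls == 0)
            continue;

        elog(LOG, "wait event %s: calls=" UINT64_FORMAT " time=" UINT64_FORMAT " us",
             pgstat_get_wait_event(s->wait_event_info), s->calls, s->time_us);
    }
}

While disabled, this would add nothing but the flag check, and no collector
traffic or shared state would be involved.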

Thoughts?

Sorry for my absence. Unfortunately I couldn't find time to work on this
patch in this CF. I believe I will be back in the next CF to work on this
patch and also review other patches.

No worries, it happens. Since the thread has stalled and there is no updated
patch I've marked this entry Returned with Feedback. Please feel free to
re-open a new CF entry if you return to this patch.

I volunteer to be a reviewer on this patch.

Imai-san, do you agree to add it as a new CF entry?

Regards,

[1]: The last time I had such a situation was a few weeks ago. A customer was
reporting a query being randomly slow, running below 100ms most of the time
and sometimes hitting 28s. Long story short, the number of rows was the same
(10-15k), but the result set size was 9x bigger (1MB vs 9MB). As the same query
was running fine from psql, I suspected the frontend was somehow saturated.
Tcpdump helped me to compute that the throughput fell to 256kB/s after the
first 2MB of data transfer, with a very narrow TCP window. I explained to the
customer that their app probably doesn't pull the rows fast enough and that
some buffers were probably saturated on the frontend side, waiting for
the app and slowing down the whole transfer.
The devs fixed the problem by moving two field transformations (unaccent)
out of their loop fetching the rows.

Attachments:

0001-Add-pg_stat_waitaccum-view-v7.patchtext/x-patchDownload
From 88c2779679c5c9625ca5348eec0543daab5ccab4 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Tue, 1 Jun 2021 13:25:57 +0200
Subject: [PATCH 1/2] Add pg_stat_waitaccum view.

pg_stat_waitaccum shows counts and duration of each wait events.
Each backend/backgrounds counts and measures the time of wait event
in every pgstat_report_wait_start and pgstat_report_wait_end. They
store those info into their local variables and send to Statistics
Collector. We can get those info via Statistics Collector.

For reducing overhead, I implemented statistic hash instead of
dynamic hash. I also implemented track_wait_timing which
determines wait event duration is collected or not.

On windows, this function might be not worked correctly, because
now it initializes local variables in pg_stat_init which is not
passed to fork processes on windows.
---
 src/backend/postmaster/pgstat.c               | 305 +++++++++++++++++-
 src/backend/storage/lmgr/lwlock.c             |  19 ++
 src/backend/utils/activity/wait_event.c       |  97 +++++-
 src/backend/utils/adt/pgstatfuncs.c           |  80 +++++
 src/backend/utils/misc/guc.c                  |  10 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/catalog/pg_proc.dat               |   9 +
 src/include/pgstat.h                          |  68 +++-
 src/include/storage/lwlock.h                  |   1 +
 src/include/storage/proc.h                    |   1 +
 src/include/utils/wait_event.h                |  57 +---
 src/test/regress/expected/rules.out           |   5 +
 12 files changed, 599 insertions(+), 54 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b0d07c0e0b..b4a84a2f62 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -114,6 +114,7 @@
  */
 bool		pgstat_track_counts = false;
 int			pgstat_track_functions = TRACK_FUNC_OFF;
+bool		pgstat_track_wait_timing = false;
 
 /* ----------
  * Built from GUC parameter
@@ -177,6 +178,10 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+WAHash *wa_hash;
+
+instr_time waitStart;
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -276,6 +281,7 @@ static HTAB *pgStatDBHash = NULL;
  */
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
+static PgStat_WaitAccumStats waitAccumStats;
 static PgStat_WalStats walStats;
 static PgStat_SLRUStats slruStats[SLRU_NUM_ELEMENTS];
 static HTAB *replSlotStatHash = NULL;
@@ -305,6 +311,9 @@ static pid_t pgstat_forkexec(void);
 
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
 
+static void pgstat_init_waitaccum_hash(WAHash **hash);
+static PgStat_WaitAccumEntry *pgstat_add_wa_entry(WAHash *hash, uint32 key);
+
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
 static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 												 Oid tableoid, bool create);
@@ -348,6 +357,7 @@ static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_anl_ancestors(PgStat_MsgAnlAncestors *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len);
 static void pgstat_recv_wal(PgStat_MsgWal *msg, int len);
 static void pgstat_recv_slru(PgStat_MsgSLRU *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
@@ -359,6 +369,25 @@ static void pgstat_recv_connstat(PgStat_MsgConn *msg, int len);
 static void pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
+PgStat_WaitAccumEntry *
+pgstat_get_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *current;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	current = hash->buckets[bucket];
+
+	while (current != NULL)
+	{
+		if (current->key == key)
+			return current->entry;
+
+		current = current->next;
+	}
+
+	return NULL;
+}
+
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -635,6 +664,8 @@ retry2:
 	/* Now that we have a long-lived socket, tell fd.c about it. */
 	ReserveExternalFD();
 
+	pgstat_init_waitaccum_hash(&wa_hash);
+
 	return;
 
 startup_failed:
@@ -657,6 +688,75 @@ startup_failed:
 	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
+static PgStat_WaitAccumEntry *
+pgstat_add_wa_entry(WAHash *hash, uint32 key)
+{
+	WAEntry *prev;
+	WAEntry *new;
+	int bucket = key % WA_BUCKET_SIZE;
+
+	prev = hash->buckets[bucket];
+
+	while (prev != NULL && prev->next != NULL)
+		prev = prev->next;
+
+	new = &hash->entries[hash->entry_num++];
+	new->key = key;
+	new->entry = MemoryContextAllocZero(TopMemoryContext, (sizeof(PgStat_WaitAccumEntry)));
+
+	if (prev != NULL)
+		prev->next = new;
+	else
+		hash->buckets[bucket] = new;
+
+	return new->entry;
+}
+
+static void
+pgstat_init_waitaccum_entry(WAHash *hash, uint32 wait_event_info)
+{
+	PgStat_WaitAccumEntry *entry;
+
+	entry = pgstat_add_wa_entry(hash, wait_event_info);
+	entry->wait_event_info = wait_event_info;
+}
+
+static void
+pgstat_init_waitaccum_hash(WAHash **hash)
+{
+	uint32 i;
+	int last_tranche_id;
+
+	*hash = MemoryContextAllocZero(TopMemoryContext, sizeof(WAHash));
+
+	last_tranche_id = LWLockGetLastTrancheId();
+	for (i = PG_WAIT_LWLOCK + 1; i <= (PG_WAIT_LWLOCK | last_tranche_id); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = (PG_WAIT_LOCK | LOCKTAG_RELATION); i <= (PG_WAIT_LOCK | LOCKTAG_LAST_TYPE); i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_BUFFER_PIN; i <= PG_WAIT_BUFFER_PIN; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_ACTIVITY; i <= PG_WAIT_ACTIVITY_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_CLIENT; i <= PG_WAIT_CLIENT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IPC; i <= PG_WAIT_IPC_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_TIMEOUT; i <= PG_WAIT_TIMEOUT_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	for (i = PG_WAIT_IO; i <= PG_WAIT_IO_LAST_TYPE; i++)
+		pgstat_init_waitaccum_entry(*hash, i);
+
+	// FIXME: support extensions stuff
+}
+
 /*
  * subroutine for pgstat_reset_all
  */
@@ -960,8 +1060,11 @@ pgstat_report_stat(bool disconnect)
 	/* Send WAL statistics */
 	pgstat_send_wal(true);
 
-	/* Finally send SLRU statistics */
+	/* Send SLRU statistics */
 	pgstat_send_slru();
+
+	/* Finally send wait accumulative statistics */
+	pgstat_send_waitaccum();
 }
 
 /*
@@ -1453,6 +1556,8 @@ pgstat_reset_shared_counters(const char *target)
 		msg.m_resettarget = RESET_BGWRITER;
 	else if (strcmp(target, "wal") == 0)
 		msg.m_resettarget = RESET_WAL;
+	else if (strcmp(target, "waitaccum") == 0)
+		msg.m_resettarget = RESET_WAITACCUM;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -2895,6 +3000,22 @@ pgstat_fetch_replslot(NameData slotname)
 	return pgstat_get_replslot_entry(slotname, false);
 }
 
+/*
+ * ---------
+ * pgstat_fetch_stat_waitaccum() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	a pointer to the wait accum statistics struct.
+ * ---------
+ */
+PgStat_WaitAccumStats *
+pgstat_fetch_stat_waitaccum(void)
+{
+	backend_read_statsfile();
+
+	return &waitAccumStats;
+}
+
 /*
  * Shut down a single backend's statistics reporting at process exit.
  *
@@ -3172,6 +3293,52 @@ pgstat_send_slru(void)
 	}
 }
 
+/* ----------
+ * pgstat_send_waitaccum() -
+ *
+ * ----------
+ */
+void
+pgstat_send_waitaccum()
+{
+	PgStat_MsgWaitAccum msg;
+	PgStat_WaitAccumEntry *entry;
+	int i;
+
+	if (wa_hash == NULL)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_WAITACCUM);
+	msg.m_nentries = 0;
+
+	for (i = 0; i < wa_hash->entry_num; i++)
+	{
+		entry = wa_hash->entries[i].entry;
+
+		/* Send only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/*
+		 * Prepare and send the message
+		 */
+		memcpy(&msg.m_entry[msg.m_nentries], entry, sizeof(PgStat_WaitAccumEntry));
+		if (++msg.m_nentries >= PGSTAT_NUM_WAITACCUMENTRIES)
+		{
+			pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+						msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+			msg.m_nentries = 0;
+		}
+
+		/* Clear wait events information. */
+		entry->calls = 0;
+		INSTR_TIME_SET_ZERO(entry->times);
+	}
+
+	if (msg.m_nentries > 0)
+		pgstat_send(&msg, offsetof(PgStat_MsgWaitAccum, m_entry[0]) +
+					msg.m_nentries * sizeof(PgStat_WaitAccumEntry));
+}
 
 /* ----------
  * PgstatCollectorMain() -
@@ -3390,6 +3557,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_slru(&msg.msg_slru, len);
 					break;
 
+				case PGSTAT_MTYPE_WAITACCUM:
+					pgstat_recv_waitaccum(&msg.msg_waitaccum, len);
+					break;
+
 				case PGSTAT_MTYPE_FUNCSTAT:
 					pgstat_recv_funcstat(&msg.msg_funcstat, len);
 					break;
@@ -3605,7 +3776,6 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
 	return result;
 }
 
-
 /* ----------
  * pgstat_write_statsfiles() -
  *		Write the global statistics file, as well as requested DB files.
@@ -3629,7 +3799,7 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	int32		format_id;
 	const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-	int			rc;
+	int			rc, i;
 
 	elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
@@ -3725,6 +3895,23 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 		}
 	}
 
+	/*
+	 * Walk through the waitaccum hash.
+	 */
+	for (i = 0; i < waitAccumStats.hash->entry_num; i++)
+	{
+		PgStat_WaitAccumEntry *entry = waitAccumStats.hash->entries[i].entry;
+
+		/* Write only wait events that have occurred. */
+		if (entry->calls == 0)
+			continue;
+
+		/* Write out the wait event entry */
+		fputc('W', fpout);
+		rc = fwrite(entry, sizeof(PgStat_WaitAccumEntry), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
+	}
+
 	/*
 	 * No more output to be done. Close the temp file and replace the old
 	 * pgstat.stat with it.  The ferror() check replaces testing for error
@@ -3957,6 +4144,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	memset(&archiverStats, 0, sizeof(archiverStats));
 	memset(&walStats, 0, sizeof(walStats));
 	memset(&slruStats, 0, sizeof(slruStats));
+	waitAccumStats.hash = MemoryContextAllocZero(pgStatLocalContext,
+												 sizeof(WAHash));
 
 	/*
 	 * Set the current timestamp (will be kept only in case we can't load an
@@ -4184,6 +4373,44 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 					break;
 				}
 
+				/*
+				 * 'W'	A PgStat_WaitAccumEntry struct describing a Wait
+				 * event accumulator follows.
+				 */
+			case 'W':
+				{
+					PgStat_WaitAccumEntry *entry;
+					PgStat_WaitAccumEntry buf;
+					WAHash *hash = waitAccumStats.hash;
+
+					if (fread(&buf, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+										!= sizeof(PgStat_WaitAccumEntry))
+					{
+						ereport(pgStatRunningInCollector ? LOG : WARNING,
+								(errmsg("corrupted statistics file \"%s\"",
+										statfile)));
+						goto done;
+					}
+
+					entry = pgstat_get_wa_entry(hash, buf.wait_event_info);
+
+					if (entry)
+					{
+						ereport(pgStatRunningInCollector ? LOG : WARNING,
+								(errmsg("corrupted statistics file \"%s\"",
+										statfile)));
+						goto done;
+					}
+
+					/*
+					 * Add to the wait event hash
+					 */
+					entry = pgstat_add_wa_entry(hash, buf.wait_event_info);
+					memcpy(entry, &buf, sizeof(PgStat_WaitAccumEntry));
+
+					break;
+				}
+
 			case 'E':
 				goto done;
 
@@ -4396,6 +4623,7 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 	PgStat_WalStats myWalStats;
 	PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS];
 	PgStat_StatReplSlotEntry myReplSlotStats;
+	PgStat_WaitAccumEntry myWaitAccumStats;
 	FILE	   *fpin;
 	int32		format_id;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
@@ -4526,6 +4754,26 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
 				}
 				break;
 
+				/*
+				 * 'W'	A PgStat_WaitAccumEntry struct describing a Wait
+				 * event accumulator follows.
+				 */
+			case 'W':
+				{
+					if (fread(&myWaitAccumStats, 1, sizeof(PgStat_WaitAccumEntry), fpin)
+										!= sizeof(PgStat_WaitAccumEntry))
+					{
+						ereport(pgStatRunningInCollector ? LOG : WARNING,
+								(errmsg("corrupted statistics file \"%s\"",
+										statfile)));
+						FreeFile(fpin);
+						return false;
+					}
+
+					break;
+				}
+
+
 			case 'E':
 				goto done;
 
@@ -5071,6 +5319,20 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 		memset(&walStats, 0, sizeof(walStats));
 		walStats.stat_reset_timestamp = GetCurrentTimestamp();
 	}
+	else if (msg->m_resettarget == RESET_WAITACCUM)
+	{
+		PgStat_WaitAccumEntry *entry;
+		WAHash *hash = waitAccumStats.hash;
+		int i;
+
+		for (i = 0; i < hash->entry_num; i++)
+		{
+			entry = hash->entries[i].entry;
+
+			entry->calls = 0;
+			INSTR_TIME_SET_ZERO(entry->times);
+		}
+	}
 
 	/*
 	 * Presumably the sender of this message validated the target, don't
@@ -5394,6 +5656,43 @@ pgstat_recv_slru(PgStat_MsgSLRU *msg, int len)
 	slruStats[msg->m_index].truncate += msg->m_truncate;
 }
 
+/* ----------
+ * pgstat_recv_waitaccum() -
+ *
+ *	Process a WAITACCUM message.
+ * ----------
+ */
+static void
+pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
+{
+	PgStat_WaitAccumEntry *m_entry = &(msg->m_entry[0]);
+	PgStat_WaitAccumEntry *entry;
+	WAHash *hash = waitAccumStats.hash;
+	int			i;
+
+	/*
+	 * Process all function entries in the message.
+	 */
+	for (i = 0; i < msg->m_nentries; i++, m_entry++)
+	{
+		entry = pgstat_get_wa_entry(hash, m_entry->wait_event_info);
+
+		if (!entry)
+		{
+			entry = pgstat_add_wa_entry(hash, m_entry->wait_event_info);
+			memcpy(entry, m_entry, sizeof(PgStat_WaitAccumEntry));
+		}
+		else
+		{
+			/*
+			 * Otherwise add the values to the existing entry.
+			 */
+			entry->calls += m_entry->calls;
+			INSTR_TIME_ADD(entry->times, m_entry->times);
+		}
+	}
+}
+
 /* ----------
  * pgstat_recv_recoveryconflict() -
  *
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 55b9d7970e..e9a120805b 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -634,6 +634,25 @@ LWLockNewTrancheId(void)
 	return result;
 }
 
+/*
+ * Get a last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
 /*
  * Register a dynamic tranche name in the lookup table of the current process.
  *
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 6baf67740c..392a783fae 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -25,6 +25,7 @@
 #include "storage/lmgr.h"		/* for GetLockNameFromTagType */
 #include "storage/lwlock.h"		/* for GetLWLockIdentifier */
 #include "utils/wait_event.h"
+#include "pgstat.h"
 
 
 static const char *pgstat_get_wait_activity(WaitEventActivity w);
@@ -32,7 +33,8 @@ static const char *pgstat_get_wait_client(WaitEventClient w);
 static const char *pgstat_get_wait_ipc(WaitEventIPC w);
 static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
 static const char *pgstat_get_wait_io(WaitEventIO w);
-
+static inline void pgstat_report_waitaccum_start();
+static inline void pgstat_report_waitaccum_end();
 
 static uint32 local_my_wait_event_info;
 uint32	   *my_wait_event_info = &local_my_wait_event_info;
@@ -732,3 +734,96 @@ pgstat_get_wait_io(WaitEventIO w)
 
 	return event_name;
 }
+
+
+static inline void
+pgstat_report_waitaccum_start()
+{
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(waitStart);
+	}
+}
+
+static inline void
+pgstat_report_waitaccum_end()
+{
+	PgStat_WaitAccumEntry *entry;
+	instr_time  diff;
+
+	if (wa_hash == NULL)
+		return;
+
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_SET_CURRENT(diff);
+		INSTR_TIME_SUBTRACT(diff, waitStart);
+	}
+
+	entry = pgstat_get_wa_entry(wa_hash, *my_wait_event_info);
+
+	if (!entry)
+	{
+		return;
+	}
+
+	entry->calls++;
+	if (pgstat_track_wait_timing)
+	{
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+}
+
+
+/* ----------
+ * pgstat_report_wait_start() -
+ *
+ *	Called from places where server process needs to wait.  This is called
+ *	to report wait event information.  The wait information is stored
+ *	as 4-bytes where first byte represents the wait event class (type of
+ *	wait, for different types of wait, refer WaitClass) and the next
+ *	3-bytes represent the actual wait event.  Currently 2-bytes are used
+ *	for wait event which is sufficient for current usage, 1-byte is
+ *	reserved for future usage.
+ *
+ *	Historically we used to make this reporting conditional on
+ *	pgstat_track_activities, but the check for that seems to add more cost
+ *	than it saves.
+ *
+ *	my_wait_event_info initially points to local memory, making it safe to
+ *	call this before MyProc has been initialized.
+ * ----------
+ */
+inline void
+pgstat_report_wait_start(uint32 wait_event_info)
+{
+	/*
+	 * Since this is a four-byte field which is always read and written as
+	 * four-bytes, updates are atomic.
+	 */
+	*(volatile uint32 *) my_wait_event_info = wait_event_info;
+
+	//FIXME: recent patch to speed up this call.
+	pgstat_report_waitaccum_start();
+}
+
+/* ----------
+ * pgstat_report_wait_end() -
+ *
+ *	Called to report end of a wait.
+ * ----------
+ */
+inline void
+pgstat_report_wait_end(void)
+{
+	//FIXME: recent patch to speed up this call.
+	pgstat_report_waitaccum_end();
+
+	/* see pgstat_report_wait_start() */
+	*(volatile uint32 *) my_wait_event_info = 0;
+
+
+}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 14056f5347..56afd20f22 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -2380,3 +2380,83 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS)
 	/* Returns the record as Datum */
 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	PgStat_WaitAccumStats *waitaccum_stats;
+	PgStat_WaitAccumEntry *entry;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Get statistics about the waitaccum process */
+	waitaccum_stats = pgstat_fetch_stat_waitaccum();
+
+	for (i = 0; i < waitaccum_stats->hash->entry_num; i++)
+	{
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		entry = waitaccum_stats->hash->entries[i].entry;
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 68b62d523d..3260ab77d7 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1546,6 +1546,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"track_wait_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for wait events."),
+			NULL
+		},
+		&pgstat_track_wait_timing,
+		false,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
 			gettext_noop("Updates the process title to show the active SQL command."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ddbb6dc2be..f6d0a5af8d 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -599,6 +599,7 @@
 #track_counts = on
 #track_io_timing = off
 #track_wal_io_timing = off
+#track_wait_timing = off
 #track_functions = none			# none, pl, all
 #stats_temp_directory = 'pg_stat_tmp'
 
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index acbcae4607..6c22bddc8e 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5284,6 +5284,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc,leader_pid,query_id}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '8316',
+  descr => 'statistics: information about accumulative data of wait event',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,text,text,int8,int8}',
+  proargmodes => '{i,o,o,o,o}',
+  proargnames => '{pid,wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 9612c0a6c2..5564907df8 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -74,6 +74,7 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_BGWRITER,
 	PGSTAT_MTYPE_WAL,
 	PGSTAT_MTYPE_SLRU,
+	PGSTAT_MTYPE_WAITACCUM,
 	PGSTAT_MTYPE_FUNCSTAT,
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
@@ -137,14 +138,15 @@ typedef enum PgStat_Shared_Reset_Target
 {
 	RESET_ARCHIVER,
 	RESET_BGWRITER,
-	RESET_WAL
+	RESET_WAL,
+	RESET_WAITACCUM
 } PgStat_Shared_Reset_Target;
 
 /* Possible object types for resetting single counters */
 typedef enum PgStat_Single_Reset_Type
 {
 	RESET_TABLE,
-	RESET_FUNCTION
+	RESET_FUNCTION,
 } PgStat_Single_Reset_Type;
 
 /* ------------------------------------------------------------
@@ -539,6 +541,48 @@ typedef struct PgStat_MsgReplSlot
 	PgStat_Counter m_total_bytes;
 } PgStat_MsgReplSlot;
 
+/* ----------
+ * PgStat_WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct PgStat_WaitAccumEntry
+{
+	uint32			wait_event_info;
+	PgStat_Counter	calls;
+	instr_time		times;
+} PgStat_WaitAccumEntry;
+
+typedef struct WAEntry
+{
+	int key;
+	PgStat_WaitAccumEntry *entry;
+	struct WAEntry *next;
+} WAEntry;
+
+#define WA_BUCKET_SIZE 461
+
+typedef struct WAHash
+{
+	WAEntry entries[WA_BUCKET_SIZE];
+	WAEntry *buckets[WA_BUCKET_SIZE];
+	int entry_num;
+} WAHash;
+
+/* ----------
+ * PgStat_MsgWaitAccum	Sent by backend/background's process to update statistics.
+ * ----------
+ */
+#define PGSTAT_NUM_WAITACCUMENTRIES	\
+	((PGSTAT_MSG_PAYLOAD - sizeof(int))  \
+	 / sizeof(PgStat_WaitAccumEntry))
+
+typedef struct PgStat_MsgWaitAccum
+{
+	PgStat_MsgHdr m_hdr;
+
+	int m_nentries;
+	PgStat_WaitAccumEntry m_entry[PGSTAT_NUM_WAITACCUMENTRIES];
+} PgStat_MsgWaitAccum;
 
 /* ----------
  * PgStat_MsgRecoveryConflict	Sent by the backend upon recovery conflict
@@ -676,7 +720,6 @@ typedef struct PgStat_MsgConn
 	SessionEndType m_disconnect;
 } PgStat_MsgConn;
 
-
 /* ----------
  * PgStat_Msg					Union over all possible messages.
  * ----------
@@ -702,6 +745,7 @@ typedef union PgStat_Msg
 	PgStat_MsgBgWriter msg_bgwriter;
 	PgStat_MsgWal msg_wal;
 	PgStat_MsgSLRU msg_slru;
+	PgStat_MsgWaitAccum msg_waitaccum;
 	PgStat_MsgFuncstat msg_funcstat;
 	PgStat_MsgFuncpurge msg_funcpurge;
 	PgStat_MsgRecoveryConflict msg_recoveryconflict;
@@ -721,7 +765,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BCA2
+#define PGSTAT_FILE_FORMAT_ID	0x01A5BCA3
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -909,6 +953,15 @@ typedef struct PgStat_StatReplSlotEntry
 } PgStat_StatReplSlotEntry;
 
 
+/*
+ * WaitAccum statistics kept in the stats collector
+ */
+typedef struct PgStat_WaitAccumStats
+{
+	WAHash *hash;
+} PgStat_WaitAccumStats;
+
+
 /*
  * Working state needed to accumulate per-function-call timing statistics.
  */
@@ -925,6 +978,8 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
+extern instr_time waitStart;
+extern WAHash *wa_hash;
 
 /* ----------
  * GUC parameters
@@ -932,6 +987,7 @@ typedef struct PgStat_FunctionCallUsage
  */
 extern PGDLLIMPORT bool pgstat_track_counts;
 extern PGDLLIMPORT int pgstat_track_functions;
+extern PGDLLIMPORT bool pgstat_track_wait_timing;
 extern char *pgstat_stat_directory;
 extern char *pgstat_stat_tmpname;
 extern char *pgstat_stat_filename;
@@ -1092,6 +1148,7 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
 extern void pgstat_send_archiver(const char *xlog, bool failed);
 extern void pgstat_send_bgwriter(void);
 extern void pgstat_send_wal(bool force);
+extern void pgstat_send_waitaccum(void);
 
 /* ----------
  * Support functions for the SQL-callable functions to
@@ -1106,6 +1163,7 @@ extern PgStat_GlobalStats *pgstat_fetch_global(void);
 extern PgStat_WalStats *pgstat_fetch_stat_wal(void);
 extern PgStat_SLRUStats *pgstat_fetch_slru(void);
 extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname);
+extern PgStat_WaitAccumStats *pgstat_fetch_stat_waitaccum(void);
 
 extern void pgstat_count_slru_page_zeroed(int slru_idx);
 extern void pgstat_count_slru_page_hit(int slru_idx);
@@ -1117,4 +1175,6 @@ extern void pgstat_count_slru_truncate(int slru_idx);
 extern const char *pgstat_slru_name(int slru_idx);
 extern int	pgstat_slru_index(const char *name);
 
+extern PgStat_WaitAccumEntry *pgstat_get_wa_entry(WAHash *hash, uint32 key);
+
 #endif							/* PGSTAT_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index a8f052e484..e6638b2c45 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -154,6 +154,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index be67d8a861..db09b3b64a 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,6 +21,7 @@
 #include "storage/lock.h"
 #include "storage/pg_sema.h"
 #include "storage/proclist_types.h"
+#include "portability/instr_time.h"
 
 /*
  * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 6c6ec2e711..caa0db7f79 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -50,6 +50,8 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+
 /* ----------
  * Wait Events - Client
  *
@@ -70,6 +72,8 @@ typedef enum
 	WAIT_EVENT_WAL_SENDER_WRITE_DATA,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_WAL_SENDER_WRITE_DATA
+
 /* ----------
  * Wait Events - IPC
  *
@@ -128,6 +132,8 @@ typedef enum
 	WAIT_EVENT_XACT_GROUP_UPDATE
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_XACT_GROUP_UPDATE
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -143,6 +149,8 @@ typedef enum
 	WAIT_EVENT_VACUUM_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_VACUUM_DELAY
+
 /* ----------
  * Wait Events - IO
  *
@@ -227,58 +235,15 @@ typedef enum
 	WAIT_EVENT_LOGICAL_SUBXACT_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_LOGICAL_SUBXACT_WRITE
 
 extern const char *pgstat_get_wait_event(uint32 wait_event_info);
 extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
-static inline void pgstat_report_wait_start(uint32 wait_event_info);
-static inline void pgstat_report_wait_end(void);
+void pgstat_report_wait_start(uint32 wait_event_info);
+void pgstat_report_wait_end(void);
 extern void pgstat_set_wait_event_storage(uint32 *wait_event_info);
 extern void pgstat_reset_wait_event_storage(void);
 
 extern PGDLLIMPORT uint32 *my_wait_event_info;
 
-
-/* ----------
- * pgstat_report_wait_start() -
- *
- *	Called from places where server process needs to wait.  This is called
- *	to report wait event information.  The wait information is stored
- *	as 4-bytes where first byte represents the wait event class (type of
- *	wait, for different types of wait, refer WaitClass) and the next
- *	3-bytes represent the actual wait event.  Currently 2-bytes are used
- *	for wait event which is sufficient for current usage, 1-byte is
- *	reserved for future usage.
- *
- *	Historically we used to make this reporting conditional on
- *	pgstat_track_activities, but the check for that seems to add more cost
- *	than it saves.
- *
- *	my_wait_event_info initially points to local memory, making it safe to
- *	call this before MyProc has been initialized.
- * ----------
- */
-static inline void
-pgstat_report_wait_start(uint32 wait_event_info)
-{
-	/*
-	 * Since this is a four-byte field which is always read and written as
-	 * four-bytes, updates are atomic.
-	 */
-	*(volatile uint32 *) my_wait_event_info = wait_event_info;
-}
-
-/* ----------
- * pgstat_report_wait_end() -
- *
- *	Called to report end of a wait.
- * ----------
- */
-static inline void
-pgstat_report_wait_end(void)
-{
-	/* see pgstat_report_wait_start() */
-	*(volatile uint32 *) my_wait_event_info = 0;
-}
-
-
 #endif							/* WAIT_EVENT_H */
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index e5ab11275d..4fbcefd46f 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2183,6 +2183,11 @@ pg_stat_wal| SELECT w.wal_records,
     w.wal_sync_time,
     w.stats_reset
    FROM pg_stat_get_wal() w(wal_records, wal_fpi, wal_bytes, wal_buffers_full, wal_write, wal_sync, wal_write_time, wal_sync_time, stats_reset);
+pg_stat_waitaccum| SELECT s.wait_event_type,
+    s.wait_event,
+    s.calls,
+    s.times
+   FROM pg_stat_get_waitaccum(NULL::integer) s(wait_event_type, wait_event, calls, times);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
2.20.1

0002-POC-Change-measuring-method-of-wait-event-time-from-v7.patchtext/x-patchDownload
From ddb1adc5cd9acc9bc9de16d0cf057124b09fe1e3 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Fri, 4 Jun 2021 18:14:51 +0200
Subject: [PATCH 2/2] [POC] Change measuring method of wait event time from
 INSTR_TIME to rdtsc.

This patch changes measuring method of wait event time from INSTR_TIME (which
uses gettimeofday or clock_gettime) to rdtsc. This might reduce the overhead
of measuring overhead.

Any supports like changing clock cycle to actual time or error handling are
not currently implemented.
---
 src/backend/postmaster/pgstat.c         |  8 ++++----
 src/backend/utils/activity/wait_event.c | 10 +++++-----
 src/backend/utils/adt/pgstatfuncs.c     |  2 +-
 src/include/pgstat.h                    |  4 ++--
 src/include/portability/instr_time.h    | 21 +++++++++++++++++++++
 5 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b4a84a2f62..e928239a29 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -180,7 +180,7 @@ static bool pgStatRunningInCollector = false;
 
 WAHash *wa_hash;
 
-instr_time waitStart;
+uint64 waitStart;
 
 /*
  * Structures in which backends store per-table info that's waiting to be
@@ -3332,7 +3332,7 @@ pgstat_send_waitaccum()
 
 		/* Clear wait events information. */
 		entry->calls = 0;
-		INSTR_TIME_SET_ZERO(entry->times);
+		entry->times = 0;
 	}
 
 	if (msg.m_nentries > 0)
@@ -5330,7 +5330,7 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 			entry = hash->entries[i].entry;
 
 			entry->calls = 0;
-			INSTR_TIME_SET_ZERO(entry->times);
+			entry->times = 0;
 		}
 	}
 
@@ -5688,7 +5688,7 @@ pgstat_recv_waitaccum(PgStat_MsgWaitAccum *msg, int len)
 			 * Otherwise add the values to the existing entry.
 			 */
 			entry->calls += m_entry->calls;
-			INSTR_TIME_ADD(entry->times, m_entry->times);
+			entry->times += m_entry->times;
 		}
 	}
 }
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 392a783fae..6d47eb0028 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -744,7 +744,7 @@ pgstat_report_waitaccum_start()
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(waitStart);
+		waitStart = rdtsc();
 	}
 }
 
@@ -752,15 +752,15 @@ static inline void
 pgstat_report_waitaccum_end()
 {
 	PgStat_WaitAccumEntry *entry;
-	instr_time  diff;
+	uint64		diff = 0;
 
 	if (wa_hash == NULL)
 		return;
 
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_SET_CURRENT(diff);
-		INSTR_TIME_SUBTRACT(diff, waitStart);
+		diff = rdtsc();
+		diff -= waitStart;
 	}
 
 	entry = pgstat_get_wa_entry(wa_hash, *my_wait_event_info);
@@ -773,7 +773,7 @@ pgstat_report_waitaccum_end()
 	entry->calls++;
 	if (pgstat_track_wait_timing)
 	{
-		INSTR_TIME_ADD(entry->times, diff);
+		entry->times += diff;
 	}
 }
 
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 56afd20f22..6faedd4938 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -2450,7 +2450,7 @@ pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
 
 		values[2] = Int64GetDatum(entry->calls);
 
-		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+		values[3] = UInt64GetDatum(entry->times);
 
 		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
 	}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 5564907df8..9a9ac8e16d 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -549,7 +549,7 @@ typedef struct PgStat_WaitAccumEntry
 {
 	uint32			wait_event_info;
 	PgStat_Counter	calls;
-	instr_time		times;
+	uint64			times;
 } PgStat_WaitAccumEntry;
 
 typedef struct WAEntry
@@ -978,7 +978,7 @@ typedef struct PgStat_FunctionCallUsage
 	instr_time	f_start;
 } PgStat_FunctionCallUsage;
 
-extern instr_time waitStart;
+extern uint64 waitStart;
 extern WAHash *wa_hash;
 
 /* ----------
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 39a4f0600e..507a1ca44d 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -57,6 +57,10 @@
 
 #ifndef WIN32
 
+#if defined(__x86_64__) || defined(__i386__)
+#include <x86intrin.h>
+#endif
+
 #ifdef HAVE_CLOCK_GETTIME
 
 /* Use clock_gettime() */
@@ -209,6 +213,8 @@ typedef struct timeval instr_time;
 
 #else							/* WIN32 */
 
+#include <intrin.h>
+
 /* Use QueryPerformanceCounter() */
 
 typedef LARGE_INTEGER instr_time;
@@ -254,3 +260,18 @@ GetTimerFrequency(void)
 	(INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
 
 #endif							/* INSTR_TIME_H */
+
+#ifndef RDTSC_H_
+#define RDTSC_H_
+
+static inline uint64 rdtsc() {
+	uint64 result;
+#if defined(__x86_64__) || defined(__i386__) || defined(WIN32)
+	result = __rdtsc();
+#else
+	result = 0;
+#endif
+	return result;
+}
+
+#endif
-- 
2.20.1

fe-time.ctext/x-c++srcDownload
#62Andres Freund
andres@anarazel.de
In reply to: Jehan-Guillaume de Rorthais (#61)
Re: [Proposal] Add accumulated statistics for wait event

Hi,

On 2021-06-05 00:53:44 +0200, Jehan-Guillaume de Rorthais wrote:

From 88c2779679c5c9625ca5348eec0543daab5ccab4 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Tue, 1 Jun 2021 13:25:57 +0200
Subject: [PATCH 1/2] Add pg_stat_waitaccum view.

pg_stat_waitaccum shows counts and duration of each wait events.
Each backend/backgrounds counts and measures the time of wait event
in every pgstat_report_wait_start and pgstat_report_wait_end. They
store those info into their local variables and send to Statistics
Collector. We can get those info via Statistics Collector.

For reducing overhead, I implemented statistic hash instead of
dynamic hash. I also implemented track_wait_timing which
determines wait event duration is collected or not.

I object to adding this overhead. The whole selling point for wait
events was that they are low overhead. I since spent time reducing the
overhead further, because even just the branches for checking if
track_activity is enabled are measurable (225a22b19ed).

From ddb1adc5cd9acc9bc9de16d0cf057124b09fe1e3 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Fri, 4 Jun 2021 18:14:51 +0200
Subject: [PATCH 2/2] [POC] Change measuring method of wait event time from
INSTR_TIME to rdtsc.

This patch changes measuring method of wait event time from INSTR_TIME (which
uses gettimeofday or clock_gettime) to rdtsc. This might reduce the overhead
of measuring overhead.

Any supports like changing clock cycle to actual time or error handling are
not currently implemented.

rdtsc is a serializing (*) instruction - that's the expensive part. On Linux,
clock_gettime() doesn't actually need a syscall. While the vdso call
implies a bit of overhead over a raw rdtsc, it's a relatively small part
of the overhead. See
/messages/by-id/20200612232810.f46nbqkdhbutzqdg@alap3.anarazel.de
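
If someone wants a rough standalone feel for the difference, a trivial
benchmark along these lines (not part of any patch in this thread; x86-64
Linux assumed) compares the per-call cost of clock_gettime() through the
vdso with a raw rdtsc:

/*
 * bench_clock.c - rough per-call cost of clock_gettime() vs. raw rdtsc.
 * Build with: gcc -O2 bench_clock.c -o bench_clock
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <x86intrin.h>			/* __rdtsc() */

#define LOOPS 10000000L

int
main(void)
{
	struct timespec ts, start, end;
	volatile uint64_t sink = 0;
	double		elapsed;
	long		i;

	/* cost of clock_gettime() through the vdso */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < LOOPS; i++)
	{
		clock_gettime(CLOCK_MONOTONIC, &ts);
		sink += ts.tv_nsec;
	}
	clock_gettime(CLOCK_MONOTONIC, &end);
	elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
	printf("clock_gettime: %.1f ns/call\n", elapsed * 1e9 / LOOPS);

	/* cost of a raw rdtsc */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < LOOPS; i++)
		sink += __rdtsc();
	clock_gettime(CLOCK_MONOTONIC, &end);
	elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
	printf("rdtsc:         %.1f ns/call\n", elapsed * 1e9 / LOOPS);

	return (int) (sink & 1);
}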

Greetings,

Andres Freund

(*) it's not fully serializing, iirc it allows later instructions to be
started, but it does wait for all earlier in-flight instructions to
finish.

#63Jehan-Guillaume de Rorthais
jgdr@dalibo.com
In reply to: Andres Freund (#62)
1 attachment(s)
Re: [Proposal] Add accumulated statistics for wait event

Hi Andres, Hi all,

First, thank you for your feedback!

Please find attached a patch implementing accumulated wait event stats from
the backend point of view only. As I wrote when I reviewed and rebased the
existing patch, I was uncomfortable with the global approach. I still volunteer
to work on/review the original approach if required.

See below for comments and some more explanation of what I think might be
improvements over the previous patch.

On Fri, 11 Jun 2021 12:18:07 -0700
Andres Freund <andres@anarazel.de> wrote:

On 2021-06-05 00:53:44 +0200, Jehan-Guillaume de Rorthais wrote:

From 88c2779679c5c9625ca5348eec0543daab5ccab4 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Tue, 1 Jun 2021 13:25:57 +0200
Subject: [PATCH 1/2] Add pg_stat_waitaccum view.

pg_stat_waitaccum shows counts and duration of each wait events.
Each backend/backgrounds counts and measures the time of wait event
in every pgstat_report_wait_start and pgstat_report_wait_end. They
store those info into their local variables and send to Statistics
Collector. We can get those info via Statistics Collector.

For reducing overhead, I implemented statistic hash instead of
dynamic hash. I also implemented track_wait_timing which
determines wait event duration is collected or not.

I object to adding this overhead. The whole selling point for wait
events was that they are low overhead. I since spent time reducing the
overhead further, because even just the branches for checking if
track_activity is enabled are measurable (225a22b19ed).

Agreed. I rebased the previous patch to review it and reopen this discussion;
I even added a small FIXME in pgstat_report_wait_end and
pgstat_report_wait_start about your work:

//FIXME: recent patch to speed up this call.

In the attached patch, I tried to fix this by using a kind of internal hook
for pgstat_report_wait_start and pgstat_report_wait_end. This allows wait
events to be "instrumented" only when required, on the fly, dynamically.

Moreover, I replaced the hash structure with a simple static array, for faster
access.

From ddb1adc5cd9acc9bc9de16d0cf057124b09fe1e3 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Fri, 4 Jun 2021 18:14:51 +0200
Subject: [PATCH 2/2] [POC] Change measuring method of wait event time from
INSTR_TIME to rdtsc.

This patch changes measuring method of wait event time from INSTR_TIME
(which uses gettimeofday or clock_gettime) to rdtsc. This might reduce the
overhead of measuring overhead.

Any supports like changing clock cycle to actual time or error handling are
not currently implemented.

rdtsc is a serializing (*) instruction - that's the expensive part. On linux
clock_gettime() doesn't actually need a syscall. While the vdso call
implies a bit of overhead over a raw rdtsc, it's a relatively small part
of the overhead. See
/messages/by-id/20200612232810.f46nbqkdhbutzqdg@alap3.anarazel.de

I chose to remove the whole rdtsc part from my patch, as it wasn't clear how
much faster it was compared to the simpler vdso functions, nor how to
accurately convert the result into human-readable time.
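
(For reference, one simple way to extract human-readable time from rdtsc
would be to calibrate the tick rate once at startup against clock_gettime().
Just a sketch, reusing the rdtsc() helper from the patch quoted above, and
with all the usual caveats about non-invariant TSCs, frequency scaling and
cross-socket migration:

static double ticks_per_usec;

static void
calibrate_tsc(void)
{
	instr_time	start,
				end;
	uint64		tsc_start,
				tsc_end;

	INSTR_TIME_SET_CURRENT(start);
	tsc_start = rdtsc();
	pg_usleep(10000);			/* ~10ms is plenty for a rough calibration */
	tsc_end = rdtsc();
	INSTR_TIME_SET_CURRENT(end);

	INSTR_TIME_SUBTRACT(end, start);
	ticks_per_usec = (double) (tsc_end - tsc_start) /
		(double) INSTR_TIME_GET_MICROSEC(end);
}

Durations in ticks could then be divided by ticks_per_usec when reporting.)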

Regarding my take on $subject: for the sake of simplicity of this PoC, I added
the instrumentation to log_statement_stats. Despite the per-query context of
the reported log, these really are accumulated stats.

The patch also updates pg_stat_get_waitaccum() so that the accumulated wait
events can be reported from an interactive or batch session.

So, using my previous fe-time demo client, you can test it with:

PGOPTIONS="--log_statement_stats=on" ./fe-time 100

From logs, I now have (notice the last line):

LOG: duration: 3837.194 ms statement: SELECT * FROM pgbench_accounts
LOG: QUERY STATISTICS
DETAIL: ! system usage stats:
! 0.087444 s user, 0.002106 s system, 3.837202 s elapsed
! [0.087444 s user, 0.003974 s system total]
! 25860 kB max resident size
! 0/0 [0/0] filesystem blocks in/out
! 0/303 [0/697] page faults/reclaims, 0 [0] swaps
! 0 [0] signals rcvd, 0/0 [0/0] messages rcvd/sent
! 4/18 [5/18] voluntary/involuntary context switches
! Client/ClientWrite 4 calls, 3747102 us elapsed

Using pgbench scale factor 10, the copy query for pgbench_accounts looks like:

LOG: duration: 2388.081 ms statement: copy pgbench_accounts from stdin
LOG: QUERY STATISTICS
DETAIL: ! system usage stats:
! 1.373756 s user, 0.252860 s system, 2.388100 s elapsed
! [1.397015 s user, 0.264951 s system total]
! 37788 kB max resident size
! 0/641584 [0/642056] filesystem blocks in/out
! 194/4147 [195/4728] page faults/reclaims, 0 [0] swaps
! 0 [0] signals rcvd, 0/0 [0/0] messages rcvd/sent
! 3263/92 [3390/102] voluntary/involuntary context switches
! LWLock/WALBufMapping 3 calls, 654 us elapsed
! LWLock/WALWrite 3 calls, 60680 us elapsed
! LWLock/CheckpointerComm 3 calls, 318 us elapsed
! Client/ClientRead 1 calls, 151 us elapsed
! IO/DataFileExtend 16397 calls, 94000 us elapsed
! IO/DataFileWrite 14346 calls, 45911 us elapsed
! IO/WALInitSync 6 calls, 334840 us elapsed
! IO/WALInitWrite 6 calls, 48040 us elapsed
! IO/WALSync 17 calls, 353334 us elapsed
! IO/WALWrite 8362 calls, 38401 us elapsed

Now, I'm on the fence about the user-facing interaction. I'll keep thinking
about this and report back this week. In the meantime, any feedback on the
current implementation's backbone is welcome.

Thanks!

Regards,

Attachments:

v1-0001-PoC-Trace-wait-events-to-logfile-when-log_stateme.patchtext/x-patchDownload
From e3d1f39138babfebd28a92f4dd8136eeb173d2c2 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Fri, 11 Jun 2021 18:17:47 +0200
Subject: [PATCH v1] PoC: Trace wait events to logfile when
 log_statement_stats=on

---
 src/backend/storage/lmgr/lwlock.c       |  19 +++
 src/backend/storage/lmgr/proc.c         |   2 +
 src/backend/tcop/postgres.c             |  48 ++++++-
 src/backend/utils/activity/wait_event.c | 158 ++++++++++++++++++++++++
 src/backend/utils/adt/pgstatfuncs.c     |  74 +++++++++++
 src/include/catalog/pg_proc.dat         |   9 ++
 src/include/storage/lock.h              |   1 +
 src/include/storage/lwlock.h            |   1 +
 src/include/utils/wait_event.h          |  77 ++++++++++++
 9 files changed, 387 insertions(+), 2 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 55b9d7970e..e9a120805b 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -634,6 +634,25 @@ LWLockNewTrancheId(void)
 	return result;
 }
 
+/*
+ * Get a last tranche ID.
+ */
+int
+LWLockGetLastTrancheId(void)
+{
+	int			result;
+	int		   *LWLockCounter;
+
+	Assert(!lock_named_request_allowed);
+
+	LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	SpinLockAcquire(ShmemLock);
+	result = *LWLockCounter;
+	SpinLockRelease(ShmemLock);
+
+	return result;
+}
+
 /*
  * Register a dynamic tranche name in the lookup table of the current process.
  *
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 2575ea1ca0..52891c26a9 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -450,6 +450,8 @@ InitProcess(void)
 
 	/* now that we have a proc, report wait events to shared memory */
 	pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+	/* init wait event tracking structure*/
+	pgstat_init_waitaccums();
 
 	/*
 	 * We might be reusing a semaphore that belonged to a failed process. So
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 8cea10c901..25500e8f76 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -207,7 +207,6 @@ static void log_disconnections(int code, Datum arg);
 static void enable_statement_timeout(void);
 static void disable_statement_timeout(void);
 
-
 /* ----------------------------------------------------------------
  *		routines to obtain user input
  * ----------------------------------------------------------------
@@ -978,7 +977,10 @@ exec_simple_query(const char *query_string)
 	 * results because ResetUsage wasn't called.
 	 */
 	if (save_log_statement_stats)
+	{
 		ResetUsage();
+		pgstat_set_report_waits();
+	}
 
 	/*
 	 * Start up a transaction command.  All queries generated by the
@@ -1308,7 +1310,10 @@ exec_simple_query(const char *query_string)
 	}
 
 	if (save_log_statement_stats)
+	{
 		ShowUsage("QUERY STATISTICS");
+		pgstat_reset_report_waits();
+	}
 
 	TRACE_POSTGRESQL_QUERY_DONE(query_string);
 
@@ -1346,7 +1351,10 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	set_ps_display("PARSE");
 
 	if (save_log_statement_stats)
+	{
 		ResetUsage();
+		pgstat_set_report_waits();
+	}
 
 	ereport(DEBUG2,
 			(errmsg_internal("parse %s: %s",
@@ -1567,7 +1575,10 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	}
 
 	if (save_log_statement_stats)
+	{
 		ShowUsage("PARSE MESSAGE STATISTICS");
+		pgstat_reset_report_waits();
+	}
 
 	debug_query_string = NULL;
 }
@@ -1637,7 +1648,10 @@ exec_bind_message(StringInfo input_message)
 	set_ps_display("BIND");
 
 	if (save_log_statement_stats)
+	{
 		ResetUsage();
+		pgstat_set_report_waits();
+	}
 
 	/*
 	 * Start up a transaction command so we can call functions etc. (Note that
@@ -2029,7 +2043,10 @@ exec_bind_message(StringInfo input_message)
 	}
 
 	if (save_log_statement_stats)
+	{
 		ShowUsage("BIND MESSAGE STATISTICS");
+		pgstat_reset_report_waits();
+	}
 
 	debug_query_string = NULL;
 }
@@ -2122,7 +2139,10 @@ exec_execute_message(const char *portal_name, long max_rows)
 	set_ps_display(GetCommandTagName(portal->commandTag));
 
 	if (save_log_statement_stats)
+	{
 		ResetUsage();
+		pgstat_set_report_waits();
+	}
 
 	BeginCommand(portal->commandTag, dest);
 
@@ -2269,7 +2289,10 @@ exec_execute_message(const char *portal_name, long max_rows)
 	}
 
 	if (save_log_statement_stats)
+	{
 		ShowUsage("EXECUTE MESSAGE STATISTICS");
+		pgstat_reset_report_waits();
+	}
 
 	debug_query_string = NULL;
 }
@@ -3914,7 +3937,6 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx,
 #endif
 }
 
-
 /* ----------------------------------------------------------------
  * PostgresMain
  *	   postgres main loop -- all backends, interactive or otherwise start here
@@ -4795,10 +4817,13 @@ void
 ShowUsage(const char *title)
 {
 	StringInfoData str;
+	WaitAccumEntry *entry = wa_events.events;
 	struct timeval user,
 				sys;
 	struct timeval elapse_t;
 	struct rusage r;
+	bool		save_log_statement_stats = log_statement_stats;
+	int		i;
 
 	getrusage(RUSAGE_SELF, &r);
 	gettimeofday(&elapse_t, NULL);
@@ -4879,6 +4904,25 @@ ShowUsage(const char *title)
 					 r.ru_nvcsw, r.ru_nivcsw);
 #endif							/* HAVE_GETRUSAGE */
 
+	if (save_log_statement_stats)
+	{
+		for (i = 0; i < wa_events.num_events; i++, entry++)
+		{
+			const char *wait_event_type = NULL;
+			const char *wait_event = NULL;
+
+			if (entry->calls < 1)
+				continue;
+
+			wait_event_type = pgstat_get_wait_event_type(entry->wait_event_info);
+			wait_event = pgstat_get_wait_event(entry->wait_event_info);
+
+			appendStringInfo(&str, "!\t%s/%s %lu calls, %lu us elapsed\n",
+					 wait_event_type, wait_event, entry->calls,
+					 INSTR_TIME_GET_MICROSEC(entry->times));
+		}
+	}
+
 	/* remove trailing newline */
 	if (str.data[str.len - 1] == '\n')
 		str.data[--str.len] = '\0';
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 6baf67740c..61dd125341 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -32,11 +32,127 @@ static const char *pgstat_get_wait_client(WaitEventClient w);
 static const char *pgstat_get_wait_ipc(WaitEventIPC w);
 static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
 static const char *pgstat_get_wait_io(WaitEventIO w);
+static inline void accum_pgstat_report_wait_start();
+static inline void accum_pgstat_report_wait_end();
 
 
 static uint32 local_my_wait_event_info;
 uint32	   *my_wait_event_info = &local_my_wait_event_info;
 
+WaitEvents wa_events;
+instr_time waitEventStart;
+report_wait_start_function	my_pgstat_report_wait_start = standard_pgstat_report_wait_start;
+report_wait_end_function	my_pgstat_report_wait_end = standard_pgstat_report_wait_end;
+
+static WaitAccumEntry *
+pgstat_get_wait_entry(uint32 wait_event_info)
+{
+	uint32		classId;
+	uint16		eventId;
+	WaitAccumEntry *entry = NULL;
+
+	/* report process as not waiting. */
+	if (wait_event_info == 0)
+		return NULL;
+
+	classId = wait_event_info & 0xFF000000;
+	eventId = wait_event_info & 0x0000FFFF;
+
+	switch (classId)
+	{
+		case PG_WAIT_LWLOCK:
+			entry = wa_events.wa_lwlock + eventId;
+			break;
+		case PG_WAIT_LOCK:
+			entry = wa_events.wa_lock + eventId;
+			break;
+		case PG_WAIT_BUFFER_PIN:
+			entry = wa_events.wa_buffer_pin + eventId;
+			break;
+		case PG_WAIT_ACTIVITY:
+			entry = wa_events.wa_activity + eventId;
+			break;
+		case PG_WAIT_CLIENT:
+			entry = wa_events.wa_client + eventId;
+			break;
+		// FIXME
+		//  case PG_WAIT_EXTENSION:
+		//  	break;
+		case PG_WAIT_IPC:
+			entry = wa_events.wa_ipc + eventId;
+			break;
+		case PG_WAIT_TIMEOUT:
+			entry = wa_events.wa_timeout + eventId;
+			break;
+		case PG_WAIT_IO:
+			entry = wa_events.wa_io + eventId;
+			break;
+		default:
+			break;
+	}
+
+	return entry;
+}
+
+void
+pgstat_init_waitaccums(void)
+{
+	WaitAccumEntry *event;
+	int num_lwlocks;
+	int i;
+
+	num_lwlocks = LWLockGetLastTrancheId() +1;
+	wa_events.num_events = num_lwlocks
+		 + PG_WAIT_ACTIVITY_NUM
+		 + PG_WAIT_CLIENT_NUM
+		 + PG_WAIT_IPC_NUM
+		 + PG_WAIT_TIMEOUT_NUM
+		 + PG_WAIT_IO_NUM
+		 + LOCKTAG_NUM
+		 + PG_WAIT_BUFFER_PIN_NUM;
+
+	wa_events.events = MemoryContextAllocZero(TopMemoryContext,
+						  wa_events.num_events
+						  * sizeof(WaitAccumEntry));
+
+	event = wa_events.events;
+
+	// FIXME: support extensions
+
+	wa_events.wa_lwlock = event;
+	for (i = 0; i < num_lwlocks; i++, event++)
+		event->wait_event_info = PG_WAIT_LWLOCK|i;
+
+	wa_events.wa_lock = event;
+	for (i = 0; i < LOCKTAG_NUM; i++, event++)
+		event->wait_event_info = PG_WAIT_LOCK|i;
+
+	wa_events.wa_buffer_pin = event;
+	for (i = 0; i < PG_WAIT_BUFFER_PIN_NUM; i++, event++)
+		event->wait_event_info = PG_WAIT_BUFFER_PIN|i;
+
+	wa_events.wa_activity = event;
+	for (i = 0; i < PG_WAIT_ACTIVITY_NUM; i++, event++)
+		event->wait_event_info = PG_WAIT_ACTIVITY|i;
+
+	wa_events.wa_client = event;
+	for (i = 0; i < PG_WAIT_CLIENT_NUM; i++, event++)
+		event->wait_event_info = PG_WAIT_CLIENT|i;
+
+	wa_events.wa_ipc = event;
+	for (i = 0; i < PG_WAIT_IPC_NUM; i++, event++)
+		event->wait_event_info = PG_WAIT_IPC|i;
+
+	wa_events.wa_timeout = event;
+	for (i = 0; i < PG_WAIT_TIMEOUT_NUM; i++, event++)
+		event->wait_event_info = PG_WAIT_TIMEOUT|i;
+
+	wa_events.wa_io = event;
+	for (i = 0; i < PG_WAIT_IO_NUM; i++, event++)
+		event->wait_event_info = PG_WAIT_IO|i;
+
+	pgstat_reset_report_waits();
+}
 
 /*
  * Configure wait event reporting to report wait events to *wait_event_info.
@@ -732,3 +848,45 @@ pgstat_get_wait_io(WaitEventIO w)
 
 	return event_name;
 }
+
+static inline void
+accum_pgstat_report_wait_start(uint32 wait_event_info)
+{
+	standard_pgstat_report_wait_start(wait_event_info);
+
+	INSTR_TIME_SET_CURRENT(waitEventStart);
+}
+
+static inline void
+accum_pgstat_report_wait_end(void)
+{
+	WaitAccumEntry *entry;
+	instr_time  diff;
+
+	INSTR_TIME_SET_CURRENT(diff);
+	INSTR_TIME_SUBTRACT(diff, waitEventStart);
+
+	entry = pgstat_get_wait_entry(*my_wait_event_info);
+
+	if (entry)
+	{
+		entry->calls++;
+		INSTR_TIME_ADD(entry->times, diff);
+	}
+
+	standard_pgstat_report_wait_end();
+}
+
+void
+pgstat_set_report_waits(void)
+{
+	my_pgstat_report_wait_start = accum_pgstat_report_wait_start;
+	my_pgstat_report_wait_end = accum_pgstat_report_wait_end;
+}
+
+void
+pgstat_reset_report_waits(void)
+{
+	my_pgstat_report_wait_start = standard_pgstat_report_wait_start;
+	my_pgstat_report_wait_end = standard_pgstat_report_wait_end;
+}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 14056f5347..cf4db57bff 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -2380,3 +2380,77 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS)
 	/* Returns the record as Datum */
 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 }
+
+Datum
+pg_stat_get_waitaccum(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_WAITACCUM_COLS	4
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	WaitAccumEntry *entry = wa_events.events;
+	int i;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	for (i = 0; i < wa_events.num_events; i++, entry++)
+	{
+		Datum		values[PG_STAT_GET_WAITACCUM_COLS];
+		bool		nulls[PG_STAT_GET_WAITACCUM_COLS];
+		const char *wait_event_type = NULL;
+		const char *wait_event = NULL;
+
+		/* Initialise values and NULL flags arrays */
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		/* Fill values and NULLs */
+		{
+			uint32		raw_wait_event;
+
+			raw_wait_event = UINT32_ACCESS_ONCE(entry->wait_event_info);
+			wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+			wait_event = pgstat_get_wait_event(raw_wait_event);
+		}
+
+		values[0] = CStringGetTextDatum(wait_event_type);
+
+		values[1] = CStringGetTextDatum(wait_event);
+
+		values[2] = Int64GetDatum(entry->calls);
+
+		values[3] = UInt64GetDatum(INSTR_TIME_GET_MICROSEC(entry->times));
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index acbcae4607..b066863a97 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5284,6 +5284,15 @@
   proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc,leader_pid,query_id}',
   prosrc => 'pg_stat_get_activity' },
+{ oid => '8316',
+  descr => 'statistics: accumulated wait event stats',
+  proname => 'pg_stat_get_waitaccum', prorows => '200', proisstrict => 'f',
+  proretset => 't', provolatile => 'v', proparallel => 'r',
+  prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,text,int8,int8}',
+  proargmodes => '{o,o,o,o}',
+  proargnames => '{wait_event_type,wait_event,calls,times}',
+  prosrc => 'pg_stat_get_waitaccum' },
 { oid => '3318',
   descr => 'statistics: information about progress of backends running maintenance command',
   proname => 'pg_stat_get_progress_info', prorows => '100', proretset => 't',
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index 9b2a421c32..885664fa54 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -152,6 +152,7 @@ typedef enum LockTagType
 } LockTagType;
 
 #define LOCKTAG_LAST_TYPE	LOCKTAG_ADVISORY
+#define	LOCKTAG_NUM 		LOCKTAG_LAST_TYPE
 
 extern const char *const LockTagTypeNames[];
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index a8f052e484..e6638b2c45 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -154,6 +154,7 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
  * registration in the main shared memory segment wouldn't work for that case.
  */
 extern int	LWLockNewTrancheId(void);
+extern int	LWLockGetLastTrancheId(void);
 extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
 extern void LWLockInitialize(LWLock *lock, int tranche_id);
 
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 6c6ec2e711..9da4b714e2 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -10,6 +10,8 @@
 #ifndef WAIT_EVENT_H
 #define WAIT_EVENT_H
 
+#include "utils/memutils.h"
+#include "portability/instr_time.h"
 
 /* ----------
  * Wait Classes
@@ -25,6 +27,40 @@
 #define PG_WAIT_TIMEOUT				0x09000000U
 #define PG_WAIT_IO					0x0A000000U
 
+/* ----------
+ * WaitAccumEntry	Entry in backend/background's per-wait_event_info hash table
+ * ----------
+ */
+typedef struct WaitAccumEntry
+{
+	uint32		wait_event_info;
+	uint64		calls;
+	instr_time	times;
+} WaitAccumEntry;
+
+typedef struct WaitEvents
+{
+	uint32		num_events;
+	WaitAccumEntry *events;
+	WaitAccumEntry *wa_lwlock;
+	WaitAccumEntry *wa_lock;
+	WaitAccumEntry *wa_buffer_pin;
+	WaitAccumEntry *wa_activity;
+	WaitAccumEntry *wa_client;
+	// FIXME WaitAccumEntry *wa_extension;
+	WaitAccumEntry *wa_ipc;
+	WaitAccumEntry *wa_timeout;
+	WaitAccumEntry *wa_io;
+} WaitEvents;
+
+extern WaitEvents wa_events;
+
+/*
+ * There's only one entry in PG_WAIT_BUFFER_PIN class:
+ * PG_WAIT_BUFFER_PIN itself
+ */
+#define PG_WAIT_BUFFER_PIN_NUM 1
+
 /* ----------
  * Wait Events - Activity
  *
@@ -50,6 +86,9 @@ typedef enum
 	WAIT_EVENT_WAL_WRITER_MAIN
 } WaitEventActivity;
 
+#define	PG_WAIT_ACTIVITY_LAST_TYPE	WAIT_EVENT_WAL_WRITER_MAIN
+#define	PG_WAIT_ACTIVITY_NUM ( WAIT_EVENT_WAL_WRITER_MAIN - PG_WAIT_ACTIVITY )
+
 /* ----------
  * Wait Events - Client
  *
@@ -70,6 +109,9 @@ typedef enum
 	WAIT_EVENT_WAL_SENDER_WRITE_DATA,
 } WaitEventClient;
 
+#define	PG_WAIT_CLIENT_LAST_TYPE	WAIT_EVENT_WAL_SENDER_WRITE_DATA
+#define	PG_WAIT_CLIENT_NUM ( PG_WAIT_CLIENT_LAST_TYPE - PG_WAIT_CLIENT )
+
 /* ----------
  * Wait Events - IPC
  *
@@ -128,6 +170,9 @@ typedef enum
 	WAIT_EVENT_XACT_GROUP_UPDATE
 } WaitEventIPC;
 
+#define	PG_WAIT_IPC_LAST_TYPE	WAIT_EVENT_XACT_GROUP_UPDATE
+#define	PG_WAIT_IPC_NUM ( PG_WAIT_IPC_LAST_TYPE - PG_WAIT_IPC )
+
 /* ----------
  * Wait Events - Timeout
  *
@@ -143,6 +188,9 @@ typedef enum
 	WAIT_EVENT_VACUUM_DELAY
 } WaitEventTimeout;
 
+#define	PG_WAIT_TIMEOUT_LAST_TYPE	WAIT_EVENT_VACUUM_DELAY
+#define	PG_WAIT_TIMEOUT_NUM ( PG_WAIT_TIMEOUT_LAST_TYPE - PG_WAIT_TIMEOUT )
+
 /* ----------
  * Wait Events - IO
  *
@@ -227,14 +275,30 @@ typedef enum
 	WAIT_EVENT_LOGICAL_SUBXACT_WRITE
 } WaitEventIO;
 
+#define	PG_WAIT_IO_LAST_TYPE	WAIT_EVENT_LOGICAL_SUBXACT_WRITE
+#define	PG_WAIT_IO_NUM ( PG_WAIT_IO_LAST_TYPE - PG_WAIT_IO )
+
+/* track start time of current wait event */
+extern instr_time waitEventStart;
+
+void pgstat_init_waitaccums(void);
 
 extern const char *pgstat_get_wait_event(uint32 wait_event_info);
 extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
 static inline void pgstat_report_wait_start(uint32 wait_event_info);
+static inline void standard_pgstat_report_wait_start(uint32 wait_event_info);
 static inline void pgstat_report_wait_end(void);
+static inline void standard_pgstat_report_wait_end(void);
+extern void pgstat_set_report_waits(void);
+extern void pgstat_reset_report_waits(void);
 extern void pgstat_set_wait_event_storage(uint32 *wait_event_info);
 extern void pgstat_reset_wait_event_storage(void);
 
+typedef void (*report_wait_start_function) (uint32 wait_event_info);
+extern PGDLLIMPORT report_wait_start_function my_pgstat_report_wait_start;
+typedef void (*report_wait_end_function) ();
+extern PGDLLIMPORT report_wait_end_function my_pgstat_report_wait_end;
+
 extern PGDLLIMPORT uint32 *my_wait_event_info;
 
 
@@ -259,6 +323,12 @@ extern PGDLLIMPORT uint32 *my_wait_event_info;
  */
 static inline void
 pgstat_report_wait_start(uint32 wait_event_info)
+{
+	(*my_pgstat_report_wait_start)(wait_event_info);
+}
+
+static inline void
+standard_pgstat_report_wait_start(uint32 wait_event_info)
 {
 	/*
 	 * Since this is a four-byte field which is always read and written as
@@ -275,6 +345,13 @@ pgstat_report_wait_start(uint32 wait_event_info)
  */
 static inline void
 pgstat_report_wait_end(void)
+{
+	(*my_pgstat_report_wait_end)();
+}
+
+
+static inline void
+standard_pgstat_report_wait_end(void)
 {
 	/* see pgstat_report_wait_start() */
 	*(volatile uint32 *) my_wait_event_info = 0;
-- 
2.20.1

#64Andres Freund
andres@anarazel.de
In reply to: Jehan-Guillaume de Rorthais (#63)
Re: [Proposal] Add accumulated statistics for wait event

Hi,

On 2021-06-14 16:10:32 +0200, Jehan-Guillaume de Rorthais wrote:

In the patch in attachment, I tried to fix this by using kind of an internal
hook for pgstat_report_wait_start and pgstat_report_wait_end. This allows to
"instrument" wait events only when required, on the fly, dynamically.

That's *far worse*. You're adding an indirect function call, which requires
loading a global variable and then a far call to a different function. You're
changing a path that's ~2 instructions with minimal dependencies (and no
branches, i.e. fully out-of-order executable) to something on the order of ~15
instructions with plenty of dependencies and at least two branches (call, ret).

I doubt there's a path towards this feature without adding the necessary
infrastructure to hot-patch the code - which is obviously quite a
substantial project.

Greetings,

Andres Freund

#65Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#64)
Re: [Proposal] Add accumulated statistics for wait event

Hi,

On 2021-06-14 11:27:21 -0700, Andres Freund wrote:

On 2021-06-14 16:10:32 +0200, Jehan-Guillaume de Rorthais wrote:

In the patch in attachment, I tried to fix this by using kind of an internal
hook for pgstat_report_wait_start and pgstat_report_wait_end. This allows to
"instrument" wait events only when required, on the fly, dynamically.

That's *far worse*. You're adding an indirect function call. Which requires
loading a global variable and then a far call to a different function. You're
changing a path that's ~2 instructions with minimal dependencies (and no
branches (i.e. fully out of order executable) to something on the order of ~15
instructions with plenty dependencies and at least two branches (call, ret).

In the case at hand it might even be worse, because the external function call
will require registers to be spilled around the call. Right now wait
events "use" two registers (one for the wait event, one for my_wait_event_info),
but otherwise don't cause additional spilling. With your change you'd see
register spill/reload around both wait start and end.

Greetings,

Andres Freund

#66Jehan-Guillaume de Rorthais
jgdr@dalibo.com
In reply to: Andres Freund (#64)
Re: [Proposal] Add accumulated statistics for wait event

Hi,

On Mon, 14 Jun 2021 11:27:21 -0700
Andres Freund <andres@anarazel.de> wrote:

On 2021-06-14 16:10:32 +0200, Jehan-Guillaume de Rorthais wrote:

In the patch in attachment, I tried to fix this by using kind of an internal
hook for pgstat_report_wait_start and pgstat_report_wait_end. This allows to
"instrument" wait events only when required, on the fly, dynamically.

That's *far worse*. You're adding an indirect function call. Which requires
loading a global variable and then a far call to a different function. You're
changing a path that's ~2 instructions with minimal dependencies (and no
branches (i.e. fully out of order executable) to something on the order of ~15
instructions with plenty dependencies and at least two branches (call, ret).

Oh, I didn't realize it would affect all queries, even when log_statement_stats
was off. Thank you for your explanation.

I doubt there's a path towards this feature without adding the necessary
infrastructure to hot-patch the code - which is obviously quite a
substantial project.

Right. Sadly, this kind of project is far beyond what I can do, so I suppose
it's a dead end for me.

I'll study if/how the sampling approach can be done dynamically.

Thank you,

#67Andres Freund
andres@anarazel.de
In reply to: Jehan-Guillaume de Rorthais (#66)
Re: [Proposal] Add accumulated statistics for wait event

Hi,

On 2021-06-14 23:20:47 +0200, Jehan-Guillaume de Rorthais wrote:

On 2021-06-14 16:10:32 +0200, Jehan-Guillaume de Rorthais wrote:

In the patch in attachment, I tried to fix this by using kind of an internal
hook for pgstat_report_wait_start and pgstat_report_wait_end. This allows to
"instrument" wait events only when required, on the fly, dynamically.

That's *far worse*. You're adding an indirect function call. Which requires
loading a global variable and then a far call to a different function. You're
changing a path that's ~2 instructions with minimal dependencies (and no
branches (i.e. fully out of order executable) to something on the order of ~15
instructions with plenty dependencies and at least two branches (call, ret).

Oh, I didn't realized it would affect all queries, even when log_statement_stats
was off. Thank you for your explanation.

Maybe I just am misunderstanding what you were doing? As far as I can
tell your patch changed pgstat_report_wait_start() to be an indirect
function call - right? Then yes, this adds overhead to everything.

You *could* add a pgstat_report_wait_(start|end)_with_time() or such and
only use that in places that won't have a high frequency. But I just
don't quite see the use-case for that.
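
A rough sketch of what such a variant could look like, reusing the
waitEventStart variable from your patch (pgstat_accum_wait_time() here is a
hypothetical helper that would fold the duration into whatever per-backend
structure holds the counters):

static inline void
pgstat_report_wait_start_with_time(uint32 wait_event_info)
{
	/* the regular cheap path: publish the wait event */
	pgstat_report_wait_start(wait_event_info);
	/* additionally remember when the wait started (backend-local) */
	INSTR_TIME_SET_CURRENT(waitEventStart);
}

static inline void
pgstat_report_wait_end_with_time(void)
{
	instr_time	diff;

	INSTR_TIME_SET_CURRENT(diff);
	INSTR_TIME_SUBTRACT(diff, waitEventStart);

	/* hypothetical: accumulate the elapsed time for the current wait event */
	pgstat_accum_wait_time(*my_wait_event_info, diff);

	pgstat_report_wait_end();
}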

Greetings,

Andres Freund

#68Jehan-Guillaume de Rorthais
jgdr@dalibo.com
In reply to: Andres Freund (#67)
Re: [Proposal] Add accumulated statistics for wait event

Hi Andres,

On Mon, 14 Jun 2021 15:01:14 -0700
Andres Freund <andres@anarazel.de> wrote:

On 2021-06-14 23:20:47 +0200, Jehan-Guillaume de Rorthais wrote:

On 2021-06-14 16:10:32 +0200, Jehan-Guillaume de Rorthais wrote:

In the patch in attachment, I tried to fix this by using kind of an
internal hook for pgstat_report_wait_start and pgstat_report_wait_end.
This allows to "instrument" wait events only when required, on the fly,
dynamically.

That's *far worse*. You're adding an indirect function call. Which
requires loading a global variable and then a far call to a different
function. You're changing a path that's ~2 instructions with minimal
dependencies (and no branches (i.e. fully out of order executable) to
something on the order of ~15 instructions with plenty dependencies and
at least two branches (call, ret).

Oh, I didn't realized it would affect all queries, even when
log_statement_stats was off. Thank you for your explanation.

Maybe I just am misunderstanding what you were doing? As far as I can
tell your patch changed pgstat_report_wait_start() to be an indirect
function call - right?

Exactly.

I didn't realize this indirection would be so costly on every single call, on
top of the variable assignment itself.

Then yes, this adds overhead to everything.

I understand now, thank you for the explanation.
For my own curiosity and study, I'll remove this indirection and bench my patch
anyway.

You *could* add a pgstat_report_wait_(start|end)_with_time() or such and
only use that in places that won't have a high frequency. But I just
don't quite see the use-case for that.

Well, it could be useful if we decide to track only a subset of wait events.
In my scenario, I originally wanted to track only ClientWrite, but then I
realized this might be too specific and tried to generalize.

There are probably other ways to deal with this issue, e.g.:

* do NOT include the time lost waiting on the frontend side in the query
execution time
* expose the frontend part of the query time in log_min_duration_statement,
auto_explain and pg_stat_statements, in the same fashion we currently do with
planning and execution time
* have some wait-event sampling mechanism, either in core or as easy to
hot-load as auto_explain

Thoughts?

Thanks again!

Regards,