Explain analyze getrusage tracking

Started by Greg Starkabout 15 years ago8 messages
#1Greg Stark
stark@mit.edu
1 attachment(s)

This is an update to my earlier patch to add getrusage resource
tracking to EXPLAIN ANALYZE.

With this patch you get something like:

QUERY PLAN
--------------------------------------------------------------------------------------------------------------
Seq Scan on i (cost=0.00..6919.44 rows=262144 width=101) (actual
time=17.240..1123.751 rows=262144 loops=1)
Resources: sys=210.000ms user=430.000ms read=33.6MB
Buffers: shared read=4298
Total runtime: 1548.651 ms
(4 rows)

The main change is to make it work under Windows. At least I think the
changes should make it work under Windows, I haven't been able to test
it. Actually I'm not to happy with the way I did it, I would be more
inclined to hack the getrusagestub,h definition of struct rusage to
have an instr_time in it so that we can use the same macros directly.
But that's more changes than I would be happy making without being
able to compile them to test them.

--
greg

Attachments:

explain-getrusage-v3.difftext/x-patch; charset=US-ASCII; name=explain-getrusage-v3.diffDownload
*** a/src/backend/commands/explain.c
--- b/src/backend/commands/explain.c
***************
*** 110,115 **** static void ExplainJSONLineEnding(ExplainState *es);
--- 110,116 ----
  static void ExplainYAMLLineStarting(ExplainState *es);
  static void escape_json(StringInfo buf, const char *str);
  static void escape_yaml(StringInfo buf, const char *str);
+ static double normalize_memory(double amount, char **unit, int *precision);
  
  
  
***************
*** 142,147 **** ExplainQuery(ExplainStmt *stmt, const char *queryString,
--- 143,150 ----
  			es.costs = defGetBoolean(opt);
  		else if (strcmp(opt->defname, "buffers") == 0)
  			es.buffers = defGetBoolean(opt);
+ 		else if (strcmp(opt->defname, "resource") == 0)
+ 			es.rusage = defGetBoolean(opt);
  		else if (strcmp(opt->defname, "format") == 0)
  		{
  			char	   *p = defGetString(opt);
***************
*** 368,373 **** ExplainOnePlan(PlannedStmt *plannedstmt, ExplainState *es,
--- 371,378 ----
  		instrument_option |= INSTRUMENT_TIMER;
  	if (es->buffers)
  		instrument_option |= INSTRUMENT_BUFFERS;
+ 	if (es->rusage)
+ 		instrument_option |= INSTRUMENT_RUSAGE;
  
  	/*
  	 * Use a snapshot with an updated command ID to ensure this query sees
***************
*** 1098,1103 **** ExplainNode(PlanState *planstate, List *ancestors,
--- 1103,1199 ----
  			break;
  	}
  
+ 	/* Show resource usage from getrusage */
+ 	if (es->rusage && es->format == EXPLAIN_FORMAT_TEXT)
+ 	{
+ 		const struct rusage *usage = &planstate->instrument->rusage;
+ 
+ 		bool has_rusage = (!INSTR_TIME_IS_ZERO_TV(usage->ru_stime) || 
+ 						   !INSTR_TIME_IS_ZERO_TV(usage->ru_utime) || 
+ 						   usage->ru_inblock > 0 || 
+ 						   usage->ru_oublock > 0);
+ 		bool has_verbose_rusage = (usage->ru_minflt   > 0 || 
+ 								   usage->ru_majflt   > 0 ||
+ 								   usage->ru_nswap    > 0 || 
+ 								   usage->ru_msgsnd   > 0 || 
+ 								   usage->ru_msgrcv   > 0 || 
+ 								   usage->ru_nsignals > 0 || 
+ 								   usage->ru_nvcsw    > 0 || 
+ 								   usage->ru_nivcsw   > 0);
+ 
+ 		if (has_rusage || (es->verbose && has_verbose_rusage))
+ 		{
+ 			appendStringInfoSpaces(es->str, es->indent *2);
+ 			appendStringInfoString(es->str, "Resources:");
+ 			
+ 			if (!INSTR_TIME_IS_ZERO_TV(usage->ru_stime)) 
+ 			{
+ 				double stime = INSTR_TIME_GET_DOUBLE_TV(usage->ru_stime);
+ 				appendStringInfo(es->str, " sys=%.3fms", stime * 1000);
+ 			}
+ 			
+ 			if (!INSTR_TIME_IS_ZERO_TV(usage->ru_utime)) 
+ 			{
+ 				double utime = INSTR_TIME_GET_DOUBLE_TV(usage->ru_utime);
+ 				appendStringInfo(es->str, " user=%.3fms", utime * 1000);
+ 			}
+ 			
+ 			if (usage->ru_inblock > 0)
+ 			{
+ 				double inblock;
+ 				char *units;
+ 				int prec;
+ 				inblock = normalize_memory((double)usage->ru_inblock * 512, &units, &prec);
+ 				appendStringInfo(es->str, " read=%.*f%s", prec, inblock, units);
+ 			}
+ 			if (usage->ru_oublock > 0)
+ 			{
+ 				double oublock;
+ 				char *units;
+ 				int prec;
+ 				oublock = normalize_memory((double)usage->ru_oublock * 512, &units, &prec);
+ 				appendStringInfo(es->str, " written=%.*f%s", prec, oublock, units);
+ 			}
+ 			if (es->verbose)
+ 			{
+ 				if (usage->ru_minflt > 0)
+ 					appendStringInfo(es->str, " minflt=%ld", usage->ru_minflt);
+ 				if (usage->ru_majflt > 0)
+ 					appendStringInfo(es->str, " majflt=%ld", usage->ru_majflt);
+ 				if (usage->ru_nswap > 0)
+ 					appendStringInfo(es->str, " nswap=%ld", usage->ru_nswap);
+ 				if (usage->ru_msgsnd > 0)
+ 					appendStringInfo(es->str, " msgsnd=%ld", usage->ru_msgsnd);
+ 				if (usage->ru_msgrcv > 0)
+ 					appendStringInfo(es->str, " msgrcv=%ld", usage->ru_msgrcv);
+ 				if (usage->ru_nsignals > 0)
+ 					appendStringInfo(es->str, " nsignals=%ld", usage->ru_nsignals);
+ 				if (usage->ru_nvcsw > 0)
+ 					appendStringInfo(es->str, " nvcsw=%ld", usage->ru_nvcsw);
+ 				if (usage->ru_nivcsw > 0)
+ 					appendStringInfo(es->str, " nivcsw=%ld", usage->ru_nivcsw);
+ 			}
+ 			appendStringInfoChar(es->str, '\n');
+ 		}
+ 	} 
+ 	else if (es->rusage) 
+ 	{
+ 		const struct rusage *usage = &planstate->instrument->rusage;
+ 
+ 		ExplainPropertyFloat("User Time", INSTR_TIME_GET_DOUBLE_TV(usage->ru_utime), 3, es);
+ 		ExplainPropertyFloat("System Time", INSTR_TIME_GET_DOUBLE_TV(usage->ru_stime), 3, es);
+ 		ExplainPropertyLong("Minor Page Faults", usage->ru_minflt, es);
+ 		ExplainPropertyLong("Major Page Faults", usage->ru_majflt, es);
+ 		ExplainPropertyLong("Swaps", usage->ru_nswap, es);
+ 		ExplainPropertyLong("Blocks Written", usage->ru_inblock, es);
+ 		ExplainPropertyLong("Blocks Read", usage->ru_oublock, es);
+ 		ExplainPropertyLong("Messages Sent", usage->ru_msgsnd, es);
+ 		ExplainPropertyLong("Messages Received", usage->ru_msgrcv, es);
+ 		ExplainPropertyLong("Signals Received", usage->ru_nsignals, es);
+ 		ExplainPropertyLong("Voluntary Context Switches", usage->ru_nvcsw, es);
+ 		ExplainPropertyLong("Involuntary Context Switches", usage->ru_nivcsw, es);
+ 	}	
+ 
  	/* Show buffer usage */
  	if (es->buffers)
  	{
***************
*** 2216,2218 **** escape_yaml(StringInfo buf, const char *str)
--- 2312,2347 ----
  {
  	escape_json(buf, str);
  }
+ 
+ /*
+  * For a quantity of bytes pick a reasonable display unit for it and
+  * return the quantity in that unit. Also return the unit name and a
+  * reasonable precision via the reference parameters.
+  */
+ 
+ static double normalize_memory(double amount, char **unit, int *precision)
+ {
+ 	static char *units[] = {"bytes", "kB", "MB", "GB", "TB", "PB"};
+ 	char **u = units, **last = units + (sizeof(units)/sizeof(*units)-1);
+ 
+ 	while (amount > 1024.0 && u < last)
+ 	{
+ 		amount /= 1024.0;
+ 		u += 1;
+ 	}
+ 
+ 	*unit = *u;
+ 
+ 	/* if it's bytes or kB then don't print decimals since that's less
+ 	 * than blocksize, otherwise always print 3 significant digits */
+ 	if (u == units || u == units+1 )
+ 		*precision = 0;
+ 	else if (amount < 10)
+ 		*precision = 2;
+ 	else if (amount < 100)
+ 		*precision = 1;
+ 	else
+ 		*precision = 0;
+ 
+ 	return amount;
+ }
*** a/src/backend/executor/instrument.c
--- b/src/backend/executor/instrument.c
***************
*** 21,26 **** BufferUsage pgBufferUsage;
--- 21,29 ----
  
  static void BufferUsageAccumDiff(BufferUsage *dst,
  					 const BufferUsage *add, const BufferUsage *sub);
+ static void ResourceUsageAccumDiff(struct rusage *dst,
+ 					   const struct rusage *add,
+ 					   const struct rusage *sub);
  
  /* Allocate new instrumentation structure(s) */
  Instrumentation *
***************
*** 40,45 **** InstrAlloc(int n, int instrument_options)
--- 43,56 ----
  			instr[i].needs_bufusage = true;
  	}
  
+ 	if (instrument_options & INSTRUMENT_RUSAGE)
+ 	{
+ 		int			i;
+ 
+ 		for (i = 0; i < n; i++)
+ 			instr[i].needs_rusage = true;
+ 	}
+ 
  	return instr;
  }
  
***************
*** 55,60 **** InstrStartNode(Instrumentation *instr)
--- 66,73 ----
  	/* initialize buffer usage per plan node */
  	if (instr->needs_bufusage)
  		instr->bufusage_start = pgBufferUsage;
+ 	if (instr->needs_rusage)
+ 		getrusage(RUSAGE_SELF, &instr->rusage_start);
  }
  
  /* Exit from a plan node */
***************
*** 82,87 **** InstrStopNode(Instrumentation *instr, double nTuples)
--- 95,118 ----
  		BufferUsageAccumDiff(&instr->bufusage,
  							 &pgBufferUsage, &instr->bufusage_start);
  
+ 	if (instr->needs_rusage)
+ 	{
+ 		struct rusage rusage;
+ 
+ 		if (INSTR_TIME_IS_ZERO_TV(instr->rusage_start.ru_utime) ||
+ 			INSTR_TIME_IS_ZERO_TV(instr->rusage_start.ru_stime))
+ 		{
+ 			elog(DEBUG2, "InstrStopNode called without rusage start");
+ 		}
+ 
+ 		getrusage(RUSAGE_SELF, &rusage);
+ 		ResourceUsageAccumDiff(&instr->rusage,
+ 							   &rusage,
+ 							   &instr->rusage_start);
+ 		INSTR_TIME_SET_ZERO_TV(instr->rusage_start.ru_utime);
+ 		INSTR_TIME_SET_ZERO_TV(instr->rusage_start.ru_stime);
+ 	}
+ 
  	/* Is this the first tuple of this cycle? */
  	if (!instr->running)
  	{
***************
*** 134,136 **** BufferUsageAccumDiff(BufferUsage *dst,
--- 165,186 ----
  	dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
  	dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
  }
+ 
+ static void
+ ResourceUsageAccumDiff(struct rusage *dst,
+ 					   const struct rusage *add,
+ 					   const struct rusage *sub)
+ {
+ 	INSTR_TIME_ACCUM_DIFF_TV(dst->ru_utime, add->ru_utime, sub->ru_utime);
+ 	INSTR_TIME_ACCUM_DIFF_TV(dst->ru_stime, add->ru_stime, sub->ru_stime);
+ 	dst->ru_minflt   += add->ru_minflt   - sub->ru_minflt;
+ 	dst->ru_majflt   += add->ru_majflt   - sub->ru_majflt;
+ 	dst->ru_nswap    += add->ru_nswap    - sub->ru_nswap;
+ 	dst->ru_inblock  += add->ru_inblock  - sub->ru_inblock;
+ 	dst->ru_oublock  += add->ru_oublock  - sub->ru_oublock;
+ 	dst->ru_msgsnd   += add->ru_msgsnd   - sub->ru_msgsnd;
+ 	dst->ru_msgrcv   += add->ru_msgrcv   - sub->ru_msgrcv;
+ 	dst->ru_nsignals += add->ru_nsignals - sub->ru_nsignals;
+ 	dst->ru_nvcsw    += add->ru_nvcsw    - sub->ru_nvcsw;
+ 	dst->ru_nivcsw   += add->ru_nivcsw   - sub->ru_nivcsw;
+ }
*** a/src/include/commands/explain.h
--- b/src/include/commands/explain.h
***************
*** 31,36 **** typedef struct ExplainState
--- 31,37 ----
  	bool		analyze;		/* print actual times */
  	bool		costs;			/* print costs */
  	bool		buffers;		/* print buffer usage */
+ 	bool		rusage;			/* print resource usage (from getrusage) */
  	ExplainFormat format;		/* output format */
  	/* other states */
  	PlannedStmt *pstmt;			/* top of plan */
*** a/src/include/executor/instrument.h
--- b/src/include/executor/instrument.h
***************
*** 15,20 ****
--- 15,22 ----
  
  #include "portability/instr_time.h"
  
+ #include <sys/time.h>
+ #include <sys/resource.h>
  
  typedef struct BufferUsage
  {
***************
*** 32,37 **** typedef enum InstrumentOption
--- 34,40 ----
  {
  	INSTRUMENT_TIMER = 1 << 0,	/* needs timer */
  	INSTRUMENT_BUFFERS = 1 << 1,	/* needs buffer usage */
+ 	INSTRUMENT_RUSAGE = 1 << 2,	/* needs resource usage (getrusage) */
  	INSTRUMENT_ALL = 0x7FFFFFFF
  } InstrumentOption;
  
***************
*** 40,56 **** typedef struct Instrumentation
--- 43,62 ----
  	/* Info about current plan cycle: */
  	bool		running;		/* TRUE if we've completed first tuple */
  	bool		needs_bufusage; /* TRUE if we need buffer usage */
+ 	bool		needs_rusage;   /* TRUE if we need resource usage */
  	instr_time	starttime;		/* Start time of current iteration of node */
  	instr_time	counter;		/* Accumulated runtime for this node */
  	double		firsttuple;		/* Time for first tuple of this cycle */
  	double		tuplecount;		/* Tuples emitted so far this cycle */
  	BufferUsage bufusage_start; /* Buffer usage at start */
+ 	struct rusage rusage_start; /* Resource usage at start (from getrusage) */
  	/* Accumulated statistics across all completed cycles: */
  	double		startup;		/* Total startup time (in seconds) */
  	double		total;			/* Total total time (in seconds) */
  	double		ntuples;		/* Total tuples produced */
  	double		nloops;			/* # of run cycles for this node */
  	BufferUsage bufusage;		/* Total buffer usage */
+ 	struct rusage rusage;		/* Total resource usage (from getrusage) */ 
  } Instrumentation;
  
  extern PGDLLIMPORT BufferUsage pgBufferUsage;
*** a/src/include/portability/instr_time.h
--- b/src/include/portability/instr_time.h
***************
*** 9,14 ****
--- 9,18 ----
   * instead.  These macros also give some breathing room to use other
   * high-precision-timing APIs on yet other platforms.
   *
+  * It exports its macros for manipulating struct timeval with names
+  * ending in _TV. This is for the benefit of other routines that have
+  * to manipulate struct timevals (such as those in getrusage)
+ 
   * The basic data type is instr_time, which all callers should treat as an
   * opaque typedef.	instr_time can store either an absolute time (of
   * unspecified reference time) or an interval.	The operations provided
***************
*** 52,70 ****
  #ifndef INSTR_TIME_H
  #define INSTR_TIME_H
  
- #ifndef WIN32
- 
  #include <sys/time.h>
  
  typedef struct timeval instr_time;
  
! #define INSTR_TIME_IS_ZERO(t)	((t).tv_usec == 0 && (t).tv_sec == 0)
  
! #define INSTR_TIME_SET_ZERO(t)	((t).tv_sec = 0, (t).tv_usec = 0)
  
! #define INSTR_TIME_SET_CURRENT(t)	gettimeofday(&(t), NULL)
  
! #define INSTR_TIME_ADD(x,y) \
  	do { \
  		(x).tv_sec += (y).tv_sec; \
  		(x).tv_usec += (y).tv_usec; \
--- 56,72 ----
  #ifndef INSTR_TIME_H
  #define INSTR_TIME_H
  
  #include <sys/time.h>
  
  typedef struct timeval instr_time;
  
! #define INSTR_TIME_IS_ZERO_TV(t)	((t).tv_usec == 0 && (t).tv_sec == 0)
  
! #define INSTR_TIME_SET_ZERO_TV(t)	((t).tv_sec = 0, (t).tv_usec = 0)
  
! #define INSTR_TIME_SET_CURRENT_TV(t)	gettimeofday(&(t), NULL)
  
! #define INSTR_TIME_ADD_TV(x,y) \
  	do { \
  		(x).tv_sec += (y).tv_sec; \
  		(x).tv_usec += (y).tv_usec; \
***************
*** 76,82 **** typedef struct timeval instr_time;
  		} \
  	} while (0)
  
! #define INSTR_TIME_SUBTRACT(x,y) \
  	do { \
  		(x).tv_sec -= (y).tv_sec; \
  		(x).tv_usec -= (y).tv_usec; \
--- 78,84 ----
  		} \
  	} while (0)
  
! #define INSTR_TIME_SUBTRACT_TV(x,y) \
  	do { \
  		(x).tv_sec -= (y).tv_sec; \
  		(x).tv_usec -= (y).tv_usec; \
***************
*** 88,94 **** typedef struct timeval instr_time;
  		} \
  	} while (0)
  
! #define INSTR_TIME_ACCUM_DIFF(x,y,z) \
  	do { \
  		(x).tv_sec += (y).tv_sec - (z).tv_sec; \
  		(x).tv_usec += (y).tv_usec - (z).tv_usec; \
--- 90,96 ----
  		} \
  	} while (0)
  
! #define INSTR_TIME_ACCUM_DIFF_TV(x,y,z) \
  	do { \
  		(x).tv_sec += (y).tv_sec - (z).tv_sec; \
  		(x).tv_usec += (y).tv_usec - (z).tv_usec; \
***************
*** 105,119 **** typedef struct timeval instr_time;
  		} \
  	} while (0)
  
! #define INSTR_TIME_GET_DOUBLE(t) \
  	(((double) (t).tv_sec) + ((double) (t).tv_usec) / 1000000.0)
  
! #define INSTR_TIME_GET_MILLISEC(t) \
  	(((double) (t).tv_sec * 1000.0) + ((double) (t).tv_usec) / 1000.0)
  
! #define INSTR_TIME_GET_MICROSEC(t) \
  	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) (t).tv_usec)
! #else							/* WIN32 */
  
  typedef LARGE_INTEGER instr_time;
  
--- 107,134 ----
  		} \
  	} while (0)
  
! #define INSTR_TIME_GET_DOUBLE_TV(t) \
  	(((double) (t).tv_sec) + ((double) (t).tv_usec) / 1000000.0)
  
! #define INSTR_TIME_GET_MILLISEC_TV(t) \
  	(((double) (t).tv_sec * 1000.0) + ((double) (t).tv_usec) / 1000.0)
  
! #define INSTR_TIME_GET_MICROSEC_TV(t) \
  	(((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) (t).tv_usec)
! 
! #ifndef WIN32
! 
! #define INSTR_TIME_IS_ZERO(t) INSTR_TIME_IS_ZERO_TV(t)
! #define INSTR_TIME_SET_ZERO(t) INSTR_TIME_SET_ZERO_TV(t)
! #define INSTR_TIME_SET_CURRENT(t) INSTR_TIME_SET_CURRENT_TV(t)
! #define INSTR_TIME_ADD(x,y) INSTR_TIME_ADD_TV(x,y)
! #define INSTR_TIME_SUBTRACT(x,y) INSTR_TIME_SUBTRACT_TV(x,y)
! #define INSTR_TIME_ACCUM_DIFF(x,y,z) INSTR_TIME_ACCUM_DIFF_TV(x,y,z)
! #define INSTR_TIME_GET_DOUBLE(t) INSTR_TIME_GET_DOUBLE_TV(t)
! #define INSTR_TIME_GET_MILLISEC(t) INSTR_TIME_GET_MILLISEC_TV(t)
! #define INSTR_TIME_GET_MICROSEC(t) INSTR_TIME_GET_MICROSEC_TV(t)
! 
! #else
  
  typedef LARGE_INTEGER instr_time;
  
*** a/src/include/rusagestub.h
--- b/src/include/rusagestub.h
***************
*** 27,32 **** struct rusage
--- 27,46 ----
  {
  	struct timeval ru_utime;	/* user time used */
  	struct timeval ru_stime;	/* system time used */
+ 	long   ru_maxrss;           /* maximum resident set size */
+ 	long   ru_ixrss;            /* integral shared memory size */
+ 	long   ru_idrss;            /* integral unshared data size */
+ 	long   ru_isrss;            /* integral unshared stack size */
+ 	long   ru_minflt;           /* page reclaims */
+ 	long   ru_majflt;           /* page faults */
+ 	long   ru_nswap;            /* swaps */
+ 	long   ru_inblock;          /* block input operations */
+ 	long   ru_oublock;          /* block output operations */
+ 	long   ru_msgsnd;           /* messages sent */
+ 	long   ru_msgrcv;           /* messages received */
+ 	long   ru_nsignals;         /* signals received */
+ 	long   ru_nvcsw;            /* voluntary context switches */
+ 	long   ru_nivcsw;           /* involuntary context switches */
  };
  
  extern int	getrusage(int who, struct rusage * rusage);
*** a/src/port/getrusage.c
--- b/src/port/getrusage.c
***************
*** 40,45 **** getrusage(int who, struct rusage * rusage)
--- 40,46 ----
  	FILETIME	kerneltime;
  	FILETIME	usertime;
  	ULARGE_INTEGER li;
+ 	IOCOUNTERS iocounters;
  
  	if (who != RUSAGE_SELF)
  	{
***************
*** 61,66 **** getrusage(int who, struct rusage * rusage)
--- 62,73 ----
  		return -1;
  	}
  
+ 	if (GetProcessIoCounters(GetCurrentProcess(), &iocounters) == 0)
+ 	{
+ 		_dosmaperr(GetLastError());
+ 		return -1;
+ 	}
+ 
  	/* Convert FILETIMEs (0.1 us) to struct timeval */
  	memcpy(&li, &kerneltime, sizeof(FILETIME));
  	li.QuadPart /= 10L;			/* Convert to microseconds */
***************
*** 71,76 **** getrusage(int who, struct rusage * rusage)
--- 78,87 ----
  	li.QuadPart /= 10L;			/* Convert to microseconds */
  	rusage->ru_utime.tv_sec = li.QuadPart / 1000000L;
  	rusage->ru_utime.tv_usec = li.QuadPart % 1000000L;
+ 
+ 	rusage->ru_inblock = iocounters.ReadTransferCount/512;
+ 	rusage->ru_oublock = iocounters.WriteTransferCount/512;
+ 
  #else							/* all but WIN32 */
  
  	struct tms	tms;
#2Robert Haas
robertmhaas@gmail.com
In reply to: Greg Stark (#1)
Re: Explain analyze getrusage tracking

On Sun, Nov 14, 2010 at 9:33 PM, Greg Stark <stark@mit.edu> wrote:

This is an update to my earlier patch to add getrusage resource
tracking to EXPLAIN ANALYZE.

With this patch you get something like:

                                                 QUERY PLAN
--------------------------------------------------------------------------------------------------------------
 Seq Scan on i  (cost=0.00..6919.44 rows=262144 width=101) (actual
time=17.240..1123.751 rows=262144 loops=1)
  Resources: sys=210.000ms user=430.000ms read=33.6MB
  Buffers: shared read=4298
 Total runtime: 1548.651 ms
(4 rows)

The main change is to make it work under Windows. At least I think the
changes should make it work under Windows, I haven't been able to test
it. Actually I'm not to happy with the way I did it, I would be more
inclined to hack the getrusagestub,h definition of struct rusage to
have an instr_time in it so that we can use the same macros directly.
But that's more changes than I would be happy making without being
able to compile them to test them.

I don't really think these changes to the INSTR macros make much
sense. The macros don't really add much notational convenience;
they're mostly wrappers to make the WIN32 and non-WIN32 cases work
similarly for the instrumentation stuff, so hacking them up to use
them for this doesn't seem like it adds anything. Just do whatever it
is you need to do, or define macros locally in explain.c.

It doesn't make much sense to me to normalize the memory for this
output to a variable unit when the other memory values we use in
explain.c are still going to be printed as kB. I think we should just
print it in kB and call it good. Alternatively, we could apply the
same normalization algorithm across the board, but I don't think
that's as good.

I continue to feel strongly that the choice of EXPLAIN format should
only affect the format, not the choice of information to be displayed.
Using the verbose option to control how much data the resource option
prints is, I think, not a good idea. If you want to have two modes,
one for partial rusage data and one for full rusage data, you can just
as easily implement EXPLAIN (RESOURCE [PARTIAL|FULL]). I believe that
the existing grammar is adequate to support that; you'd just need to
write the appropriate DefElem-parsing code. But personally I'd just
print the whole kit and kaboodle regardless.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#3Greg Stark
stark@mit.edu
In reply to: Robert Haas (#2)
Re: Explain analyze getrusage tracking

On Tue, Nov 16, 2010 at 2:43 AM, Robert Haas <robertmhaas@gmail.com> wrote:

I don't really think these changes to the INSTR macros make much
sense.  The macros don't really add much notational convenience;
they're mostly wrappers to make the WIN32 and non-WIN32 cases work
similarly for the instrumentation stuff, so hacking them up to use
them for this doesn't seem like it adds anything.  Just do whatever it
is you need to do, or define macros locally in explain.c.

Well they're going to be just like the macros in instr_time and I
didn't really want to duplicate them. What's really goofy here is that
on Windows the getrusage portability wrapper is actually consing up
those times from FILETIMEs which I believe are actually just like the
instr_time data type that we're already using in those macros so it's
going of doing a silly dance converting to struct timeval just so it
can use macros that aren't convenient at all on Windows.

I definitely agree this is awkward. I would love to find a cleaner
solution here. I'll try copying the macros and see if that works out
more cleanly.

It doesn't make much sense to me to normalize the memory for this
output to a variable unit when the other memory values we use in
explain.c are still going to be printed as kB.  I think we should just
print it in kB and call it good.  Alternatively, we could apply the
same normalization algorithm across the board, but I don't think
that's as good.

I think we should have a project policy of always printing memory and
disk usage in kB, MB, GB etc unless they're functions returning an
integer intended for machine use. Effectively this is the dual of
accepting units on all our memory gucs. I don't know about others but
I find it pretty hard to read things like 1234567kB and compare it to
125765kB in my head.

I continue to feel strongly that the choice of EXPLAIN format should
only affect the format, not the choice of information to be displayed.

I don't have a good feeling for what should appear in
XML/JSON/whetever. I thought I was just implementing the consensus
which was that there wasn't much point in not including the maximum
level of detail because the consumer could decide what to display. But
having it be the same as the human readable output is also reasonable.
Whatever we're doing elsewhere I'll make this the consistent.

 Using the verbose option to control how much data the resource option
prints is, I think, not a good idea.  If you want to have two modes,
one for partial rusage data and one for full rusage data, you can just
as easily implement EXPLAIN (RESOURCE [PARTIAL|FULL]).  I believe that
the existing grammar is adequate to support that; you'd just need to
write the appropriate DefElem-parsing code.  But personally I'd just
print the whole kit and kaboodle regardless.

This is a separate question. The numbers I'm outputing without VERBOSE
now are the numbers that I understand and which relate to the other
numbers in explain. My intent was that as we understand how the other
numbers relate to Postgres functioning we'll learn which ones to
include. But mostly they're hard to relate to Postgres and the query
execution in any way and just noise that are hard to explain to users.
I'm not even sure which ones it makes sense to sum over time -- is the
integral virtual memory usage over time something that it makes sense
to sum over time? What units is it in?

I don't see why VERBOSE isn't the right key to use. The other things
VERBOSE toggles are also extra detail that's usually useless but might
be useful if you're interested in the inner workings. Things like
which fields are being included at each level of the plan for example.

--
greg

#4Robert Haas
robertmhaas@gmail.com
In reply to: Greg Stark (#3)
Re: Explain analyze getrusage tracking

On Tue, Nov 16, 2010 at 12:19 PM, Greg Stark <stark@mit.edu> wrote:

On Tue, Nov 16, 2010 at 2:43 AM, Robert Haas <robertmhaas@gmail.com> wrote:

I don't really think these changes to the INSTR macros make much
sense. The macros don't really add much notational convenience;
they're mostly wrappers to make the WIN32 and non-WIN32 cases work
similarly for the instrumentation stuff, so hacking them up to use
them for this doesn't seem like it adds anything. Just do whatever it
is you need to do, or define macros locally in explain.c.

Well they're going to be just like the macros in instr_time and I
didn't really want to duplicate them. What's really goofy here is that
on Windows the getrusage portability wrapper is actually consing up
those times from FILETIMEs which I believe are actually just like the
instr_time data type that we're already using in those macros so it's
going of doing a silly dance converting to struct timeval just so it
can use macros that aren't convenient at all on Windows.

I definitely agree this is awkward. I would love to find a cleaner
solution here. I'll try copying the macros and see if that works out
more cleanly.

OK.

It doesn't make much sense to me to normalize the memory for this
output to a variable unit when the other memory values we use in
explain.c are still going to be printed as kB. I think we should just
print it in kB and call it good. Alternatively, we could apply the
same normalization algorithm across the board, but I don't think
that's as good.

I think we should have a project policy of always printing memory and
disk usage in kB, MB, GB etc unless they're functions returning an
integer intended for machine use. Effectively this is the dual of
accepting units on all our memory gucs. I don't know about others but
I find it pretty hard to read things like 1234567kB and compare it to
125765kB in my head.

We have no other place in the system that takes 12345678kB and
converts it inexactly to MB or GB.

rhaas=# set work_mem to '1048576kB';
SET
rhaas=# show work_mem;
work_mem
----------
1GB
(1 row)

rhaas=# set work_mem to '1048577kB';
SET
rhaas=# show work_mem;
work_mem
-----------
1048577kB
(1 row)

We could decide that for text-format EXPLAIN output, an inexact
conversion is OK, but it's still not OK to do it for the new fields
you're adding and not for the existing fields (see the sort and hash
instrumentation). Personally, I'm not really convinced that making
such a change has a lot of value, and I think it should be submitted
as a separate patch and discussed separately, rather than being rolled
in here. But if we are going to change it then we at least need to be
consistent.

Using the verbose option to control how much data the resource option
prints is, I think, not a good idea. If you want to have two modes,
one for partial rusage data and one for full rusage data, you can just
as easily implement EXPLAIN (RESOURCE [PARTIAL|FULL]). I believe that
the existing grammar is adequate to support that; you'd just need to
write the appropriate DefElem-parsing code. But personally I'd just
print the whole kit and kaboodle regardless.

This is a separate question. The numbers I'm outputing without VERBOSE
now are the numbers that I understand and which relate to the other
numbers in explain. My intent was that as we understand how the other
numbers relate to Postgres functioning we'll learn which ones to
include. But mostly they're hard to relate to Postgres and the query
execution in any way and just noise that are hard to explain to users.
I'm not even sure which ones it makes sense to sum over time -- is the
integral virtual memory usage over time something that it makes sense
to sum over time? What units is it in?

I suspect that the ru_maxrss, ru_ixrss, ru_idrss, and ru_isrss values
are useless to us for EXPLAIN purposes because it sounds like they
don't increment over time. The others presumably do, so it makes
sense to show 'em if the system is collecting them.

I don't see why VERBOSE isn't the right key to use. The other things
VERBOSE toggles are also extra detail that's usually useless but might
be useful if you're interested in the inner workings. Things like
which fields are being included at each level of the plan for example.

Yeah, VERBOSE is kind of a catch-all for things that we don't have
individual flags for. But I think it's better for each piece of data
to depend on one setting, rather than a combination of two or more
settings. Otherwise you end up being forced to use VERBOSE and then
you get this deluge of output. I'd actually sort of like to remove
some things from VERBOSE and give them their own settings, rather than
adding more. The fact that VERBOSE turns on "Output:" is particularly
annoying.

...Robert

#5Greg Stark
stark@mit.edu
In reply to: Greg Stark (#1)
Re: Explain analyze getrusage tracking

On Tue, Nov 16, 2010 at 11:38 AM, Robert Haas <robertmhaas@gmail.com> wrote:

I think we should have a project policy of always printing memory and
disk usage in kB, MB, GB etc unless they're functions returning an
integer intended for machine use.

rhaas=# set work_mem to '1048577kB';

Interesting. Though in this case I'll mention the raw data is in 512
byte increments so that would imply having to print .5 sometimes.

We could decide that for text-format EXPLAIN output, an inexact
conversion is OK, but it's still not OK to do it for the new fields
you're adding and not for the existing fields (see the sort and hash
instrumentation).  Personally, I'm not really convinced that making
such a change has a lot of value, and I think it should be submitted
as a separate patch and discussed separately, rather than being rolled
in here.  But if we are going to change it then we at least need to be
consistent.

Well I proposed that as a separate patch back pre-9.0 and it was
problematic. Having a policy of doing x doesn't mean we have to switch
everything to x or do nothing. We can have a policy that it's better
to do x and then go around fixing things as we find them.

But fine, I'll separate that into a separate patch.

I suspect that the ru_maxrss, ru_ixrss, ru_idrss, and ru_isrss values
are useless to us for EXPLAIN purposes because it sounds like they
don't increment over time.  The others presumably do, so it makes
sense to show 'em if the system is collecting them.

I'm not sure, the "integral" part might mean it is ok to sum. I've
never seen it explained anywhere exactly what these represent. But the
other question is whether they have anything to do with the plan
nodes. If Postgres is taking lots of memory or receiving signals or
receiving sysv messages (we don't even use sysv messages so that would
be very strange) it doesn't necessarily have anything to do with the
executor algorithm. They're probably just not very interesting stats
to present to the user.

Yeah, VERBOSE is kind of a catch-all for things that we don't have
individual flags for.  But I think it's better for each piece of data
to depend on one setting, rather than a combination of two or more
settings.  Otherwise you end up being forced to use VERBOSE and then
you get this deluge of output.  I'd actually sort of like to remove
some things from VERBOSE and give them their own settings, rather than
adding more.  The fact that VERBOSE turns on "Output:" is particularly
annoying.

I tend to think it's "don't be clever about showing me just the useful
stuff, include whatever you've got even if you think it's not useful".
I would consider it a bug if there's anything that requires VERBOSE
and a user finds is relevant to a fixing a user problem (as opposed to
debugging a postgres bug).

I'm concerned about the converse problem. The "I why do I have to
include two dozen flags to get postgres to actually include
everything?". Along with the "try turning on this flag and resending
the plan. Hm, oops forgot a flag resend it again with this too. Oh
that's still not including something we need try it a third time with
this flag" etc.

--
greg

#6Robert Haas
robertmhaas@gmail.com
In reply to: Greg Stark (#5)
Re: Explain analyze getrusage tracking

On Tue, Nov 16, 2010 at 2:53 PM, Greg Stark <stark@mit.edu> wrote:

Yeah, VERBOSE is kind of a catch-all for things that we don't have
individual flags for.  But I think it's better for each piece of data
to depend on one setting, rather than a combination of two or more
settings.  Otherwise you end up being forced to use VERBOSE and then
you get this deluge of output.  I'd actually sort of like to remove
some things from VERBOSE and give them their own settings, rather than
adding more.  The fact that VERBOSE turns on "Output:" is particularly
annoying.

I tend to think it's "don't be clever about showing me just the useful
stuff, include whatever you've got even if you think it's not useful".
I would consider it a bug if there's anything that requires VERBOSE
and a user finds is relevant to a fixing a user problem (as opposed to
debugging a postgres bug).

I'm concerned about the converse problem. The "I why do I have to
include two dozen flags to get postgres to actually include
everything?". Along with the "try turning on this flag and resending
the plan. Hm, oops forgot a flag resend it again with this too. Oh
that's still not including something we need try it a third time with
this flag" etc.

Well, the reason's pretty obvious in this case: each of ANALYZE,
BUFFERS, and RESOURCE adds a huge amount of incremental overhead. I
wouldn't object to having a flag that says "just turn on absolutely
everything you've got". Then you could just say:

EXPLAIN (THECRAPOUTOFIT) query...

In any case, this seems like an argument for making RESOURCE print
everything we think might ever be useful, rather than just some subset
of it. A rule that says "print some of the RESOURCE information when
RESOURCE is specified but the rest only if VERBOSE is also specified
or if a non-text output format is used" seems to have exactly the sort
of ickiness you're complaining about; I'd rather make it all or
nothing.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#7Magnus Hagander
magnus@hagander.net
In reply to: Greg Stark (#1)
Re: Explain analyze getrusage tracking

On Mon, Nov 15, 2010 at 03:33, Greg Stark <stark@mit.edu> wrote:

This is an update to my earlier patch to add getrusage resource
tracking to EXPLAIN ANALYZE.

With this patch you get something like:

                                                 QUERY PLAN
--------------------------------------------------------------------------------------------------------------
 Seq Scan on i  (cost=0.00..6919.44 rows=262144 width=101) (actual
time=17.240..1123.751 rows=262144 loops=1)
  Resources: sys=210.000ms user=430.000ms read=33.6MB
  Buffers: shared read=4298
 Total runtime: 1548.651 ms
(4 rows)

The main change is to make it work under Windows. At least I think the
changes should make it work under Windows, I haven't been able to test
it. Actually I'm not to happy with the way I did it, I would be more
inclined to hack the getrusagestub,h definition of struct rusage to
have an instr_time in it so that we can use the same macros directly.
But that's more changes than I would be happy making without being
able to compile them to test them.

I tried building this under windows, and got a bunch of errors.

First and easiest - you need to rename IOCOUNTERS to IO_COUNTERS in
getrusage.c :P

But then I get a number of:
c:\pgsql\src\include\portability/instr_time.h(133) : error C2371: 'instr_time' :
redefinition; different basic types

and
.\src\backend\utils\adt\pgstatfuncs.c(1345) : error C2039: 'QuadPart' : is not a
member of 'timeval'
C:\Program Files\Microsoft SDKs\Windows\v6.1\include\winsock2.h(176) : s
ee declaration of 'timeval'

which believe are related to the same issue. Haven't looked close
enough to figure out what you actually intend for it to be :-)

Finally, a number of:
c:\pgsql\src\include\executor/instrument.h(19) : fatal error C1083: Cannot open
include file: 'sys/resource.h': No such file or directory

include files simply doesn't exist on Windows. Hiding it behind an
#ifdef complains about fields missing in struct rusage in some cases
and lack of existing rusage definition in others. I think you need
includes of pg_rusage.h, which will make sure it brings in
rusagestub.h when necessary and sys/resource.h when it's there?

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#8Greg Stark
stark@mit.edu
In reply to: Magnus Hagander (#7)
Re: Explain analyze getrusage tracking

On Mon, Nov 22, 2010 at 12:40 PM, Magnus Hagander <magnus@hagander.net> wrote:

I tried building this under windows, and got a bunch of errors.

Thanks!

--
greg