Buffer usage in EXPLAIN and pg_stat_statements
Here is a proposal to add buffer usage information to EXPLAIN and
contrib/pg_stat_statements. We can retrieve new values 'gets',
'reads' and 'temp':
- gets : total number of buffer pool access
- reads : total number of data file access
- temp : total number of temp file access (sort)
In EXPLAIN, we can use "EXPLAIN (ANALYZE, BUFFER) ..." syntax.
Each executor node shows buffer usage only in it; parent nodes
don't contain buffer usages in their sub nodes.
In pg_stat_statements, new 3 columns are added to the view.
We can determine queries that consume I/O bandwidth using
"SELECT * FROM pg_stat_statements ORDER BY reads DESC".
We will find out bad queries easily using those buffer and
disk access information in addition to duration statistics.
I implementd this feature using an instrumentation stack. A global
variable CurrentInstrument points top of the stack, and each
Instrumentation are linked with newly added 'prev' field.
The stack must be reset even on error because each innstrumentation
might have been deallocated already. I added codes to reset stack
in main loop of backend for the purpose.
TopInstrument is a special node that sums up all of the child nodes.
It tracks QueryDesc.totaltime and used in pg_stat_statements. There
might be another idea that walking around on planstate tree and
gathering all counters in the module, but very complex codes are
needed. I chose a simple way.
I'll write documentations if this design is accepted.
Comments welcome.
Output samples are below:
[EXPLAIN]
=# EXPLAIN (ANALYZE, BUFFER) SELECT * FROM pgbench_accounts ORDER BY bid;
QUERY PLAN
----------------------------------------------------------
Sort (cost=...) (actual ...) (gets=0 reads=0 temp=1309)
Sort Key: bid
Sort Method: external sort Disk: 10472kB
-> Seq Scan on pgbench_accounts (cost=...) (actual ...) (gets=1798 reads=1644 temp=0)
Total runtime: 75.867 ms
[contrib/pg_stat_statements]
=# SELECT query, gets, reads FROM pg_stat_statements ORDER BY gets DESC LIMIT 4;
query | gets | reads
----------------------------------------------------------------------+-------+-------
UPDATE pgbench_branches SET bbalance = bbalance + $1 WHERE bid = $2; | 58628 | 1
UPDATE pgbench_accounts SET abalance = abalance + $1 WHERE aid = $2; | 26999 | 1929
UPDATE pgbench_tellers SET tbalance = tbalance + $1 WHERE tid = $2; | 25474 | 1
SELECT abalance FROM pgbench_accounts WHERE aid = $1; | 19950 | 0
(4 rows)
Regards,
---
ITAGAKI Takahiro
NTT Open Source Software Center
Attachments:
buffer_usage-20090817.patchapplication/octet-stream; name=buffer_usage-20090817.patchDownload
diff -cpr head/contrib/auto_explain/auto_explain.c buffer_usage/contrib/auto_explain/auto_explain.c
*** head/contrib/auto_explain/auto_explain.c 2009-08-10 14:46:49.000000000 +0900
--- buffer_usage/contrib/auto_explain/auto_explain.c 2009-08-17 15:46:39.936395391 +0900
*************** PG_MODULE_MAGIC;
*** 22,27 ****
--- 22,28 ----
static int auto_explain_log_min_duration = -1; /* msec or -1 */
static bool auto_explain_log_analyze = false;
static bool auto_explain_log_verbose = false;
+ static bool auto_explain_log_buffer = false;
static int auto_explain_log_format = EXPLAIN_FORMAT_TEXT;
static bool auto_explain_log_nested_statements = false;
*************** _PG_init(void)
*** 92,97 ****
--- 93,108 ----
NULL,
NULL);
+ DefineCustomBoolVariable("auto_explain.log_buffer",
+ "EXPLAIN buffer usage for plan logging.",
+ NULL,
+ &auto_explain_log_buffer,
+ false,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL);
+
DefineCustomEnumVariable("auto_explain.log_format",
"EXPLAIN format to be used for plan logging.",
NULL,
*************** explain_ExecutorEnd(QueryDesc *queryDesc
*** 220,225 ****
--- 231,237 ----
ExplainInitState(&es);
es.analyze = (queryDesc->doInstrument && auto_explain_log_analyze);
es.verbose = auto_explain_log_verbose;
+ es.buffer = auto_explain_log_buffer;
es.format = auto_explain_log_format;
ExplainPrintPlan(&es, queryDesc);
diff -cpr head/contrib/pg_stat_statements/pg_stat_statements.c buffer_usage/contrib/pg_stat_statements/pg_stat_statements.c
*** head/contrib/pg_stat_statements/pg_stat_statements.c 2009-07-27 13:09:55.000000000 +0900
--- buffer_usage/contrib/pg_stat_statements/pg_stat_statements.c 2009-08-17 15:28:33.614089715 +0900
***************
*** 26,31 ****
--- 26,32 ----
#include "catalog/pg_type.h"
#include "executor/executor.h"
#include "executor/instrument.h"
+ #include "funcapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "pgstat.h"
*************** PG_MODULE_MAGIC;
*** 43,49 ****
#define PGSS_DUMP_FILE "global/pg_stat_statements.stat"
/* This constant defines the magic number in the stats file header */
! static const uint32 PGSS_FILE_HEADER = 0x20081202;
/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
#define USAGE_EXEC(duration) (1.0)
--- 44,50 ----
#define PGSS_DUMP_FILE "global/pg_stat_statements.stat"
/* This constant defines the magic number in the stats file header */
! static const uint32 PGSS_FILE_HEADER = 0x20090817;
/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
#define USAGE_EXEC(duration) (1.0)
*************** typedef struct Counters
*** 77,82 ****
--- 78,86 ----
int64 calls; /* # of times executed */
double total_time; /* total execution time in seconds */
int64 rows; /* total # of retrieved or affected rows */
+ int64 gets; /* total # of buffer pool access */
+ int64 reads; /* total # of data file access */
+ int64 temp; /* total # of temp file access (sort) */
double usage; /* usage factor */
} Counters;
*************** pgss_store(const char *query, const Inst
*** 633,638 ****
--- 637,645 ----
e->counters.calls += 1;
e->counters.total_time += instr->total;
e->counters.rows += rows;
+ e->counters.gets += instr->buffer_gets;
+ e->counters.reads += instr->buffer_reads;
+ e->counters.temp += instr->buffer_temp;
e->counters.usage += usage;
SpinLockRelease(&e->mutex);
}
*************** pg_stat_statements_reset(PG_FUNCTION_ARG
*** 654,660 ****
PG_RETURN_VOID();
}
! #define PG_STAT_STATEMENTS_COLS 6
/*
* Retrieve statement statistics.
--- 661,667 ----
PG_RETURN_VOID();
}
! #define PG_STAT_STATEMENTS_COLS 9
/*
* Retrieve statement statistics.
*************** pg_stat_statements(PG_FUNCTION_ARGS)
*** 688,709 ****
errmsg("materialize mode required, but it is not " \
"allowed in this context")));
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
oldcontext = MemoryContextSwitchTo(per_query_ctx);
! tupdesc = CreateTemplateTupleDesc(PG_STAT_STATEMENTS_COLS, false);
! TupleDescInitEntry(tupdesc, (AttrNumber) 1, "userid",
! OIDOID, -1, 0);
! TupleDescInitEntry(tupdesc, (AttrNumber) 2, "dbid",
! OIDOID, -1, 0);
! TupleDescInitEntry(tupdesc, (AttrNumber) 3, "query",
! TEXTOID, -1, 0);
! TupleDescInitEntry(tupdesc, (AttrNumber) 4, "calls",
! INT8OID, -1, 0);
! TupleDescInitEntry(tupdesc, (AttrNumber) 5, "total_time",
! FLOAT8OID, -1, 0);
! TupleDescInitEntry(tupdesc, (AttrNumber) 6, "rows",
! INT8OID, -1, 0);
tupstore = tuplestore_begin_heap(true, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
--- 695,708 ----
errmsg("materialize mode required, but it is not " \
"allowed in this context")));
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
oldcontext = MemoryContextSwitchTo(per_query_ctx);
! tupdesc = CreateTupleDescCopy(tupdesc);
tupstore = tuplestore_begin_heap(true, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
*************** pg_stat_statements(PG_FUNCTION_ARGS)
*** 757,762 ****
--- 756,764 ----
values[i++] = Int64GetDatumFast(tmp.calls);
values[i++] = Float8GetDatumFast(tmp.total_time);
values[i++] = Int64GetDatumFast(tmp.rows);
+ values[i++] = Int64GetDatumFast(tmp.gets);
+ values[i++] = Int64GetDatumFast(tmp.reads);
+ values[i++] = Int64GetDatumFast(tmp.temp);
Assert(i == PG_STAT_STATEMENTS_COLS);
diff -cpr head/contrib/pg_stat_statements/pg_stat_statements.sql.in buffer_usage/contrib/pg_stat_statements/pg_stat_statements.sql.in
*** head/contrib/pg_stat_statements/pg_stat_statements.sql.in 2009-01-05 07:19:59.000000000 +0900
--- buffer_usage/contrib/pg_stat_statements/pg_stat_statements.sql.in 2009-08-17 13:30:04.064102927 +0900
*************** CREATE FUNCTION pg_stat_statements(
*** 15,21 ****
OUT query text,
OUT calls int8,
OUT total_time float8,
! OUT rows int8
)
RETURNS SETOF record
AS 'MODULE_PATHNAME'
--- 15,24 ----
OUT query text,
OUT calls int8,
OUT total_time float8,
! OUT rows int8,
! OUT gets int8,
! OUT reads int8,
! OUT temp int8
)
RETURNS SETOF record
AS 'MODULE_PATHNAME'
diff -cpr head/src/backend/commands/explain.c buffer_usage/src/backend/commands/explain.c
*** head/src/backend/commands/explain.c 2009-08-10 14:46:50.000000000 +0900
--- buffer_usage/src/backend/commands/explain.c 2009-08-17 15:28:24.841134191 +0900
*************** ExplainQuery(ExplainStmt *stmt, const ch
*** 127,132 ****
--- 127,134 ----
es.verbose = defGetBoolean(opt);
else if (strcmp(opt->defname, "costs") == 0)
es.costs = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "buffer") == 0)
+ es.buffer = defGetBoolean(opt);
else if (strcmp(opt->defname, "format") == 0)
{
char *p = defGetString(opt);
*************** ExplainQuery(ExplainStmt *stmt, const ch
*** 150,155 ****
--- 152,162 ----
opt->defname)));
}
+ if (es.buffer && !es.analyze)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("EXPLAIN option BUFFER requires ANALYZE")));
+
/* Convert parameter type data to the form parser wants */
getParamListTypes(params, ¶m_types, &num_params);
*************** ExplainNode(Plan *plan, PlanState *plans
*** 923,928 ****
--- 930,954 ----
ExplainPropertyFloat("Actual Rows", rows, 0, es);
ExplainPropertyFloat("Actual Loops", nloops, 0, es);
}
+
+ if (es->buffer)
+ {
+ long num_gets = planstate->instrument->buffer_gets;
+ long num_reads = planstate->instrument->buffer_reads;
+ long num_temp = planstate->instrument->buffer_temp;
+
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ appendStringInfo(es->str, " (gets=%ld reads=%ld temp=%ld)",
+ num_gets, num_reads, num_temp);
+ }
+ else
+ {
+ ExplainPropertyLong("Buffer Gets", num_gets, es);
+ ExplainPropertyLong("Buffer Reads", num_reads, es);
+ ExplainPropertyLong("Buffer Temp", num_temp, es);
+ }
+ }
}
else if (es->analyze)
{
diff -cpr head/src/backend/executor/execMain.c buffer_usage/src/backend/executor/execMain.c
*** head/src/backend/executor/execMain.c 2009-07-30 05:56:18.000000000 +0900
--- buffer_usage/src/backend/executor/execMain.c 2009-08-17 15:33:00.323123722 +0900
***************
*** 50,55 ****
--- 50,56 ----
#include "optimizer/clauses.h"
#include "parser/parse_clause.h"
#include "parser/parsetree.h"
+ #include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
*************** standard_ExecutorRun(QueryDesc *queryDes
*** 267,272 ****
--- 268,274 ----
DestReceiver *dest;
bool sendTuples;
MemoryContext oldcontext;
+ Instrumentation *save_TopInstrument = NULL;
/* sanity checks */
Assert(queryDesc != NULL);
*************** standard_ExecutorRun(QueryDesc *queryDes
*** 282,288 ****
--- 284,294 ----
/* Allow instrumentation of ExecutorRun overall runtime */
if (queryDesc->totaltime)
+ {
InstrStartNode(queryDesc->totaltime);
+ save_TopInstrument = TopInstrument;
+ TopInstrument = queryDesc->totaltime;
+ }
/*
* extract information from the query descriptor and the query feature.
*************** standard_ExecutorRun(QueryDesc *queryDes
*** 320,326 ****
--- 326,341 ----
(*dest->rShutdown) (dest);
if (queryDesc->totaltime)
+ {
InstrStopNode(queryDesc->totaltime, estate->es_processed);
+ if (save_TopInstrument)
+ {
+ save_TopInstrument->buffer_gets += queryDesc->totaltime->buffer_gets;
+ save_TopInstrument->buffer_reads += queryDesc->totaltime->buffer_reads;
+ save_TopInstrument->buffer_temp += queryDesc->totaltime->buffer_temp;
+ }
+ TopInstrument = save_TopInstrument;
+ }
MemoryContextSwitchTo(oldcontext);
}
diff -cpr head/src/backend/executor/instrument.c buffer_usage/src/backend/executor/instrument.c
*** head/src/backend/executor/instrument.c 2009-01-02 02:23:41.000000000 +0900
--- buffer_usage/src/backend/executor/instrument.c 2009-08-17 15:36:17.857108838 +0900
***************
*** 16,22 ****
--- 16,26 ----
#include <unistd.h>
#include "executor/instrument.h"
+ #include "storage/buf_internals.h"
+ #include "storage/bufmgr.h"
+ Instrumentation *CurrentInstrument = NULL;
+ Instrumentation *TopInstrument = NULL;
/* Allocate new instrumentation structure(s) */
Instrumentation *
*************** InstrStartNode(Instrumentation *instr)
*** 37,42 ****
--- 41,50 ----
INSTR_TIME_SET_CURRENT(instr->starttime);
else
elog(DEBUG2, "InstrStartNode called twice in a row");
+
+ /* push stack */
+ instr->prev = CurrentInstrument;
+ CurrentInstrument = instr;
}
/* Exit from a plan node */
*************** InstrStopNode(Instrumentation *instr, do
*** 45,50 ****
--- 53,84 ----
{
instr_time endtime;
+ if (instr == CurrentInstrument)
+ {
+ long num_gets = ReadBufferCount + ReadLocalBufferCount;
+ long num_reads = num_gets - BufferHitCount - LocalBufferHitCount;
+ long num_temp = BufFileReadCount;
+
+ /* add to current instrucments */
+ instr->buffer_gets += num_gets;
+ instr->buffer_reads += num_reads;
+ instr->buffer_temp += num_temp;
+
+ /* add to top instrucments, too */
+ if (TopInstrument && TopInstrument != CurrentInstrument)
+ {
+ TopInstrument->buffer_gets += num_gets;
+ TopInstrument->buffer_reads += num_reads;
+ TopInstrument->buffer_temp += num_temp;
+ }
+
+ /* reset buffer usage and pop stack */
+ ResetLocalBufferUsage();
+ CurrentInstrument = instr->prev;
+ }
+ else
+ elog(WARNING, "Instrumentation stack is broken");
+
/* count the returned tuples */
instr->tuplecount += nTuples;
diff -cpr head/src/backend/storage/buffer/bufmgr.c buffer_usage/src/backend/storage/buffer/bufmgr.c
*** head/src/backend/storage/buffer/bufmgr.c 2009-06-11 23:49:01.000000000 +0900
--- buffer_usage/src/backend/storage/buffer/bufmgr.c 2009-08-17 15:14:10.747096730 +0900
*************** static bool IsForInput;
*** 79,84 ****
--- 79,94 ----
/* local state for LockBufferForCleanup */
static volatile BufferDesc *PinCountWaitBuf = NULL;
+ /* for log_[parser|planner|executor|statement]_stats */
+ static long GlobalReadBufferCount;
+ static long GlobalReadLocalBufferCount;
+ static long GlobalBufferHitCount;
+ static long GlobalLocalBufferHitCount;
+ static long GlobalBufferFlushCount;
+ static long GlobalLocalBufferFlushCount;
+ static long GlobalBufFileReadCount;
+ static long GlobalBufFileWriteCount;
+
static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
ForkNumber forkNum, BlockNumber blockNum,
*************** ShowBufferUsage(void)
*** 1620,1646 ****
float hitrate;
float localhitrate;
initStringInfo(&str);
! if (ReadBufferCount == 0)
hitrate = 0.0;
else
! hitrate = (float) BufferHitCount *100.0 / ReadBufferCount;
! if (ReadLocalBufferCount == 0)
localhitrate = 0.0;
else
! localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount;
appendStringInfo(&str,
"!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
! ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
appendStringInfo(&str,
"!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
! ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
appendStringInfo(&str,
"!\tDirect blocks: %10ld read, %10ld written\n",
! BufFileReadCount, BufFileWriteCount);
return str.data;
}
--- 1630,1658 ----
float hitrate;
float localhitrate;
+ ResetLocalBufferUsage();
+
initStringInfo(&str);
! if (GlobalReadBufferCount == 0)
hitrate = 0.0;
else
! hitrate = (float) GlobalBufferHitCount *100.0 / GlobalReadBufferCount;
! if (GlobalReadLocalBufferCount == 0)
localhitrate = 0.0;
else
! localhitrate = (float) GlobalLocalBufferHitCount *100.0 / GlobalReadLocalBufferCount;
appendStringInfo(&str,
"!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
! GlobalReadBufferCount - GlobalBufferHitCount, GlobalBufferFlushCount, hitrate);
appendStringInfo(&str,
"!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
! GlobalReadLocalBufferCount - GlobalLocalBufferHitCount, GlobalLocalBufferFlushCount, localhitrate);
appendStringInfo(&str,
"!\tDirect blocks: %10ld read, %10ld written\n",
! GlobalBufFileReadCount, GlobalBufFileWriteCount);
return str.data;
}
*************** ResetBufferUsage(void)
*** 1656,1661 ****
--- 1668,1704 ----
LocalBufferFlushCount = 0;
BufFileReadCount = 0;
BufFileWriteCount = 0;
+
+ GlobalBufferHitCount = 0;
+ GlobalReadBufferCount = 0;
+ GlobalBufferFlushCount = 0;
+ GlobalLocalBufferHitCount = 0;
+ GlobalReadLocalBufferCount = 0;
+ GlobalLocalBufferFlushCount = 0;
+ GlobalBufFileReadCount = 0;
+ GlobalBufFileWriteCount = 0;
+ }
+
+ void
+ ResetLocalBufferUsage(void)
+ {
+ GlobalReadBufferCount += ReadBufferCount;
+ GlobalReadLocalBufferCount += ReadLocalBufferCount;
+ GlobalBufferHitCount += BufferHitCount;
+ GlobalLocalBufferHitCount += LocalBufferHitCount;
+ GlobalBufferFlushCount += BufferFlushCount;
+ GlobalLocalBufferFlushCount += LocalBufferFlushCount;
+ GlobalBufFileReadCount += BufFileReadCount;
+ GlobalBufFileWriteCount += BufFileWriteCount;
+
+ BufferHitCount = 0;
+ ReadBufferCount = 0;
+ BufferFlushCount = 0;
+ LocalBufferHitCount = 0;
+ ReadLocalBufferCount = 0;
+ LocalBufferFlushCount = 0;
+ BufFileReadCount = 0;
+ BufFileWriteCount = 0;
}
/*
diff -cpr head/src/backend/tcop/postgres.c buffer_usage/src/backend/tcop/postgres.c
*** head/src/backend/tcop/postgres.c 2009-08-01 05:26:23.000000000 +0900
--- buffer_usage/src/backend/tcop/postgres.c 2009-08-17 15:32:25.347164961 +0900
***************
*** 44,49 ****
--- 44,50 ----
#include "catalog/pg_type.h"
#include "commands/async.h"
#include "commands/prepare.h"
+ #include "executor/instrument.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "libpq/pqsignal.h"
*************** PostgresMain(int argc, char *argv[], con
*** 3524,3529 ****
--- 3525,3534 ----
*/
doing_extended_query_message = false;
+ /* Reset buffer usage counters */
+ CurrentInstrument = TopInstrument = NULL;
+ ResetLocalBufferUsage();
+
/*
* Release storage left over from prior query cycle, and create a new
* query input buffer in the cleared MessageContext.
diff -cpr head/src/include/commands/explain.h buffer_usage/src/include/commands/explain.h
*** head/src/include/commands/explain.h 2009-08-10 14:46:50.000000000 +0900
--- buffer_usage/src/include/commands/explain.h 2009-08-17 12:05:08.156395775 +0900
*************** typedef struct ExplainState
*** 29,34 ****
--- 29,35 ----
bool verbose; /* be verbose */
bool analyze; /* print actual times */
bool costs; /* print costs */
+ bool buffer; /* print buffer stats */
ExplainFormat format; /* output format */
/* other states */
PlannedStmt *pstmt; /* top of plan */
diff -cpr head/src/include/executor/instrument.h buffer_usage/src/include/executor/instrument.h
*** head/src/include/executor/instrument.h 2009-01-02 02:23:59.000000000 +0900
--- buffer_usage/src/include/executor/instrument.h 2009-08-17 15:44:54.821438586 +0900
*************** typedef struct Instrumentation
*** 29,36 ****
--- 29,45 ----
double total; /* Total total time (in seconds) */
double ntuples; /* Total tuples produced */
double nloops; /* # of run cycles for this node */
+ /* Buffer usage */
+ long buffer_gets; /* # of buffer reads */
+ long buffer_reads; /* # of disk reads */
+ long buffer_temp; /* # of temp file reads */
+ /* previous node in stack */
+ struct Instrumentation *prev;
} Instrumentation;
+ extern Instrumentation *CurrentInstrument;
+ extern Instrumentation *TopInstrument;
+
extern Instrumentation *InstrAlloc(int n);
extern void InstrStartNode(Instrumentation *instr);
extern void InstrStopNode(Instrumentation *instr, double nTuples);
diff -cpr head/src/include/storage/bufmgr.h buffer_usage/src/include/storage/bufmgr.h
*** head/src/include/storage/bufmgr.h 2009-06-11 23:49:12.000000000 +0900
--- buffer_usage/src/include/storage/bufmgr.h 2009-08-17 15:28:59.853135890 +0900
*************** extern void InitBufferPoolAccess(void);
*** 175,180 ****
--- 175,181 ----
extern void InitBufferPoolBackend(void);
extern char *ShowBufferUsage(void);
extern void ResetBufferUsage(void);
+ extern void ResetLocalBufferUsage(void);
extern void AtEOXact_Buffers(bool isCommit);
extern void PrintBufferLeakWarning(Buffer buffer);
extern void CheckPointBuffers(int flags);
2009/8/17 Itagaki Takahiro <itagaki.takahiro@oss.ntt.co.jp>:
Here is a proposal to add buffer usage information to EXPLAIN and
contrib/pg_stat_statements. We can retrieve new values 'gets',
'reads' and 'temp':- gets : total number of buffer pool access
- reads : total number of data file access
- temp : total number of temp file access (sort)In EXPLAIN, we can use "EXPLAIN (ANALYZE, BUFFER) ..." syntax.
Each executor node shows buffer usage only in it; parent nodes
don't contain buffer usages in their sub nodes.In pg_stat_statements, new 3 columns are added to the view.
We can determine queries that consume I/O bandwidth using
"SELECT * FROM pg_stat_statements ORDER BY reads DESC".
We will find out bad queries easily using those buffer and
disk access information in addition to duration statistics.I implementd this feature using an instrumentation stack. A global
variable CurrentInstrument points top of the stack, and each
Instrumentation are linked with newly added 'prev' field.
The stack must be reset even on error because each innstrumentation
might have been deallocated already. I added codes to reset stack
in main loop of backend for the purpose.TopInstrument is a special node that sums up all of the child nodes.
It tracks QueryDesc.totaltime and used in pg_stat_statements. There
might be another idea that walking around on planstate tree and
gathering all counters in the module, but very complex codes are
needed. I chose a simple way.I'll write documentations if this design is accepted.
Comments welcome.Output samples are below:
[EXPLAIN]
=# EXPLAIN (ANALYZE, BUFFER) SELECT * FROM pgbench_accounts ORDER BY bid;
QUERY PLAN
----------------------------------------------------------
Sort (cost=...) (actual ...) (gets=0 reads=0 temp=1309)
Sort Key: bid
Sort Method: external sort Disk: 10472kB
-> Seq Scan on pgbench_accounts (cost=...) (actual ...) (gets=1798 reads=1644 temp=0)
Total runtime: 75.867 ms[contrib/pg_stat_statements]
=# SELECT query, gets, reads FROM pg_stat_statements ORDER BY gets DESC LIMIT 4;
query | gets | reads
----------------------------------------------------------------------+-------+-------
UPDATE pgbench_branches SET bbalance = bbalance + $1 WHERE bid = $2; | 58628 | 1
UPDATE pgbench_accounts SET abalance = abalance + $1 WHERE aid = $2; | 26999 | 1929
UPDATE pgbench_tellers SET tbalance = tbalance + $1 WHERE tid = $2; | 25474 | 1
SELECT abalance FROM pgbench_accounts WHERE aid = $1; | 19950 | 0
(4 rows)
It's should be nice
Pavel
Show quoted text
Regards,
---
ITAGAKI Takahiro
NTT Open Source Software Center--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers