From a59c9778b9a1cdd98d0ea58f1a6a391efa84b007 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:16:59 -0700 Subject: [PATCH v7 5/8] Optimize measuring WAL/buffer usage through stack-based instrumentation Previously, in order to determine the buffer/WAL usage of a given code section, we utilized continuously incrementing global counters that get updated when the actual activity (e.g. shared block read) occurred, and then calculated a diff when the code section ended. This resulted in a bottleneck for executor node instrumentation specifically, with the function BufferUsageAccumDiff showing up in profiles and in some cases adding up to 10% overhead to an EXPLAIN (ANALYZE, BUFFERS) run. Instead, introduce a stack-based mechanism, where the actual activity writes into the current stack entry. In the case of executor nodes, this means that each node gets its own stack entry that is pushed at InstrStartNode, and popped at InstrEndNode. Stack entries are zero initialized (avoiding the diff mechanism) and get added to their parent entry when they are finalized, i.e. no more modifications can occur. To correctly handle abort situations, any use of instrumentation stacks must involve either a top-level QueryInstrumentation struct, and its associated InstrQueryStart/InstrQueryStop helpers (which use resource owners to handle aborts), or the Instrumentation struct itself with dedicated PG_TRY/PG_FINALLY calls that ensure the stack is in a consistent state after an abort. This also drops the global pgBufferUsage, any callers interested in measuring buffer activity should instead utilize InstrStart/InstrStop. The related global pgWalUsage is kept for now due to its use in pgstat to track aggregate WAL activity and heap_page_prune_and_freeze for measuring FPIs. Author: Lukas Fittl Reviewed-by: Discussion: --- contrib/auto_explain/auto_explain.c | 16 +- .../pg_stat_statements/pg_stat_statements.c | 87 +--- src/backend/access/brin/brin.c | 10 +- src/backend/access/gin/gininsert.c | 10 +- src/backend/access/heap/vacuumlazy.c | 15 +- src/backend/access/nbtree/nbtsort.c | 10 +- src/backend/commands/analyze.c | 31 +- src/backend/commands/explain.c | 26 +- src/backend/commands/explain_dr.c | 31 +- src/backend/commands/prepare.c | 27 +- src/backend/commands/vacuumparallel.c | 10 +- src/backend/executor/execMain.c | 66 ++- src/backend/executor/execParallel.c | 8 +- src/backend/executor/execProcnode.c | 84 +++- src/backend/executor/instrument.c | 389 +++++++++++++----- src/backend/storage/buffer/bufmgr.c | 6 +- src/backend/utils/activity/pgstat_io.c | 6 +- src/include/executor/execdesc.h | 4 +- src/include/executor/executor.h | 2 + src/include/executor/instrument.h | 179 +++++++- src/include/utils/resowner.h | 1 + src/tools/pgindent/typedefs.list | 2 + 22 files changed, 720 insertions(+), 300 deletions(-) diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index 39bf2543b70..4be81489ff4 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -305,19 +305,9 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) if (auto_explain_enabled()) { - /* - * Set up to track total elapsed time in ExecutorRun. Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. */ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -382,7 +372,7 @@ explain_ExecutorEnd(QueryDesc *queryDesc) oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); /* Log plan if duration is exceeded. */ - msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); + msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total); if (msec >= auto_explain_log_min_duration) { ExplainState *es = NewExplainState(); diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 388b068ccec..8448f9c13fa 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -909,22 +909,11 @@ pgss_planner(Query *parse, && pgss_track_planning && query_string && parse->queryId != INT64CONST(0)) { - instr_time start; - instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + /* Track time and buffer/WAL usage as the planner can access them. */ + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -938,30 +927,20 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, parse->stmt_len, PGSS_PLAN, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), 0, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, @@ -1013,19 +992,9 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) */ if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0)) { - /* - * Set up to track total elapsed time in ExecutorRun. Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. */ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -1087,10 +1056,10 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) queryDesc->plannedstmt->stmt_location, queryDesc->plannedstmt->stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), + INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + &queryDesc->totaltime->instr.bufusage, + &queryDesc->totaltime->instr.walusage, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1154,17 +1123,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, !IsA(parsetree, ExecuteStmt) && !IsA(parsetree, PrepareStmt)) { - instr_time start; - instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -1180,6 +1143,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); @@ -1194,9 +1158,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * former value, which'd otherwise be a good idea. */ - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - /* * Track the total number of rows retrieved or affected by the utility * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED @@ -1208,23 +1169,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, saved_stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), rows, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 146ee97a47d..5ab571b29fa 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2435,8 +2435,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2887,6 +2887,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2936,7 +2937,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2951,7 +2952,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index c7e38dbe193..bb91cc600eb 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -984,8 +984,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2110,6 +2110,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2178,7 +2179,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2193,7 +2194,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 82c5b28e0ad..b4cbd0e682c 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -641,8 +641,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; + QueryInstrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -658,6 +657,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -983,14 +984,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + instr = InstrQueryStopFinalize(instr); + if (verbose || params.log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_vacuum_min_duration)) { long secs_dur; int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; StringInfoData buf; char *msgfmt; int32 diff; @@ -999,12 +1000,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; + BufferUsage bufferusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); total_blks_hit = bufferusage.shared_blks_hit + bufferusage.local_blks_hit; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 69ef1527e06..dfe4fd9459c 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1465,8 +1465,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -1752,6 +1752,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1827,7 +1828,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1837,7 +1838,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 53adac9139b..38f8b379fa4 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -308,9 +308,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, Oid save_userid; int save_sec_context; int save_nestlevel; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - BufferUsage bufferusage; + QueryInstrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -361,6 +359,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params, } pg_rusage_init(&ru0); + + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -741,12 +742,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + instr = InstrQueryStopFinalize(instr); + if (verbose || params.log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_analyze_min_duration)) { long delay_in_ms; - WalUsage walusage; double read_rate = 0; double write_rate = 0; char *msgfmt; @@ -754,18 +756,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + BufferUsage bufusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; /* * We do not expect an analyze to take > 25 days and it simplifies diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 389181b8d9b..aa76f68bd10 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -322,14 +322,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - instr_time planstart, - planduration; - BufferUsage bufusage_start, - bufusage; + QueryInstrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instr = InstrQueryAlloc(INSTRUMENT_TIMER | INSTRUMENT_BUFFERS); + else + instr = InstrQueryAlloc(INSTRUMENT_TIMER); + if (es->memory) { /* @@ -346,15 +348,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrQueryStart(instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + instr = InstrQueryStopFinalize(instr); if (es->memory) { @@ -362,16 +361,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 3c96061cf32..6868d8972ac 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -110,15 +110,20 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContext oldcontext; StringInfo buf = &myState->buf; int natts = typeinfo->natts; - instr_time start, - end; - BufferUsage instr_start; + QueryInstrumentation *instr = NULL; /* only measure time, buffers if requested */ - if (myState->es->timing) - INSTR_TIME_SET_CURRENT(start); - if (myState->es->buffers) - instr_start = pgBufferUsage; + if (myState->es->timing || myState->es->buffers) + { + InstrumentOption instrument_options = 0; + + if (myState->es->timing) + instrument_options |= INSTRUMENT_TIMER; + if (myState->es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + instr = InstrQueryAlloc(instrument_options); + InstrQueryStart(instr); + } /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) @@ -186,18 +191,16 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextSwitchTo(oldcontext); MemoryContextReset(myState->tmpcontext); + if (myState->es->timing || myState->es->buffers) + instr = InstrQueryStopFinalize(instr); + /* Update timing data */ if (myState->es->timing) - { - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start); - } + INSTR_TIME_ADD(myState->metrics.timeSpent, instr->instr.total); /* Update buffer metrics */ if (myState->es->buffers) - BufferUsageAccumDiff(&myState->metrics.bufferUsage, - &pgBufferUsage, - &instr_start); + BufferUsageAdd(&myState->metrics.bufferUsage, &instr->instr.bufusage); return true; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 5b86a727587..d81f6b30e9c 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -578,13 +578,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - instr_time planstart; - instr_time planduration; - BufferUsage bufusage_start, - bufusage; + QueryInstrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + int instrument_options = INSTRUMENT_TIMER; + + if (es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + + instr = InstrQueryAlloc(instrument_options); if (es->memory) { @@ -596,9 +599,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrQueryStart(instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -633,8 +634,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + instr = InstrQueryStopFinalize(instr); if (es->memory) { @@ -642,13 +642,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - plan_list = cplan->stmt_list; /* Explain each query */ @@ -658,7 +651,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 279108ca89f..75074fe4efa 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -308,8 +308,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgBufferUsage or - * pgWalUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -995,6 +995,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1084,7 +1085,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1092,7 +1093,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index c0b174cfbc0..82253317e96 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -76,6 +76,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -329,9 +330,28 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrQueryAlloc(estate->es_instrument); + if (queryDesc->totaltime) - InstrStart(queryDesc->totaltime); + { + /* Allow instrumentation of Executor overall runtime */ + InstrQueryStart(queryDesc->totaltime); + + /* + * Remember all node entries for abort recovery. We do this once here + * after the first call to InstrQueryStart has pushed the parent + * entry. + */ + if ((estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) && + !queryDesc->already_executed) + ExecRememberNodeInstrumentation(queryDesc->planstate, + queryDesc->totaltime); + } /* * extract information from the query descriptor and the query feature. @@ -383,7 +403,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrQueryStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); } @@ -433,7 +453,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStart(queryDesc->totaltime); + InstrQueryStart(queryDesc->totaltime); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -442,8 +462,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per-node and trigger statistics to their respective parent + * instrumentation stacks. + * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and the + * leader's own ExecFinalizeNodeInstrumentation handles propagation. If + * we accumulated here, the leader would double-count: worker parent nodes + * would already include their children's stats, and then the leader's + * accumulation would add the children again. + */ + if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker()) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + queryDesc->totaltime = InstrQueryStopFinalize(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); @@ -1484,6 +1522,24 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti && (ti->instr.need_bufusage || ti->instr.need_walusage)) + InstrAccum(instr_stack.current, &ti->instr); + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index c153d5c1c3b..e6ad86cb887 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -694,7 +694,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, * * If EXPLAIN is not in use and there are no extensions loaded that care, * we could skip this. But we have no way of knowing whether anyone's - * looking at pgBufferUsage, so do it unconditionally. + * looking at instrumentation, so do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -1456,6 +1456,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1516,7 +1517,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1532,7 +1533,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. */ diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 1846661b503..c788b5b00f9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,8 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -787,10 +789,10 @@ ExecShutdownNode_walker(PlanState *node, void *context) * at least once already. We don't expect much CPU consumption during * node shutdown, but in the case of Gather or Gather Merge, we may shut * down workers at this stage. If so, their buffer usage will get - * propagated into pgBufferUsage at this point, and we want to make sure - * that it gets associated with the Gather node. We skip this if the node - * has never been executed, so as to avoid incorrectly making it appear - * that it has. + * propagated into the current instrumentation stack entry at this point, + * and we want to make sure that it gets associated with the Gather node. + * We skip this if the node has never been executed, so as to avoid + * incorrectly making it appear that it has. */ if (node->instrument && node->instrument->running) InstrStartNode(node->instrument); @@ -828,6 +830,80 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecRememberNodeInstrumentation + * + * Register all per-node instrumentation entries as unfinalized children of + * the executor's instrumentation. This is needed for abort recovery: if the + * executor aborts, we need to walk each per-node entry to recover buffer/WAL + * data from nodes that never got finalized, that someone might be interested + * in as an aggregate. + */ +void +ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent) +{ + (void) ExecRememberNodeInstrumentation_walker(node, parent); +} + +static bool +ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) +{ + QueryInstrumentation *parent = (QueryInstrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + if (node->instrument) + InstrQueryRememberNode(parent, node->instrument); + + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); +} + +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + Instrumentation *parent = (Instrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing our + * instrumentation as the parent context. This ensures children can + * accumulate to us even if they were never executed by the leader (e.g. + * nodes beneath Gather that only workers ran). + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + node->instrument ? &node->instrument->instr : parent); + + if (!node->instrument) + return false; + + node->instrument = InstrFinalizeNode(node->instrument, parent); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 6a4a08ebb0c..1afa5e94960 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,25 +16,31 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" -BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +Instrumentation instr_top; +InstrStackState instr_stack = {0, 0, NULL, &instr_top}; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); -static void WalUsageAdd(WalUsage *dst, WalUsage *add); - - -/* General purpose instrumentation handling */ -Instrumentation * -InstrAlloc(int instrument_options) +void +InstrStackGrow(void) { - Instrumentation *instr = palloc0(sizeof(Instrumentation)); - InstrInitOptions(instr, instrument_options); - return instr; + if (instr_stack.entries == NULL) + { + instr_stack.stack_space = 10; /* Allocate sufficient initial space + * for typical activity */ + instr_stack.entries = MemoryContextAlloc(TopMemoryContext, + sizeof(Instrumentation *) * instr_stack.stack_space); + } + else + { + instr_stack.stack_space *= 2; + instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, instr_stack.stack_space); + } } +/* General purpose instrumentation handling */ void InstrInitOptions(Instrumentation *instr, int instrument_options) { @@ -54,38 +60,249 @@ InstrStart(Instrumentation *instr) INSTR_TIME_SET_CURRENT(instr->starttime); } - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + InstrPushStack(instr); +} + +static void +InstrStopTimer(Instrumentation *instr) +{ + instr_time endtime; + + /* let's update the time only if the timer was requested */ + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStop called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + INSTR_TIME_SET_ZERO(instr->starttime); } void InstrStop(Instrumentation *instr) { - instr_time endtime; + if (instr->need_timer) + InstrStopTimer(instr); + + if (instr->need_bufusage || instr->need_walusage) + InstrPopStack(instr); +} + +/* + * Stops instrumentation, finalizes the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not the current + * top, as can happen with PG_FINALLY, or resource owners, which don't have a + * guaranteed cleanup order. + * + * We are careful here to achieve two goals: + * + * 1) Reset the stack to the parent of whichever of the released stack entries + * has the lowest index + * 2) Accumulate all instrumentation to the currently active instrumentation, + * so that callers get a complete picture of activity, even after an abort + */ +void +InstrStopFinalize(Instrumentation *instr) +{ + int idx = -1; + + for (int i = instr_stack.stack_size - 1; i >= 0; i--) + { + if (instr_stack.entries[i] == instr) + { + idx = i; + break; + } + } + + if (idx >= 0) + { + while (instr_stack.stack_size > idx + 1) + instr_stack.stack_size--; + + InstrPopStack(instr); + } - /* let's update the time only if the timer was requested */ if (instr->need_timer) + InstrStopTimer(instr); + + InstrAccum(instr_stack.current, instr); +} + + +/* Query instrumentation handling */ + +/* + * Use ResourceOwner mechanism to correctly reset instr_stack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res); + slist_mutable_iter iter; + + /* Accumulate data from all unfinalized child node entries. */ + slist_foreach_modify(iter, &qinstr->unfinalized_children) { - if (INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStop called without start"); + NodeInstrumentation *child = slist_container(NodeInstrumentation, unfinalized_node, iter.cur); - INSTR_TIME_SET_CURRENT(endtime); - INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + InstrAccum(&qinstr->instr, &child->instr); - INSTR_TIME_SET_ZERO(instr->starttime); + /* + * Free NodeInstrumentation now, since InstrFinalizeNode won't be + * called + */ + pfree(child); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + /* Ensure the stack is reset as expected, and we accumulate to the parent */ + InstrStopFinalize(&qinstr->instr); + + /* Free QueryInstrumentation now, since InstrStop won't be called */ + pfree(qinstr); +} + +/* + * Allocate in TopMemoryContext so that the Instrumentation survives + * transaction abort — ResourceOwner release needs to access it. + */ +QueryInstrumentation * +InstrQueryAlloc(int instrument_options) +{ + QueryInstrumentation *instr = MemoryContextAllocZero(TopMemoryContext, sizeof(QueryInstrumentation)); + + InstrInitOptions(&instr->instr, instrument_options); + return instr; +} + +void +InstrQueryStart(QueryInstrumentation *qinstr) +{ + InstrStart(&qinstr->instr); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage) + { + Assert(CurrentResourceOwner != NULL); + qinstr->owner = CurrentResourceOwner; + + ResourceOwnerEnlarge(qinstr->owner); + ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr); + } +} + +void +InstrQueryStop(QueryInstrumentation *qinstr) +{ + InstrStop(&qinstr->instr); + + if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage) + { + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; + } +} + +QueryInstrumentation * +InstrQueryStopFinalize(QueryInstrumentation *qinstr) +{ + QueryInstrumentation *copy; + + InstrStopFinalize(&qinstr->instr); + + if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage) + { + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; + } + + /* + * Copy to the current memory context so the caller doesn't need to + * explicitly free the TopMemoryContext allocation. + */ + copy = palloc(sizeof(QueryInstrumentation)); + memcpy(copy, qinstr, sizeof(QueryInstrumentation)); + pfree(qinstr); + return copy; +} + +/* + * Register a child NodeInstrumentation entry for abort processing. + * + * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to + * recover buffer/WAL data from entries that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + */ +void +InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *child) +{ + if (child->instr.need_bufusage || child->instr.need_walusage) + slist_push_head(&parent->unfinalized_children, &child->unfinalized_node); +} + +/* start instrumentation during parallel executor startup */ +QueryInstrumentation * +InstrStartParallelQuery(void) +{ + QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrQueryStart(qinstr); + return qinstr; +} + +/* report usage after parallel executor shutdown */ +void +InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage) +{ + qinstr = InstrQueryStopFinalize(qinstr); + memcpy(bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage)); + memcpy(walusage, &qinstr->instr.walusage, sizeof(WalUsage)); +} + +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If it's active the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes. + * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. + */ +void +InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + BufferUsageAdd(&instr_stack.current->bufusage, bufusage); + WalUsageAdd(&instr_stack.current->walusage, walusage); + + WalUsageAdd(&pgWalUsage, walusage); } /* Node instrumentation handling */ @@ -94,7 +311,13 @@ InstrStop(Instrumentation *instr) NodeInstrumentation * InstrAllocNode(int instrument_options, bool async_mode) { - NodeInstrumentation *instr = palloc(sizeof(NodeInstrumentation)); + /* + * We can utilize TopTransactionContext instead of TopMemoryContext here + * because nodes don't get used for utility commands that restart + * transactions, which would require a context that survives longer + * (EXPLAIN ANALYZE is fine). + */ + NodeInstrumentation *instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation)); InstrInitNode(instr, instrument_options); instr->async_mode = async_mode; @@ -117,6 +340,7 @@ InstrStartNode(NodeInstrumentation *instr) InstrStart(&instr->instr); } + /* Exit from a plan node */ void InstrStopNode(NodeInstrumentation *instr, double nTuples) @@ -146,14 +370,12 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples) INSTR_TIME_SET_ZERO(instr->instr.starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->instr.need_bufusage) - BufferUsageAccumDiff(&instr->instr.bufusage, - &pgBufferUsage, &instr->instr.bufusage_start); - - if (instr->instr.need_walusage) - WalUsageAccumDiff(&instr->instr.walusage, - &pgWalUsage, &instr->instr.walusage_start); + /* + * Only pop the stack, accumulation runs in + * ExecFinalizeNodeInstrumentation + */ + if (instr->instr.need_bufusage || instr->instr.need_walusage) + InstrPopStack(&instr->instr); /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -172,6 +394,22 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples) } } +/* Add per-node instrumentation to the parent and move into per-query memory context */ +NodeInstrumentation * +InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent) +{ + NodeInstrumentation *dst = palloc(sizeof(NodeInstrumentation)); + + memcpy(dst, instr, sizeof(NodeInstrumentation)); + pfree(instr); + + /* Accumulate node's buffer/WAL usage to the parent */ + if (dst->instr.need_bufusage || dst->instr.need_walusage) + InstrAccum(parent, &dst->instr); + + return dst; +} + /* Update tuple count */ void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) @@ -188,8 +426,8 @@ InstrEndLoop(NodeInstrumentation *instr) if (!instr->running) return; - if (!INSTR_TIME_IS_ZERO(instr->instr.starttime)) - elog(ERROR, "InstrEndLoop called on running node"); + /* Ensure InstrNodeStop was called */ + Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime)); /* Accumulate per-cycle statistics into totals */ INSTR_TIME_ADD(instr->startup, instr->firsttuple); @@ -259,38 +497,27 @@ InstrStartTrigger(TriggerInstrumentation *tginstr) void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) { + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ InstrStop(&tginstr->instr); tginstr->firings += firings; } -/* note current values during parallel executor startup */ -void -InstrStartParallelQuery(void) -{ - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; -} - -/* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrAccum(Instrumentation *dst, Instrumentation *add) { - memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); - memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); -} + Assert(dst != NULL); + Assert(add != NULL); -/* accumulate work done by workers in leader's stats */ -void -InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) -{ - BufferUsageAdd(&pgBufferUsage, bufusage); - WalUsageAdd(&pgWalUsage, walusage); + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); } /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -311,39 +538,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ +/* dst += add */ void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - -/* helper functions for WAL usage accumulation */ -static void -WalUsageAdd(WalUsage *dst, WalUsage *add) +WalUsageAdd(WalUsage *dst, const WalUsage *add) { dst->wal_bytes += add->wal_bytes; dst->wal_records += add->wal_records; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 59b098ba0a0..1ff50aecc10 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1261,9 +1261,9 @@ PinBufferForBlock(Relation rel, if (rel) { /* - * While pgBufferUsage's "read" counter isn't bumped unless we reach - * WaitReadBuffers() (so, not for hits, and not for buffers that are - * zeroed instead), the per-relation stats always count them. + * While the current buffer usage "read" counter isn't bumped unless + * we reach WaitReadBuffers() (so, not for hits, and not for buffers + * that are zeroed instead), the per-relation stats always count them. */ pgstat_count_buffer_read(rel); if (*foundPtr) diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 9e7a88ec0d0..60400f0c81f 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc) * pg_stat_database only counts block read and write times, these are done for * IOOP_READ, IOOP_WRITE and IOOP_EXTEND. * - * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats - * for shared, local and temporary blocks. pg_stat_io does not track the - * activity of temporary blocks, so these are ignored here. + * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has + * write and read stats for shared, local and temporary blocks. pg_stat_io + * does not track the activity of temporary blocks, so these are ignored here. */ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index d3a57242844..340029a2034 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -51,8 +51,8 @@ typedef struct QueryDesc /* This field is set by ExecutePlan */ bool already_executed; /* true if previously executed */ - /* This is always set NULL by the core system, but plugins can change it */ - struct Instrumentation *totaltime; /* total time spent in ExecutorRun */ + /* This field is set by ExecutorRun, or plugins */ + struct QueryInstrumentation *totaltime; /* total time spent in ExecutorRun */ } QueryDesc; /* in pquery.c */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index d46ba59895d..c22199c6869 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -300,6 +300,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent); +extern void ExecFinalizeNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 1139be8333e..cc33b32af1e 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -68,10 +69,22 @@ typedef enum InstrumentOption } InstrumentOption; /* - * General purpose instrumentation that can capture time and WAL/buffer usage + * Instrumentation base class for capturing time and WAL/buffer usage * - * Initialized through InstrAlloc, followed by one or more calls to a pair of - * InstrStart/InstrStop (activity is measured inbetween). + * If used directly: + * - Allocate on the stack and zero initialize the struct + * - Call InstrInitOptions to set instrumentation options + * - Call InstrStart before the activity you want to measure + * - Call InstrStop / InstrStopFinalize after the activity to capture totals + * + * InstrStart/InstrStop may be called multiple times. The last stop call must + * be to InstrStopFinalize to ensure parent stack entries get the accumulated + * totals. If there is risk of transaction aborts you must call + * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the + * instrumentation stack. + * + * In a query context use QueryInstrumentation instead, which handles aborts + * using the resource owner logic. */ typedef struct Instrumentation { @@ -81,16 +94,52 @@ typedef struct Instrumentation bool need_walusage; /* true if we need WAL usage data */ /* Internal state keeping: */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ BufferUsage bufusage; /* total buffer usage */ WalUsage walusage; /* total WAL usage */ } Instrumentation; +/* + * Query-related instrumentation tracking. + * + * Usage: + * - Allocate on the heap using InstrQueryAlloc (required for abort handling) + * - Call InstrQueryStart before the activity you want to measure + * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals + * + * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop + * call must be to InstrQueryStopFinalize to ensure parent stack entries get + * the accumulated totals. + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of the top level transaction after having called + * InstrQueryStart, without first calling InstrQueryStop. In the case of a + * transaction abort, logic equivalent to InstrQueryStop will be called + * automatically. + */ +struct ResourceOwnerData; +typedef struct QueryInstrumentation +{ + Instrumentation instr; + + /* Resource owner used for cleanup for aborts between InstrStart/InstrStop */ + struct ResourceOwnerData *owner; + + /* + * NodeInstrumentation child entries that need to be cleaned up on abort, + * since they are not registered as a resource owner themselves. + */ + slist_head unfinalized_children; /* head of unfinalized children list */ +} QueryInstrumentation; + /* * Specialized instrumentation for per-node execution statistics + * + * Relies on an outer QueryInstrumentation having been set up to handle the + * stack used for WAL/buffer usage statistics, and relies on it for managing + * aborts. Solely intended for the executor and anyone reporting about its + * activities (e.g. EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -109,8 +158,15 @@ typedef struct NodeInstrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ + + /* Abort handling */ + slist_node unfinalized_node; /* node in parent's unfinalized list */ } NodeInstrumentation; +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ @@ -124,19 +180,110 @@ typedef struct TriggerInstrumentation * was fired */ } TriggerInstrumentation; -extern PGDLLIMPORT BufferUsage pgBufferUsage; +/* + * Dynamic array-based stack for tracking current WAL/buffer usage context. + * + * When the stack is empty, 'current' points to instr_top which accumulates + * session-level totals. + */ +typedef struct InstrStackState +{ + int stack_space; /* allocated capacity of entries array */ + int stack_size; /* current number of entries */ + + Instrumentation **entries; /* dynamic array of pointers */ + Instrumentation *current; /* top of stack, or &instr_top when empty */ +} InstrStackState; + extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int instrument_options); +/* + * The top instrumentation represents a running total of the current backend + * WAL/buffer usage information. This will not be updated immediately, but + * rather when the current stack entry gets accumulated which typically happens + * at query end. + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. + */ +extern PGDLLIMPORT Instrumentation instr_top; + +/* + * The instrumentation stack state. The 'current' field points to the + * currently active stack entry that is getting updated as activity happens, + * and will be accumulated to parent stacks when it gets finalized by + * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStackState instr_stack; + +extern void InstrStackGrow(void); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * instrumentation entry. + * + * Any caller using this directly must manage the passed in entry and call + * InstrPopStack on its own again, typically by using a PG_FINALLY block to + * ensure the stack gets reset via InstrPopStack on abort. Use InstrStart + * instead when you want automatic handling of abort cases using the resource + * owner infrastructure. + */ +static inline void +InstrPushStack(Instrumentation *instr) +{ + if (unlikely(instr_stack.stack_size == instr_stack.stack_space)) + InstrStackGrow(); + + instr_stack.entries[instr_stack.stack_size++] = instr; + instr_stack.current = instr; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, instead call InstrPopAndFinalizeStack which can skip intermediate + * stack entries, or instead use InstrStart/InstrStop. + */ +static inline void +InstrPopStack(Instrumentation *instr) +{ + Assert(instr_stack.stack_size > 0); + Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr); + instr_stack.stack_size--; + instr_stack.current = instr_stack.stack_size > 0 + ? instr_stack.entries[instr_stack.stack_size - 1] + : &instr_top; +} + extern void InstrInitOptions(Instrumentation *instr, int instrument_options); extern void InstrStart(Instrumentation *instr); extern void InstrStop(Instrumentation *instr); +extern void InstrStopFinalize(Instrumentation *instr); +extern void InstrAccum(Instrumentation *dst, Instrumentation *add); + +extern QueryInstrumentation *InstrQueryAlloc(int instrument_options); +extern void InstrQueryStart(QueryInstrumentation *instr); +extern void InstrQueryStop(QueryInstrumentation *instr); +extern QueryInstrumentation *InstrQueryStopFinalize(QueryInstrumentation *instr); +extern void InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *instr); + +pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage); +extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern NodeInstrumentation *InstrAllocNode(int instrument_options, bool async_mode); extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); extern void InstrStartNode(NodeInstrumentation *instr); extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern NodeInstrumentation *InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent); extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); @@ -145,31 +292,31 @@ extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); extern void InstrStartTrigger(TriggerInstrumentation *tginstr); extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); +extern void WalUsageAdd(WalUsage *dst, const WalUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); #define INSTR_BUFUSAGE_INCR(fld) do { \ - pgBufferUsage.fld++; \ + instr_stack.current->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ - pgBufferUsage.fld += val; \ + instr_stack.current->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ - INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + instr_stack.current->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += val; \ + instr_stack.current->walusage.fld += val; \ } while(0) #endif /* INSTRUMENT_H */ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb..5463bc921f0 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7dc0073ab68..5580a080210 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1320,6 +1320,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStackState Instrumentation Int128AggState Int8TransTypeData @@ -2428,6 +2429,7 @@ QueryCompletion QueryDesc QueryEnvironment QueryInfo +QueryInstrumentation QueryItem QueryItemType QueryMode -- 2.47.1