From 77161a6f2c0eb78189e8595c0f276c5fe4fcfa2a Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:16:59 -0700 Subject: [PATCH v6 5/8] Optimize measuring WAL/buffer usage through stack-based instrumentation Previously, in order to determine the buffer/WAL usage of a given code section, we utilized continuously incrementing global counters that get updated when the actual activity (e.g. shared block read) occurred, and then calculated a diff when the code section ended. This resulted in a bottleneck for executor node instrumentation specifically, with the function BufferUsageAccumDiff showing up in profiles and in some cases adding up to 10% overhead to an EXPLAIN (ANALYZE, BUFFERS) run. Instead, introduce a stack-based mechanism, where the actual activity writes into the current stack entry. In the case of executor nodes, this means that each node gets its own stack entry that is pushed at InstrStartNode, and popped at InstrEndNode. Stack entries are zero initialized (avoiding the diff mechanism) and get added to their parent entry when they are finalized, i.e. no more modifications can occur. To correctly handle abort situations, any use of instrumentation stacks must involve either a top-level Instrumentation struct, and its associated InstrStart/InstrStop helpers (which use resource owners to handle aborts), or dedicated PG_TRY/PG_FINALLY calls that ensure the stack is in a consistent state after an abort. 
Author: Lukas Fittl Reviewed-by: Discussion: --- .../pg_stat_statements/pg_stat_statements.c | 107 +++--- src/backend/access/brin/brin.c | 6 +- src/backend/access/gin/gininsert.c | 6 +- src/backend/access/nbtree/nbtsort.c | 6 +- src/backend/commands/explain.c | 8 +- src/backend/commands/vacuumparallel.c | 6 +- src/backend/executor/execMain.c | 60 ++- src/backend/executor/execParallel.c | 6 +- src/backend/executor/execProcnode.c | 79 ++++ src/backend/executor/instrument.c | 346 +++++++++++++++--- src/include/executor/executor.h | 2 + src/include/executor/instrument.h | 116 +++++- src/include/utils/resowner.h | 1 + src/tools/pgindent/typedefs.list | 1 + 14 files changed, 621 insertions(+), 129 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 388b068ccec..fe3237fc665 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -911,21 +911,13 @@ pgss_planner(Query *parse, { instr_time start; instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + /* We need to track buffer/WAL usage as the planner can access them. */ + InstrPushStack(&stack); + nesting_level++; PG_TRY(); { @@ -938,6 +930,7 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrPopAndFinalizeStack(&stack); nesting_level--; } PG_END_TRY(); @@ -945,14 +938,6 @@ pgss_planner(Query *parse, INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - /* calc differences of buffer counters. 
*/ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, @@ -960,8 +945,8 @@ pgss_planner(Query *parse, PGSS_PLAN, INSTR_TIME_GET_MILLISEC(duration), 0, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1082,6 +1067,13 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { + /* + * Check if stack is initialized - it is not when ExecutorRun wasn't + * called + */ + const BufferUsage *bufusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->bufusage : NULL; + const WalUsage *walusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->walusage : NULL; + pgss_store(queryDesc->sourceText, queryId, queryDesc->plannedstmt->stmt_location, @@ -1089,8 +1081,8 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) PGSS_EXEC, INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + bufusage, + walusage, queryDesc->estate->es_jit ? 
&queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1157,14 +1149,10 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, instr_time start; instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + InstrPushStack(&stack); nesting_level++; PG_TRY(); @@ -1180,6 +1168,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrPopAndFinalizeStack(&stack); nesting_level--; } PG_END_TRY(); @@ -1208,14 +1197,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. 
*/ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, @@ -1223,8 +1204,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, PGSS_EXEC, INSTR_TIME_GET_MILLISEC(duration), rows, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1454,27 +1435,33 @@ pgss_store(const char *query, int64 queryId, } } entry->counters.rows += rows; - entry->counters.shared_blks_hit += bufusage->shared_blks_hit; - entry->counters.shared_blks_read += bufusage->shared_blks_read; - entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; - entry->counters.shared_blks_written += bufusage->shared_blks_written; - entry->counters.local_blks_hit += bufusage->local_blks_hit; - entry->counters.local_blks_read += bufusage->local_blks_read; - entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; - entry->counters.local_blks_written += bufusage->local_blks_written; - entry->counters.temp_blks_read += bufusage->temp_blks_read; - entry->counters.temp_blks_written += bufusage->temp_blks_written; - entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); - entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); - entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); - entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); - entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); - entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + if (bufusage) + { + entry->counters.shared_blks_hit += bufusage->shared_blks_hit; + entry->counters.shared_blks_read += bufusage->shared_blks_read; + entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; + 
entry->counters.shared_blks_written += bufusage->shared_blks_written; + entry->counters.local_blks_hit += bufusage->local_blks_hit; + entry->counters.local_blks_read += bufusage->local_blks_read; + entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; + entry->counters.local_blks_written += bufusage->local_blks_written; + entry->counters.temp_blks_read += bufusage->temp_blks_read; + entry->counters.temp_blks_written += bufusage->temp_blks_written; + entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); + entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); + entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); + entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); + entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); + entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + } entry->counters.usage += USAGE_EXEC(total_time); - entry->counters.wal_records += walusage->wal_records; - entry->counters.wal_fpi += walusage->wal_fpi; - entry->counters.wal_bytes += walusage->wal_bytes; - entry->counters.wal_buffers_full += walusage->wal_buffers_full; + if (walusage) + { + entry->counters.wal_records += walusage->wal_records; + entry->counters.wal_fpi += walusage->wal_fpi; + entry->counters.wal_bytes += walusage->wal_bytes; + entry->counters.wal_buffers_full += walusage->wal_buffers_full; + } if (jitusage) { entry->counters.jit_functions += jitusage->created_functions; diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 6887e421442..c1c3d03b6ed 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2885,6 +2885,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + 
Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2934,7 +2935,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2949,7 +2950,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 0d63fb4ba27..9149d735d59 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -2108,6 +2108,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2176,7 +2177,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2191,7 +2192,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + 
&bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 3a45508f62e..600f23ff2a6 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1750,6 +1750,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1825,7 +1826,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1835,7 +1836,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index d7f33644d79..17405aa8621 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2288,9 +2288,9 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->stack.walusage); /* Prepare per-worker buffer/WAL usage */ if 
(es->workers_state && (es->buffers || es->wal) && es->verbose) @@ -2307,9 +2307,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->stack.bufusage); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); } } diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index c3b3c9ea21a..10ba717bb6b 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -994,6 +994,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1083,7 +1084,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1091,7 +1092,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index c0b174cfbc0..f01f1c864c4 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -76,6 +76,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only 
used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -329,10 +330,27 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrAlloc(estate->es_instrument); + if (queryDesc->totaltime) + { + /* Allow instrumentation of Executor overall runtime */ InstrStart(queryDesc->totaltime); + /* + * Remember all node stacks for abort recovery. We do this once here + * after the first call to InstrStart has allocated the parent stack. + */ + if (queryDesc->totaltime->stack && !queryDesc->already_executed) + ExecRememberNodeInstrumentation(queryDesc->planstate, + queryDesc->totaltime->stack); + } + /* * extract information from the query descriptor and the query feature. */ @@ -383,7 +401,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, false); MemoryContextSwitchTo(oldcontext); } @@ -442,8 +460,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per-node and trigger statistics to their respective parent + * instrumentation stacks. 
+ * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and the + * leader's own ExecFinalizeNodeInstrumentation handles propagation. If + * we accumulated here, the leader would double-count: worker parent nodes + * would already include their children's stats, and then the leader's + * accumulation would add the children again. + */ + if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker()) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, true); MemoryContextSwitchTo(oldcontext); @@ -1484,6 +1520,24 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti && ti->instr.stack) + InstrStackAdd(CurrentInstrStack, ti->instr.stack); + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index fb9a4afbbf1..dee8bf99c93 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -1455,6 +1455,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1515,7 +1516,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count 
buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1531,7 +1532,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. */ diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 1846661b503..e2e21c66dc9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,8 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -828,6 +830,83 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecRememberNodeInstrumentation + * + * Register all per-node instrumentation stacks as unfinalized children of the + * executor's instrumentation stack. This is needed for abort recovery: if the + * executor aborts, we need to walk each per-node instrumentation stack to + * recover buffer/WAL data from nodes that never got finalized, that someone + * might be interested in as an aggregate. 
+ */ +void +ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent) +{ + (void) ExecRememberNodeInstrumentation_walker(node, parent); +} + +static bool +ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) +{ + InstrStack *parent = (InstrStack *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + if (node->instrument && (node->instrument->need_bufusage || + node->instrument->need_walusage)) + { + InstrRememberNodeStack(parent, &node->instrument->stack); + } + + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); +} + +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation stack). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, (InstrStack *) CurrentInstrStack); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + InstrStack *parent = (InstrStack *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing our stack + * as the parent context. This ensures children can accumulate to us even + * if they were never executed by the leader (e.g. nodes beneath Gather + * that only workers ran, where stack.previous would not be initialized). + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + node->instrument ? 
&node->instrument->stack : parent); + + if (!node->instrument) + return false; + + node->instrument = InstrFinalizeNode(node->instrument, parent); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 34daadc88fc..1223a59879f 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,15 +16,139 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +InstrStack TopInstrStack; +InstrStack *CurrentInstrStack = &TopInstrStack; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +static void InstrFinalizeNodesOnAbort(InstrStack *stack); + +/* + * Use ResourceOwner mechanism to correctly reset CurrentInstrStack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStack *stack) +{ + ResourceOwnerRemember(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStack *stack) +{ + ResourceOwnerForget(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static bool +StackIsParent(InstrStack *stack, InstrStack *entry) +{ + if (entry->previous == NULL) + return false; + + if (entry->previous == stack) + return true; + + return StackIsParent(stack, entry->previous); +} + +/* + * OrphanSkippedStacks + * + * Orphans all 
stack entries from the current stack entry to the provided + stack which is assumed to be a parent stack of the current stack, and + terminates once the current stack entry has reached the provided stack. + * + * This sets previous pointers of intermediate stack entries to NULL, so we + don't have to worry about calling StackIsParent with a bad pointer in + ResOwnerReleaseInstrumentation. + * + * This matters because we may process stack entries out of order in aborts + because (1) we might have a mix of ResOwner and PG_FINALLY owned stacks + (2) ResOwnerReleaseInstrumentation might be called out of order. + */ +static void +OrphanSkippedStacks(InstrStack *stack) +{ + if (CurrentInstrStack == stack || !StackIsParent(stack, CurrentInstrStack)) + return; + + while (CurrentInstrStack != stack) + { + InstrStack *previous = CurrentInstrStack->previous; + + Assert(previous != NULL); + CurrentInstrStack->previous = NULL; + CurrentInstrStack = previous; + } +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + InstrStack *stack = (InstrStack *) DatumGetPointer(res); + + /* + * Because registered resources are *not* cleaned up in a guaranteed + * order, we may get a child context after we've processed the parent. + * Thus, we only pop the stack if it's not already a parent of the stack + * being released. Note that OrphanSkippedStacks may have set our previous + * stack entry to NULL, in which case we don't modify the stack either. + * + * Note that StackIsParent will recurse as needed, so it is inadvisable to + * use deeply nested stacks. + */ + if (stack->previous && !StackIsParent(CurrentInstrStack, stack)) + { + OrphanSkippedStacks(stack); + InstrPopStack(stack); + } + + /* Accumulate data from all unfinalized child node stacks. */ + InstrFinalizeNodesOnAbort(stack); + + /* + * Accumulate the stack associated with the ResOwner to the active stack. 
+ * + * Note that we intentionally directly add to the current stack instead of + * the parent of the stack being released, because this can execute out of + * order. Explicit PG_FINALLY blocks might have modified the stack as + * well. + */ + InstrStackAdd(CurrentInstrStack, stack); + + /* Free the stack entry now since InstrStop won't be called */ + pfree(stack); +} + +/* + * Pops the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not + * CurrentInstrStack, as can happen with PG_FINALLY, and orphans any + * intermediate stacks that were skipped. + */ +void +InstrPopAndFinalizeStack(InstrStack *stack) +{ + OrphanSkippedStacks(stack); + InstrPopStack(stack); + InstrStackAdd(CurrentInstrStack, stack); +} /* General purpose instrumentation handling */ Instrumentation * @@ -51,15 +175,31 @@ InstrStart(Instrumentation *instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStart called twice in a row"); - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + Assert(CurrentResourceOwner != NULL); + instr->owner = CurrentResourceOwner; + + /* + * Allocate the stack resource in a memory context that survives + * during an abort. This will be freed by InstrStop (regular + * execution) or ResOwnerReleaseInstrumentation (abort). + * + * We don't do this in InstrAlloc to avoid allocating when InstrStart + * + InstrStop isn't called. 
+ */ + if (instr->stack == NULL) + instr->stack = MemoryContextAllocZero(TopMemoryContext, sizeof(InstrStack)); + + ResourceOwnerEnlarge(instr->owner); + ResourceOwnerRememberInstrStack(instr->owner, instr->stack); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + InstrPushStack(instr->stack); + } } void -InstrStop(Instrumentation *instr) +InstrStop(Instrumentation *instr, bool finalize) { instr_time endtime; @@ -75,14 +215,31 @@ InstrStop(Instrumentation *instr) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); - - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + InstrPopStack(instr->stack); + + if (finalize) + InstrStackAdd(CurrentInstrStack, instr->stack); + + Assert(instr->owner != NULL); + ResourceOwnerForgetInstrStack(instr->owner, instr->stack); + instr->owner = NULL; + + if (finalize) + { + /* + * To avoid keeping memory allocated beyond when its needed, copy + * the result to the current memory context, and free it in the + * transaction context. 
+ */ + InstrStack *stack = palloc(sizeof(InstrStack)); + + memcpy(stack, instr->stack, sizeof(InstrStack)); + pfree(instr->stack); + instr->stack = stack; + } + } } /* Trigger instrumentation handling */ @@ -90,16 +247,16 @@ TriggerInstrumentation * InstrAllocTrigger(int n, int instrument_options) { TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; int i; for (i = 0; i < n; i++) { + tginstr[i].instr.need_timer = need_timer; tginstr[i].instr.need_bufusage = need_buffers; tginstr[i].instr.need_walusage = need_wal; - tginstr[i].instr.need_timer = need_timer; } return tginstr; @@ -114,7 +271,12 @@ InstrStartTrigger(TriggerInstrumentation *tginstr) void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) { - InstrStop(&tginstr->instr); + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ + InstrStop(&tginstr->instr, false); tginstr->firings += firings; } @@ -124,7 +286,13 @@ InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) NodeInstrumentation * InstrAllocNode(int instrument_options, bool async_mode) { - NodeInstrumentation *instr = palloc(sizeof(NodeInstrumentation)); + /* + * We can utilize TopTransactionContext instead of TopMemoryContext here + * (despite the inlined InstrStack in NodeInstrumentation) because nodes + * don't get used for utility commands that restart transactions, which + * would require a context that survives longer (EXPLAIN ANALYZE is fine). 
+ */ + NodeInstrumentation *instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation)); InstrInitNode(instr, instrument_options); instr->async_mode = async_mode; @@ -142,6 +310,30 @@ InstrInitNode(NodeInstrumentation *instr, int instrument_options) instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } +/* + * InstrRememberNodeStack - register a child node stack for abort processing. + * + * On abort, InstrFinalizeNodesOnAbort will walk the parent's list to recover + * buffer/WAL data from stacks that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + * + * The passed in node stack can either be the NodeInstrumentation stack or an + * additional stack that is associated with a node. This must not be called + * with other (non-node) instrumentation stacks as the child that perform their + * own cleanup. The parent must be a non-node stack that can handle aborts. + */ +void +InstrRememberNodeStack(InstrStack *parent, InstrStack *node_stack) +{ + /* + * We do not support nesting, to avoid recursion in + * InstrFinalizeNodesOnAbort + */ + Assert(parent->unfinalized_node.next == NULL); + + slist_push_head(&parent->unfinalized_children, &node_stack->unfinalized_node); +} + /* Entry to a plan node */ void InstrStartNode(NodeInstrumentation *instr) @@ -150,12 +342,13 @@ InstrStartNode(NodeInstrumentation *instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(CurrentInstrStack != &TopInstrStack); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + InstrPushStack(&instr->stack); + } } /* Exit from a plan node */ @@ -180,14 +373,14 @@ 
InstrStopNode(NodeInstrumentation *instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the topmost node */ + Assert(instr->stack.previous != NULL); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* Adding to parent is handled by ExecFinalizeNodeInstrumentation */ + InstrPopStack(&instr->stack); + } /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -206,6 +399,50 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples) } } +/* Add per-node instrumentation to the parent and move into per-query memory context */ +NodeInstrumentation * +InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent) +{ + NodeInstrumentation *dst = palloc(sizeof(NodeInstrumentation)); + + memcpy(dst, instr, sizeof(NodeInstrumentation)); + pfree(instr); + + /* Avoid stale pointer references */ + dst->stack.previous = NULL; + + InstrStackAdd(parent, &dst->stack); + + return dst; +} + +/* + * InstrFinalizeNodesOnAbort + * + * Accumulates unfinalized child per-node stacks into the resource owner stack, + * and resets the list so a theoretical second call is a safe no-op. + */ +static void +InstrFinalizeNodesOnAbort(InstrStack *stack) +{ + slist_iter iter; + + slist_foreach(iter, &stack->unfinalized_children) + { + InstrStack *child = slist_container(InstrStack, unfinalized_node, iter.cur); + + InstrStackAdd(stack, child); + + /* + * Note we don't free the child here since it's usually contained + * within NodeInstrumentation and we don't have an easy way to access + * that, it will instead be cleaned up by the transaction ending.
+ */ + } + + slist_init(&stack->unfinalized_children); +} + /* Update tuple count */ void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) @@ -265,38 +502,65 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) /* Add delta of buffer usage since entry to node's totals */ if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage); if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } -/* note current values during parallel executor startup */ -void +/* start instrumentation during parallel executor startup */ +Instrumentation * InstrStartParallelQuery(void) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; + Instrumentation *instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrStart(instr); + return instr; } /* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage) { + InstrStop(instr, true); memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); + memcpy(bufusage, &instr->stack->bufusage, sizeof(BufferUsage)); memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); + memcpy(walusage, &instr->stack->walusage, sizeof(WalUsage)); } -/* accumulate work done by workers in leader's stats */ +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If it's active, the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes.
+ * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. + */ void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) { + BufferUsageAdd(&CurrentInstrStack->bufusage, bufusage); + WalUsageAdd(&CurrentInstrStack->walusage, walusage); + BufferUsageAdd(&pgBufferUsage, bufusage); WalUsageAdd(&pgWalUsage, walusage); } +void +InstrStackAdd(InstrStack *dst, InstrStack *add) +{ + Assert(dst != NULL); + Assert(add != NULL); + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); +} + /* dst += add */ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index d46ba59895d..4406be9ed7b 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -300,6 +300,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent); +extern void ExecFinalizeNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 16688b5fdb4..7b3e9a21733 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -67,12 +68,33 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* Stack entry for current WAL/buffer usage */ +typedef struct InstrStack +{ + struct InstrStack *previous; + BufferUsage bufusage; + WalUsage walusage; + + /* + * Tracking of instr stacks that need to be cleaned up on abort that are + * not registered 
as a resource owner themselves + */ + slist_head unfinalized_children; /* head of unfinalized children list */ + slist_node unfinalized_node; /* node in parent's unfinalized list */ +} InstrStack; + /* * General purpose instrumentation that can capture time and WAL/buffer usage * * Initialized through InstrAlloc, followed by one or more calls to a pair of * InstrStart/InstrStop (activity is measured inbetween). + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of top level transaction between InstrStart/InstrStop + * calls in regular execution. If this is needed, directly use InstrPushStack / + * InstrPopStack in a PG_TRY/PG_FINALLY block instead. */ +struct ResourceOwnerData; typedef struct Instrumentation { /* Parameters set at creation: */ @@ -81,12 +103,10 @@ typedef struct Instrumentation bool need_walusage; /* true if we need WAL usage data */ /* Internal state keeping: */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack *stack; /* stack tracking buffer/WAL usage */ + struct ResourceOwnerData *owner; } Instrumentation; /* Trigger instrumentation */ @@ -99,6 +119,11 @@ typedef struct TriggerInstrumentation /* * Specialized instrumentation for per-node execution statistics + * + * Requires use of an outer InstrStart/InstrStop to handle the stack used for + * WAL/buffer usage statistics, and relies on it for managing aborts. Solely + * intended for the executor and anyone reporting about its activities (e.g. + * EXPLAIN ANALYZE). 
*/ typedef struct NodeInstrumentation { @@ -113,8 +138,6 @@ typedef struct NodeInstrumentation instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ instr_time total; /* total time */ @@ -123,10 +146,13 @@ typedef struct NodeInstrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack stack; /* stack tracking buffer/WAL usage */ } NodeInstrumentation; +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ @@ -136,9 +162,68 @@ typedef struct WorkerNodeInstrumentation extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; +/* + * The top instrumentation stack represents a running total of the current + * backend WAL/buffer usage information. This will not be updated immediately, + * but rather when the current stack entry gets accumulated which typically + * happens at query end (see CurrentInstrStack below). + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. 
+ */ +extern PGDLLIMPORT InstrStack TopInstrStack; + +/* + * The currently active stack entry that is getting updated as activity + * happens, and will be accumulated to parent stacks when it gets finalized + * by InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStack *CurrentInstrStack; + +extern void InstrStackAdd(InstrStack *dst, InstrStack *add); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * stack entry. + * + * Any caller using this directly must manage the passed in stack and call + * InstrPopStack on its own again, typically by using a PG_FINALLY block to + * ensure the stack gets reset via InstrPopStack on abort. Use InstrStart + * instead when you want automatic handling of abort cases using the resource + * owner infrastructure. + */ +static inline void +InstrPushStack(InstrStack *stack) +{ + stack->previous = CurrentInstrStack; + CurrentInstrStack = stack; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, instead call InstrPopAndFinalizeStack which can skip intermediate + * stack entries, or instead use InstrStart/InstrStop. 
+ */ +static inline void +InstrPopStack(InstrStack *stack) +{ + Assert(stack != NULL); + CurrentInstrStack = stack->previous; +} + +extern void InstrPopAndFinalizeStack(InstrStack *stack); + extern Instrumentation *InstrAlloc(int instrument_options); extern void InstrStart(Instrumentation *instr); -extern void InstrStop(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr, bool finalize); extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); extern void InstrStartTrigger(TriggerInstrumentation *tginstr); @@ -147,14 +232,16 @@ extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); extern NodeInstrumentation *InstrAllocNode(int instrument_options, bool async_mode); extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); +extern void InstrRememberNodeStack(InstrStack *parent, InstrStack *instr); extern void InstrStartNode(NodeInstrumentation *instr); extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern NodeInstrumentation *InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent); extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); +pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void BufferUsageAccumDiff(BufferUsage *dst, const BufferUsage *add, const BufferUsage *sub); @@ -163,21 +250,28 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, #define INSTR_BUFUSAGE_INCR(fld) do { \ pgBufferUsage.fld++; \ + CurrentInstrStack->bufusage.fld++; \ } 
while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ pgBufferUsage.fld += val; \ + CurrentInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + INSTR_TIME_ADD(CurrentInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + INSTR_TIME_ACCUM_DIFF(CurrentInstrStack->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + CurrentInstrStack->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += val; \ + CurrentInstrStack->walusage.fld += val; \ } while(0) #endif /* INSTRUMENT_H */ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb..5463bc921f0 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index e1075c709f5..b3146b0a165 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1319,6 +1319,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStack Instrumentation Int128AggState Int8TransTypeData -- 2.47.1