From 4375fcb4141f18d6cd927659970518553aa3fe94 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 31 Aug 2025 16:37:05 -0700 Subject: [PATCH v1 3/3] Introduce stack for tracking per-node WAL/buffer usage --- src/backend/commands/explain.c | 8 +- src/backend/executor/execMain.c | 7 ++ src/backend/executor/execProcnode.c | 9 +++ src/backend/executor/instrument.c | 111 ++++++++++++++++++++++++---- src/include/executor/instrument.h | 42 ++++++++++- 5 files changed, 155 insertions(+), 22 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 6a135e51996..584f0adbcc1 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2280,9 +2280,9 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->stack.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) @@ -2299,9 +2299,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->stack.bufusage); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); } } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index b83ced9a57a..1c2268bc608 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -312,6 +312,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, DestReceiver *dest; bool sendTuples; MemoryContext oldcontext; + InstrStackResource *res; /* sanity checks */ Assert(queryDesc != NULL); @@ -333,6 +334,9 @@ standard_ExecutorRun(QueryDesc *queryDesc, if (queryDesc->totaltime) InstrStart(queryDesc->totaltime); + /* Start up per-query node level instrumentation */ + res = InstrStartQuery(); + /* * extract information from the query descriptor and the query feature. */ @@ -382,6 +386,9 @@ standard_ExecutorRun(QueryDesc *queryDesc, if (sendTuples) dest->rShutdown(dest); + /* Shut down per-query node level instrumentation */ + InstrShutdownQuery(res); + if (queryDesc->totaltime) InstrStop(queryDesc->totaltime, estate->es_processed); diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d286471254b..7436f307994 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -823,8 +823,17 @@ ExecShutdownNode_walker(PlanState *node, void *context) /* Stop the node if we started it above, reporting 0 tuples. */ if (node->instrument && node->instrument->running) + { InstrStopNode(node->instrument, 0); + /* + * Propagate WAL/buffer stats to the parent node on the + * instrumentation stack (which is where InstrStopNode returned us + * to). + */ + InstrNodeAddToCurrent(&node->instrument->stack); + } + return false; } diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index c53480d8030..040d1fdecbd 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,15 +16,40 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" BufferUsage pgBufferUsage; static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; static WalUsage save_pgWalUsage; +InstrStack *pgInstrStack = NULL; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +/* + * Node-specific instrumentation handling uses ResourceOwner mechanism to + * reset pgInstrStack on abort. + */ +static void ResOwnerReleaseInstrStack(Datum res); +static const ResourceOwnerDesc instr_stack_resowner_desc = +{ + .name = "instrumentation stack scope", + .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS, + .release_priority = RELEASE_PRIO_FIRST, + .ReleaseResource = ResOwnerReleaseInstrStack, + .DebugPrint = NULL, /* default message is fine */ +}; +static inline void +ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStackResource * scope) +{ + ResourceOwnerRemember(owner, PointerGetDatum(scope), &instr_stack_resowner_desc); +} +static inline void +ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStackResource * scope) +{ + ResourceOwnerForget(owner, PointerGetDatum(scope), &instr_stack_resowner_desc); +} /* General purpose instrumentation handling */ Instrumentation * @@ -139,12 +164,17 @@ InstrStartNode(NodeInstrumentation * instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + /* + * Ensure that we have an active pgInstrStack (InstrStartQuery must + * have been called) + */ + Assert(pgInstrStack != NULL); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + instr->stack.previous = pgInstrStack; + pgInstrStack = &instr->stack; + } } /* Exit from a plan node */ @@ -169,14 +199,12 @@ InstrStopNode(NodeInstrumentation * instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); - - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that there is a stack entry above the top-most node */ + Assert(instr->stack.previous != NULL); + pgInstrStack = instr->stack.previous; + } /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -257,10 +285,65 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add) /* Add delta of buffer usage since entry to node's totals */ if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage); if (dst->need_walusage) + WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); +} + +InstrStackResource * +InstrStartQuery() +{ + InstrStack *usage = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStack)); + InstrStackResource *usageRes = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStackResource)); + ResourceOwner owner = CurrentResourceOwner; + + Assert(owner != NULL); + + usageRes->owner = owner; + + ResourceOwnerEnlarge(owner); + ResourceOwnerRememberInstrStack(owner, usageRes); + + usage->previous = pgInstrStack; + pgInstrStack = usage; + + return usageRes; +} + +void +InstrShutdownQuery(InstrStackResource * res) +{ + Assert(res != NULL); + Assert(res->owner != NULL); + + pgInstrStack = res->previous; + + ResourceOwnerForgetInstrStack(res->owner, res); +} + +static void +ResOwnerReleaseInstrStack(Datum res) +{ + /* + * XXX: Registered resources are *not* called in reverse order, i.e. we'll + * get what was first registered first at shutdown. To avoid handling + * that, we are resetting the stack here on abort (instead of recovering + * to previous). + */ + pgInstrStack = NULL; +} + +void +InstrNodeAddToCurrent(InstrStack * add) +{ + if (pgInstrStack != NULL) + { + InstrStack *dst = pgInstrStack; + + BufferUsageAdd(&dst->bufusage, &add->bufusage); WalUsageAdd(&dst->walusage, &add->walusage); + } } /* note current values during parallel executor startup */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 3a280f4caae..a98efab5f93 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -14,6 +14,7 @@ #define INSTRUMENT_H #include "portability/instr_time.h" +#include "utils/resowner.h" /* @@ -66,6 +67,21 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* Stack of WAL/buffer usage used for per-node instrumentation */ +typedef struct InstrStack +{ + struct InstrStack *previous; + BufferUsage bufusage; + WalUsage walusage; +} InstrStack; + +/* Used to manage resetting of instrumentation stack on abort. */ +typedef struct InstrStackResource +{ + InstrStack *previous; + ResourceOwner owner; +} InstrStackResource; + /* * General purpose instrumentation that can capture time, WAL/buffer usage and tuples * @@ -91,6 +107,10 @@ typedef struct Instrumentation /* * Specialized instrumentation for per-node execution statistics + * + * Requires use of InstrStartQuery to initialize the stack used for WAL/buffer + * usage statistics, and cleanup through InstrShutdownQuery. Solely intended for + * the executor and anyone reporting about its activities (e.g. EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -105,8 +125,6 @@ typedef struct NodeInstrumentation instr_time counter; /* accumulated runtime for this node */ double firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ double startup; /* total startup time (in seconds) */ double total; /* total time (in seconds) */ @@ -115,8 +133,7 @@ typedef struct NodeInstrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack stack; /* stack tracking buffer/WAL usage */ } NodeInstrumentation; typedef struct WorkerInstrumentation @@ -127,6 +144,7 @@ typedef struct WorkerInstrumentation extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; +extern PGDLLIMPORT InstrStack * pgInstrStack; extern Instrumentation *InstrAlloc(int n, int instrument_options); extern void InstrStart(Instrumentation *instr); @@ -135,11 +153,14 @@ extern void InstrStop(Instrumentation *instr, double nTuples); extern NodeInstrumentation * InstrAllocNode(int n, int instrument_options, bool async_mode); extern void InstrInitNode(NodeInstrumentation * instr, int instrument_options); +extern InstrStackResource * InstrStartQuery(void); +extern void InstrShutdownQuery(InstrStackResource * res); extern void InstrStartNode(NodeInstrumentation * instr); extern void InstrStopNode(NodeInstrumentation * instr, double nTuples); extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation * instr); extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add); +extern void InstrNodeAddToCurrent(InstrStack * stack); extern void InstrStartParallelQuery(void); extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); @@ -151,21 +172,34 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, #define INSTR_BUFUSAGE_INCR(fld) do { \ pgBufferUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ pgBufferUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + if (pgInstrStack) \ + INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + if (pgInstrStack) \ + INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += val; \ + if (pgInstrStack) \ + pgInstrStack->walusage.fld += val; \ } while(0) #endif /* INSTRUMENT_H */ -- 2.47.1