From f47ac425817f6b59f3b68384e7c8640070241bab Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Mon, 12 Jan 2026 18:23:03 -0800 Subject: [PATCH v6 7/8] Index scans: Show table buffer accesses separately in EXPLAIN ANALYZE This sets up a separate instrumentation stack that is used whilst an Index Scan does scanning on the table, for example due to additional data being needed. EXPLAIN ANALYZE will now show "Table Buffers" that represent such activity. The activity is also included in regular "Buffers" together with index activity and that of any child nodes. Author: Lukas Fittl Suggested-by: Andres Freund Reviewed-by: Discussion: --- doc/src/sgml/perform.sgml | 13 ++++-- doc/src/sgml/ref/explain.sgml | 1 + src/backend/commands/explain.c | 55 +++++++++++++++----------- src/backend/executor/execProcnode.c | 28 +++++++++++++ src/backend/executor/instrument.c | 53 +++++++++++++++++++++++++ src/backend/executor/nodeIndexscan.c | 23 ++++++++++- src/include/executor/instrument.h | 5 +++ src/include/executor/instrument_node.h | 5 +++ src/include/nodes/execnodes.h | 7 ++++ 9 files changed, 162 insertions(+), 28 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5f6f1db0467..9219625faf6 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 7dee77fd366..912c96f2ff5 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index b35aea246cf..054c5bce837 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -603,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1020,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1034,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1962,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTableStack->bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -2280,7 +2283,7 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->stack.bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage, NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->stack.walusage); @@ -2299,7 +2302,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->stack.bufusage); + show_buffer_usage(es, &instrument->stack.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); @@ -4099,7 +4102,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4124,6 +4127,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4179,6 +4184,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4220,44 +4227,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { - ExplainPropertyInteger("Shared Hit Blocks", NULL, + char *prefix = title ? psprintf("%s ", title) : pstrdup(""); + + ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL, usage->shared_blks_hit, es); - ExplainPropertyInteger("Shared Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL, usage->shared_blks_read, es); - ExplainPropertyInteger("Shared Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL, usage->shared_blks_dirtied, es); - ExplainPropertyInteger("Shared Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL, usage->shared_blks_written, es); - ExplainPropertyInteger("Local Hit Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL, usage->local_blks_hit, es); - ExplainPropertyInteger("Local Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL, usage->local_blks_read, es); - ExplainPropertyInteger("Local Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL, usage->local_blks_dirtied, es); - ExplainPropertyInteger("Local Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL, usage->local_blks_written, es); - ExplainPropertyInteger("Temp Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL, usage->temp_blks_read, es); - ExplainPropertyInteger("Temp Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL, usage->temp_blks_written, es); if (track_io_timing) { - ExplainPropertyFloat("Shared I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); - ExplainPropertyFloat("Shared I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time), 3, es); - ExplainPropertyFloat("Local I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time), 3, es); - ExplainPropertyFloat("Local I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time), 3, es); - ExplainPropertyFloat("Temp I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time), 3, es); - ExplainPropertyFloat("Temp I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index e2e21c66dc9..c3ff03f805b 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -415,9 +415,19 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) + { result->instrument = InstrAllocNode(estate->es_instrument, result->async_capable); + /* IndexScan tracks table access separately from index access. */ + if (IsA(result, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, result); + + iss->iss_InstrumentTableStack = InstrAllocAdditionalNodeStack(result->instrument); + } + } + return result; } @@ -859,6 +869,15 @@ ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) node->instrument->need_walusage)) { InstrRememberNodeStack(parent, &node->instrument->stack); + + /* IndexScan has a separate stack to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTableStack) + InstrRememberNodeStack(parent, iss->iss_InstrumentTableStack); + } } return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); @@ -902,6 +921,15 @@ ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) if (!node->instrument) return false; + /* IndexScan has a separate stack to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTableStack) + iss->iss_InstrumentTableStack = InstrFinalizeAdditionalNodeStack(iss->iss_InstrumentTableStack, node->instrument); + } + node->instrument = InstrFinalizeNode(node->instrument, parent); return false; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index fc7ac85434b..0c2d769416e 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -506,6 +506,59 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } +/* + * Allocate an additional InstrStack for a node, e.g. for tracking table + * buffer usage separately from index buffer usage. Allocated in + * TopTransactionContext so it survives long enough for abort recovery. + */ +InstrStack * +InstrAllocAdditionalNodeStack(NodeInstrumentation *instr) +{ + if (instr->need_bufusage || instr->need_walusage) + return MemoryContextAllocZero(TopTransactionContext, sizeof(InstrStack)); + + return NULL; +} + +void +InstrStartNodeStack(NodeInstrumentation *instr, InstrStack *stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure the executor set up a parent node below the top level stack */ + Assert(CurrentInstrStack != &TopInstrStack); + + InstrPushStack(stack); + } +} + +void +InstrStopNodeStack(NodeInstrumentation *instr, InstrStack *stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Adding to parent is handled by InstrFinalizeAdditionalNodeStack */ + InstrPopStack(stack); + } +} + +/* Add additional node stacks to the parent and move into per-query memory context */ +InstrStack * +InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr) +{ + InstrStack *dst = palloc(sizeof(InstrStack)); + + memcpy(dst, stack, sizeof(InstrStack)); + pfree(stack); + + /* Avoid stale pointer references */ + dst->previous = NULL; + + InstrStackAdd(&instr->stack, dst); + + return dst; +} + /* start instrumentation during parallel executor startup */ Instrumentation * InstrStartParallelQuery(void) diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index a616abff04c..fb62ff7a8be 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -83,6 +83,7 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; /* @@ -128,8 +129,20 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->ss.ps.instrument) + InstrStartNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack); + + if (unlikely(!index_fetch_heap(scandesc, slot))) + continue; + + if (node->ss.ps.instrument) + InstrStopNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack); + + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + CHECK_FOR_INTERRUPTS(); /* @@ -812,6 +825,8 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument.nsearches; + if (node->iss_InstrumentTableStack) + InstrStackAdd(&winstrument->worker_table_stack, node->iss_InstrumentTableStack); } /* @@ -1819,4 +1834,10 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's stack */ + if (node->iss_InstrumentTableStack) + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + InstrStackAdd(node->iss_InstrumentTableStack, + &node->iss_SharedInfo->winstrument[i].worker_table_stack); } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 7a5d0597cd5..8fb6c911fcb 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -239,6 +239,11 @@ extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); +extern InstrStack *InstrAllocAdditionalNodeStack(NodeInstrumentation *instr); +extern void InstrStartNodeStack(NodeInstrumentation *dst, InstrStack *stack); +extern void InstrStopNodeStack(NodeInstrumentation *dst, InstrStack *stack); +extern InstrStack *InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr); + pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void); extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa..dd45d72559d 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Used for passing iss_InstrumentTableStack data from parallel workers */ + InstrStack worker_table_stack; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 605c7a6cc39..6a6427afbd2 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1728,6 +1728,13 @@ typedef struct IndexScanState IndexScanInstrumentation iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + /* + * Instrumentation stack utilized for tracking table access. This is + * separate from iss_Instrument since it needs to be allocated in the + * right context and IndexScanInstrumentation shouldn't contain pointers. + */ + InstrStack *iss_InstrumentTableStack; + /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; bool iss_ReachedEnd; -- 2.47.1