From e5ac9b5b15157dc22fa0fe430c4a2426b6582d8a Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Mon, 12 Jan 2026 18:23:03 -0800 Subject: [PATCH v5 6/6] Index scans: Split heap and index buffer access reporting in EXPLAIN This makes it clear whether activity was on the index directly, or on the table based on heap fetches. --- src/backend/commands/explain.c | 56 +++++++++++++++----------- src/backend/executor/execProcnode.c | 11 +++++ src/backend/executor/instrument.c | 25 ++++++++++++ src/backend/executor/nodeIndexscan.c | 15 ++++++- src/include/executor/instrument.h | 3 ++ src/include/executor/instrument_node.h | 4 ++ 6 files changed, 90 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index dee5344e6d3..21568fbd534 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -603,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1020,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1034,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1962,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument.table_stack.bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -2280,7 +2283,8 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->stack.bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage, + IsA(plan, IndexScan) ? "Index" : NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->stack.walusage); @@ -2299,7 +2303,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->stack.bufusage); + show_buffer_usage(es, &instrument->stack.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); @@ -4099,7 +4103,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4124,6 +4128,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4179,6 +4185,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4220,44 +4228,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { - ExplainPropertyInteger("Shared Hit Blocks", NULL, + char *prefix = title ? psprintf("%s ", title) : pstrdup(""); + + ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL, usage->shared_blks_hit, es); - ExplainPropertyInteger("Shared Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL, usage->shared_blks_read, es); - ExplainPropertyInteger("Shared Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL, usage->shared_blks_dirtied, es); - ExplainPropertyInteger("Shared Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL, usage->shared_blks_written, es); - ExplainPropertyInteger("Local Hit Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL, usage->local_blks_hit, es); - ExplainPropertyInteger("Local Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL, usage->local_blks_read, es); - ExplainPropertyInteger("Local Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL, usage->local_blks_dirtied, es); - ExplainPropertyInteger("Local Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL, usage->local_blks_written, es); - ExplainPropertyInteger("Temp Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL, usage->temp_blks_read, es); - ExplainPropertyInteger("Temp Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL, usage->temp_blks_written, es); if (track_io_timing) { - ExplainPropertyFloat("Shared I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); - ExplainPropertyFloat("Shared I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time), 3, es); - ExplainPropertyFloat("Local I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time), 3, es); - ExplainPropertyFloat("Local I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time), 3, es); - ExplainPropertyFloat("Temp I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time), 3, es); - ExplainPropertyFloat("Temp I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index a5afe83237d..c727b7f2454 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -856,6 +856,17 @@ ExecAccumNodeInstrumentation_walker(PlanState *node, void *context) if (!node->instrument || !node->instrument->stack.previous) return false; + /* + * Index Scan nodes account for table buffer usage separately, so we need + * to explitly add it here. + */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + InstrStackAdd(node->instrument->stack.previous, &iss->iss_Instrument.table_stack); + } + InstrStackAdd(node->instrument->stack.previous, &node->instrument->stack); return false; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 26e0ae7726c..2f145d67dff 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -375,6 +375,31 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } +void +InstrStartNodeStack(NodeInstrumentation *instr, InstrStack * stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(pgInstrStack != NULL); + + InstrPushStack(stack); + } +} + +void +InstrStopNodeStack(NodeInstrumentation *instr, InstrStack * stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(stack->previous != NULL); + + /* Adding to parent is handled by ExecAccumNodeInstrumentation */ + InstrPopStack(stack, false); + } +} + /* start instrumentation during parallel executor startup */ Instrumentation * InstrStartParallelQuery(void) diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 84823f0b615..0fc197f87da 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -83,6 +83,7 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; /* @@ -128,8 +129,20 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->ss.ps.instrument) + InstrStartNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack); + + if (unlikely(!index_fetch_heap(scandesc, slot))) + continue; + + if (node->ss.ps.instrument) + InstrStopNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack); + + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + CHECK_FOR_INTERRUPTS(); /* diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 145217b0df8..d1be3db9749 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -185,6 +185,9 @@ extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); +extern void InstrStartNodeStack(NodeInstrumentation *dst, InstrStack * stack); +extern void InstrStopNodeStack(NodeInstrumentation *dst, InstrStack * stack); + pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void); extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa..25554c43942 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,8 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + /* Buffer usage of heap access during index scans */ + InstrStack table_stack; } IndexScanInstrumentation; /* -- 2.47.1