From 3f2cd3628db514c59912eadf51082f86538bb6e9 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot Date: Tue, 26 Aug 2025 13:08:13 +0000 Subject: [PATCH v6] Add mem_exceeded_count column to pg_stat_replication_slots. This commit introduces a new column mem_exceeded_count to the pg_stat_replication_slots view. This counter tracks how often the memory used by logical decoding exceeds the logical_decoding_work_mem limit. The new statistic helps users determine whether exceeding the logical_decoding_work_mem limit is a rare occurrences or a frequent issue, information that wasn't available through existing statistics. Bumps catversion. Author: Bertrand Drouvot Reviewed-by: Masahiko Sawada Reviewed-by: Amit Kapila Reviewed-by: shveta malik Reviewed-by: Chao Li Discussion: https://postgr.es/m/978D21E8-9D3B-40EA-A4B1-F87BABE7868C@yesql.se --- contrib/test_decoding/expected/stats.out | 68 +++++++++---------- contrib/test_decoding/sql/stats.sql | 10 +-- doc/src/sgml/monitoring.sgml | 11 +++ src/backend/catalog/system_views.sql | 1 + src/backend/replication/logical/logical.c | 8 ++- .../replication/logical/reorderbuffer.c | 7 +- src/backend/utils/activity/pgstat_replslot.c | 1 + src/backend/utils/adt/pgstatfuncs.c | 19 +++--- src/include/catalog/pg_proc.dat | 6 +- src/include/pgstat.h | 1 + src/include/replication/reorderbuffer.h | 3 + src/test/regress/expected/rules.out | 3 +- 12 files changed, 84 insertions(+), 54 deletions(-) diff --git a/contrib/test_decoding/expected/stats.out b/contrib/test_decoding/expected/stats.out index de6dc416130..72fbb270334 100644 --- a/contrib/test_decoding/expected/stats.out +++ b/contrib/test_decoding/expected/stats.out @@ -37,12 +37,12 @@ SELECT pg_stat_force_next_flush(); (1 row) -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; - slot_name | spill_txns | spill_count | total_txns | total_bytes -------------------------+------------+-------------+------------+------------- - regression_slot_stats1 | t | t | t | t - regression_slot_stats2 | t | t | t | t - regression_slot_stats3 | t | t | t | t +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; + slot_name | spill_txns | spill_count | total_txns | total_bytes | mem_exceeded_count +------------------------+------------+-------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | t | t | t + regression_slot_stats2 | t | t | t | t | t + regression_slot_stats3 | t | t | t | t | t (3 rows) RESET logical_decoding_work_mem; @@ -53,12 +53,12 @@ SELECT pg_stat_reset_replication_slot('regression_slot_stats1'); (1 row) -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; - slot_name | spill_txns | spill_count | total_txns | total_bytes -------------------------+------------+-------------+------------+------------- - regression_slot_stats1 | t | t | f | f - regression_slot_stats2 | t | t | t | t - regression_slot_stats3 | t | t | t | t +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; + slot_name | spill_txns | spill_count | total_txns | total_bytes | mem_exceeded_count +------------------------+------------+-------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | f | f | t + regression_slot_stats2 | t | t | t | t | t + regression_slot_stats3 | t | t | t | t | t (3 rows) -- reset stats for all slots @@ -68,27 +68,27 @@ SELECT pg_stat_reset_replication_slot(NULL); (1 row) -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; - slot_name | spill_txns | spill_count | total_txns | total_bytes -------------------------+------------+-------------+------------+------------- - regression_slot_stats1 | t | t | f | f - regression_slot_stats2 | t | t | f | f - regression_slot_stats3 | t | t | f | f +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; + slot_name | spill_txns | spill_count | total_txns | total_bytes | mem_exceeded_count +------------------------+------------+-------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | f | f | t + regression_slot_stats2 | t | t | f | f | t + regression_slot_stats3 | t | t | f | f | t (3 rows) -- verify accessing/resetting stats for non-existent slot does something reasonable SELECT * FROM pg_stat_get_replication_slot('do-not-exist'); - slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | total_txns | total_bytes | stats_reset ---------------+------------+-------------+-------------+-------------+--------------+--------------+------------+-------------+------------- - do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | mem_exceeded_count | total_txns | total_bytes | stats_reset +--------------+------------+-------------+-------------+-------------+--------------+--------------+--------------------+------------+-------------+------------- + do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (1 row) SELECT pg_stat_reset_replication_slot('do-not-exist'); ERROR: replication slot "do-not-exist" does not exist SELECT * FROM pg_stat_get_replication_slot('do-not-exist'); - slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | total_txns | total_bytes | stats_reset ---------------+------------+-------------+-------------+-------------+--------------+--------------+------------+-------------+------------- - do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + slot_name | spill_txns | spill_count | spill_bytes | stream_txns | stream_count | stream_bytes | mem_exceeded_count | total_txns | total_bytes | stats_reset +--------------+------------+-------------+-------------+-------------+--------------+--------------+--------------------+------------+-------------+------------- + do-not-exist | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (1 row) -- spilling the xact @@ -110,12 +110,12 @@ SELECT pg_stat_force_next_flush(); (1 row) -SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count FROM pg_stat_replication_slots; - slot_name | spill_txns | spill_count -------------------------+------------+------------- - regression_slot_stats1 | t | t - regression_slot_stats2 | f | f - regression_slot_stats3 | f | f +SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count, mem_exceeded_count > 0 AS mem_exceeded_count FROM pg_stat_replication_slots; + slot_name | spill_txns | spill_count | mem_exceeded_count +------------------------+------------+-------------+-------------------- + regression_slot_stats1 | t | t | t + regression_slot_stats2 | f | f | f + regression_slot_stats3 | f | f | f (3 rows) -- Ensure stats can be repeatedly accessed using the same stats snapshot. See @@ -165,10 +165,10 @@ SELECT pg_stat_force_next_flush(); (1 row) -SELECT slot_name, spill_txns, spill_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; - slot_name | spill_txns | spill_count ----------------------------------+------------+------------- - regression_slot_stats4_twophase | 0 | 0 +SELECT slot_name, spill_txns, spill_count, mem_exceeded_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; + slot_name | spill_txns | spill_count | mem_exceeded_count +---------------------------------+------------+-------------+-------------------- + regression_slot_stats4_twophase | 0 | 0 | 1 (1 row) DROP TABLE stats_test; diff --git a/contrib/test_decoding/sql/stats.sql b/contrib/test_decoding/sql/stats.sql index a022fe1bf07..9964a8efb87 100644 --- a/contrib/test_decoding/sql/stats.sql +++ b/contrib/test_decoding/sql/stats.sql @@ -15,16 +15,16 @@ SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats1', NULL, SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats2', NULL, NULL, 'skip-empty-xacts', '1'); SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats3', NULL, NULL, 'skip-empty-xacts', '1'); SELECT pg_stat_force_next_flush(); -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; RESET logical_decoding_work_mem; -- reset stats for one slot, others should be unaffected SELECT pg_stat_reset_replication_slot('regression_slot_stats1'); -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; -- reset stats for all slots SELECT pg_stat_reset_replication_slot(NULL); -SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes FROM pg_stat_replication_slots ORDER BY slot_name; +SELECT slot_name, spill_txns = 0 AS spill_txns, spill_count = 0 AS spill_count, total_txns > 0 AS total_txns, total_bytes > 0 AS total_bytes, mem_exceeded_count = 0 AS mem_exceeded_count FROM pg_stat_replication_slots ORDER BY slot_name; -- verify accessing/resetting stats for non-existent slot does something reasonable SELECT * FROM pg_stat_get_replication_slot('do-not-exist'); @@ -41,7 +41,7 @@ SELECT count(*) FROM pg_logical_slot_peek_changes('regression_slot_stats1', NULL -- background transaction (say by autovacuum) happens in parallel to the main -- transaction. SELECT pg_stat_force_next_flush(); -SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count FROM pg_stat_replication_slots; +SELECT slot_name, spill_txns > 0 AS spill_txns, spill_count > 0 AS spill_count, mem_exceeded_count > 0 AS mem_exceeded_count FROM pg_stat_replication_slots; -- Ensure stats can be repeatedly accessed using the same stats snapshot. See -- https://postgr.es/m/20210317230447.c7uc4g3vbs4wi32i%40alap3.anarazel.de @@ -65,7 +65,7 @@ SELECT count(*) FROM pg_logical_slot_get_changes('regression_slot_stats4_twophas -- Verify that the decoding doesn't spill already-aborted transaction's changes. SELECT pg_stat_force_next_flush(); -SELECT slot_name, spill_txns, spill_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; +SELECT slot_name, spill_txns, spill_count, mem_exceeded_count FROM pg_stat_replication_slots WHERE slot_name = 'regression_slot_stats4_twophase'; DROP TABLE stats_test; SELECT pg_drop_replication_slot('regression_slot_stats1'), diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3f4a27a736e..0141c00e666 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1620,6 +1620,17 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + mem_exceeded_countbigint + + + Number of times the memory used by logical decoding has exceeded + logical_decoding_work_mem. + + + + total_txns bigint diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index c77fa0234bb..b18e7c42d17 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1059,6 +1059,7 @@ CREATE VIEW pg_stat_replication_slots AS s.stream_txns, s.stream_count, s.stream_bytes, + s.mem_exceeded_count, s.total_txns, s.total_bytes, s.stats_reset diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index c68c0481f42..93ed2eb368e 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1955,10 +1955,11 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) PgStat_StatReplSlotEntry repSlotStat; /* Nothing to do if we don't have any replication stats to be sent. */ - if (rb->spillBytes <= 0 && rb->streamBytes <= 0 && rb->totalBytes <= 0) + if (rb->spillBytes <= 0 && rb->streamBytes <= 0 && rb->totalBytes <= 0 && + rb->memExceededCount <= 0) return; - elog(DEBUG2, "UpdateDecodingStats: updating stats %p %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64, + elog(DEBUG2, "UpdateDecodingStats: updating stats %p %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64, rb, rb->spillTxns, rb->spillCount, @@ -1966,6 +1967,7 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) rb->streamTxns, rb->streamCount, rb->streamBytes, + rb->memExceededCount, rb->totalTxns, rb->totalBytes); @@ -1975,6 +1977,7 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) repSlotStat.stream_txns = rb->streamTxns; repSlotStat.stream_count = rb->streamCount; repSlotStat.stream_bytes = rb->streamBytes; + repSlotStat.mem_exceeded_count = rb->memExceededCount; repSlotStat.total_txns = rb->totalTxns; repSlotStat.total_bytes = rb->totalBytes; @@ -1986,6 +1989,7 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) rb->streamTxns = 0; rb->streamCount = 0; rb->streamBytes = 0; + rb->memExceededCount = 0; rb->totalTxns = 0; rb->totalBytes = 0; } diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index a5e165fb123..6e72864804e 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -390,6 +390,7 @@ ReorderBufferAllocate(void) buffer->streamTxns = 0; buffer->streamCount = 0; buffer->streamBytes = 0; + buffer->memExceededCount = 0; buffer->totalTxns = 0; buffer->totalBytes = 0; @@ -3898,13 +3899,17 @@ static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) { ReorderBufferTXN *txn; + bool memory_limit_reached = (rb->size >= logical_decoding_work_mem * (Size) 1024); + + if (memory_limit_reached) + rb->memExceededCount += 1; /* * Bail out if debug_logical_replication_streaming is buffered and we * haven't exceeded the memory limit. */ if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED && - rb->size < logical_decoding_work_mem * (Size) 1024) + !memory_limit_reached) return; /* diff --git a/src/backend/utils/activity/pgstat_replslot.c b/src/backend/utils/activity/pgstat_replslot.c index ccfb11c49bf..d210c261ac6 100644 --- a/src/backend/utils/activity/pgstat_replslot.c +++ b/src/backend/utils/activity/pgstat_replslot.c @@ -94,6 +94,7 @@ pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *re REPLSLOT_ACC(stream_txns); REPLSLOT_ACC(stream_count); REPLSLOT_ACC(stream_bytes); + REPLSLOT_ACC(mem_exceeded_count); REPLSLOT_ACC(total_txns); REPLSLOT_ACC(total_bytes); #undef REPLSLOT_ACC diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index c756c2bebaa..e64dd5e043e 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -2100,7 +2100,7 @@ pg_stat_get_archiver(PG_FUNCTION_ARGS) Datum pg_stat_get_replication_slot(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_REPLICATION_SLOT_COLS 10 +#define PG_STAT_GET_REPLICATION_SLOT_COLS 11 text *slotname_text = PG_GETARG_TEXT_P(0); NameData slotname; TupleDesc tupdesc; @@ -2125,11 +2125,13 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS) INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 7, "stream_bytes", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 8, "total_txns", + TupleDescInitEntry(tupdesc, (AttrNumber) 8, "mem_exceeded_count", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 9, "total_bytes", + TupleDescInitEntry(tupdesc, (AttrNumber) 9, "total_txns", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 10, "stats_reset", + TupleDescInitEntry(tupdesc, (AttrNumber) 10, "total_bytes", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "stats_reset", TIMESTAMPTZOID, -1, 0); BlessTupleDesc(tupdesc); @@ -2152,13 +2154,14 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS) values[4] = Int64GetDatum(slotent->stream_txns); values[5] = Int64GetDatum(slotent->stream_count); values[6] = Int64GetDatum(slotent->stream_bytes); - values[7] = Int64GetDatum(slotent->total_txns); - values[8] = Int64GetDatum(slotent->total_bytes); + values[7] = Int64GetDatum(slotent->mem_exceeded_count); + values[8] = Int64GetDatum(slotent->total_txns); + values[9] = Int64GetDatum(slotent->total_bytes); if (slotent->stat_reset_timestamp == 0) - nulls[9] = true; + nulls[10] = true; else - values[9] = TimestampTzGetDatum(slotent->stat_reset_timestamp); + values[10] = TimestampTzGetDatum(slotent->stat_reset_timestamp); /* Returns the record as Datum */ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 01eba3b5a19..4ea72c85240 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5687,9 +5687,9 @@ { oid => '6169', descr => 'statistics: information about replication slot', proname => 'pg_stat_get_replication_slot', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => 'text', - proallargtypes => '{text,text,int8,int8,int8,int8,int8,int8,int8,int8,timestamptz}', - proargmodes => '{i,o,o,o,o,o,o,o,o,o,o}', - proargnames => '{slot_name,slot_name,spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,stats_reset}', + proallargtypes => '{text,text,int8,int8,int8,int8,int8,int8,int8,int8,int8,timestamptz}', + proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{slot_name,slot_name,spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,mem_exceeded_count,total_txns,total_bytes,stats_reset}', prosrc => 'pg_stat_get_replication_slot' }, { oid => '6230', descr => 'statistics: check if a stats object exists', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index e4a59a30b8c..737aa1cc551 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -394,6 +394,7 @@ typedef struct PgStat_StatReplSlotEntry PgStat_Counter stream_txns; PgStat_Counter stream_count; PgStat_Counter stream_bytes; + PgStat_Counter mem_exceeded_count; PgStat_Counter total_txns; PgStat_Counter total_bytes; TimestampTz stat_reset_timestamp; diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 91dc7e5e448..3cbe106a3c7 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -690,6 +690,9 @@ struct ReorderBuffer int64 streamCount; /* streaming invocation counter */ int64 streamBytes; /* amount of data decoded */ + /* Number of times the logical_decoding_work_mem limit has been reached */ + int64 memExceededCount; + /* * Statistics about all the transactions sent to the decoding output * plugin diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 35e8aad7701..f4f3a2a3018 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2138,11 +2138,12 @@ pg_stat_replication_slots| SELECT s.slot_name, s.stream_txns, s.stream_count, s.stream_bytes, + s.mem_exceeded_count, s.total_txns, s.total_bytes, s.stats_reset FROM pg_replication_slots r, - LATERAL pg_stat_get_replication_slot((r.slot_name)::text) s(slot_name, spill_txns, spill_count, spill_bytes, stream_txns, stream_count, stream_bytes, total_txns, total_bytes, stats_reset) + LATERAL pg_stat_get_replication_slot((r.slot_name)::text) s(slot_name, spill_txns, spill_count, spill_bytes, stream_txns, stream_count, stream_bytes, mem_exceeded_count, total_txns, total_bytes, stats_reset) WHERE (r.datoid IS NOT NULL); pg_stat_slru| SELECT name, blks_zeroed, -- 2.47.3