From 6d8b6e58cbbf50b85d5190fce2e40f0bc2f2a965 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabr=C3=ADzio=20de=20Royes=20Mello?= Date: Mon, 20 Apr 2026 17:01:04 -0300 Subject: [PATCH] pg_surgery: Fix WAL corruption from concurrent heap_force_kill heap_force_kill could produce WAL records with CRC mismatches when multiple sessions operated on heap pages sharing the same visibility map page. visibilitymap_clear() acquires an exclusive lock on the VM buffer, modifies it, and releases the lock before returning. The subsequent log_newpage_buffer(vmbuf) then writes a full-page image without holding any content lock. XLogInsert reads the page twice via the XLogRecData chain pointer to shared memory: once to compute the CRC in XLogRecordAssemble, and once to copy the data in CopyXLogRecordToWAL. A concurrent modification between those two reads produces a CRC that does not match the written bytes. Fix by re-acquiring the VM buffer exclusive lock immediately after visibilitymap_clear() returns, and releasing it after log_newpage_buffer() completes. The lock is only acquired when RelationNeedsWAL() is true, since unlogged relations skip the FPI write entirely. A TAP test using injection points and pg_walinspect verifies that no CRC corruption occurs under concurrent heap_force_kill. Without the fix, the test detects "incorrect resource manager data checksum" in the WAL. Made-with: Cursor --- contrib/pg_surgery/Makefile | 2 + contrib/pg_surgery/heap_surgery.c | 28 +++ contrib/pg_surgery/meson.build | 8 + contrib/pg_surgery/t/001_wal_corruption.pl | 216 +++++++++++++++++++++ src/backend/access/transam/xloginsert.c | 13 ++ 5 files changed, 267 insertions(+) create mode 100644 contrib/pg_surgery/t/001_wal_corruption.pl diff --git a/contrib/pg_surgery/Makefile b/contrib/pg_surgery/Makefile index a66776c4c41..13c8433e00a 100644 --- a/contrib/pg_surgery/Makefile +++ b/contrib/pg_surgery/Makefile @@ -11,6 +11,8 @@ PGFILEDESC = "pg_surgery - perform surgery on a damaged relation" REGRESS = heap_surgery +TAP_TESTS = 1 + ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) diff --git a/contrib/pg_surgery/heap_surgery.c b/contrib/pg_surgery/heap_surgery.c index ae4e7c0136c..fb1758c6dab 100644 --- a/contrib/pg_surgery/heap_surgery.c +++ b/contrib/pg_surgery/heap_surgery.c @@ -21,6 +21,7 @@ #include "storage/bufmgr.h" #include "utils/acl.h" #include "utils/array.h" +#include "utils/injection_point.h" #include "utils/rel.h" PG_MODULE_MAGIC_EXT( @@ -237,8 +238,20 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) * if it appears to be necessary. */ if (heap_force_opt == HEAP_FORCE_KILL && PageIsAllVisible(page)) + { visibilitymap_pin(rel, blkno, &vmbuf); + /* + * Run an injection point outside the critical section to force + * shared memory initialization (DSM registry) for the wait + * machinery, then load the callback for the cached point that + * runs inside the critical section. + */ + INJECTION_POINT("heap-force-kill-vm-pin", NULL); + INJECTION_POINT_LOAD("heap-force-kill-before-vm-wal"); + INJECTION_POINT_LOAD("wal-insert-after-crc"); + } + /* No ereport(ERROR) from here until all the changes are logged. */ START_CRIT_SECTION(); @@ -269,6 +282,15 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) PageClearAllVisible(page); visibilitymap_clear(rel, blkno, vmbuf, VISIBILITYMAP_VALID_BITS); + + /* + * Re-acquire the VM buffer lock so the page content + * stays stable through log_newpage_buffer's XLogInsert + * (which reads the page twice: once for CRC, once for + * the data copy). Only needed when WAL-logging. + */ + if (RelationNeedsWAL(rel)) + LockBuffer(vmbuf, BUFFER_LOCK_EXCLUSIVE); did_modify_vm = true; } } @@ -324,8 +346,14 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) } /* WAL log the VM page if it was modified. */ + if (did_modify_vm) + INJECTION_POINT_CACHED("heap-force-kill-before-vm-wal", NULL); + if (did_modify_vm && RelationNeedsWAL(rel)) + { log_newpage_buffer(vmbuf, false); + LockBuffer(vmbuf, BUFFER_LOCK_UNLOCK); + } END_CRIT_SECTION(); diff --git a/contrib/pg_surgery/meson.build b/contrib/pg_surgery/meson.build index 88e16dcc1b2..d383ffbebd1 100644 --- a/contrib/pg_surgery/meson.build +++ b/contrib/pg_surgery/meson.build @@ -32,4 +32,12 @@ tests += { 'heap_surgery', ], }, + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_wal_corruption.pl', + ], + }, } diff --git a/contrib/pg_surgery/t/001_wal_corruption.pl b/contrib/pg_surgery/t/001_wal_corruption.pl new file mode 100644 index 00000000000..e1ba11eee33 --- /dev/null +++ b/contrib/pg_surgery/t/001_wal_corruption.pl @@ -0,0 +1,216 @@ +# Copyright (c) 2024-2026, PostgreSQL Global Development Group +# +# Test for the VM buffer locking fix in heap_force_kill. +# +# heap_force_kill writes two FPIs per page when the page is all-visible: +# one for the heap page and one for the visibility map page. The VM +# buffer lock is re-acquired after visibilitymap_clear() (which releases +# it) and held through the log_newpage_buffer(vmbuf) call. This +# prevents concurrent modification during XLogInsert, which reads the +# page twice: once for CRC in XLogRecordAssemble and once for the data +# copy in CopyXLogRecordToWAL. +# +# Without the fix, concurrent modification of the VM page between those +# two reads produces a CRC mismatch -- the exact WAL corruption observed +# in production under 50 concurrent heap_force_kill sessions. +# +# This test uses three injection points: +# +# 1. "heap-force-kill-vm-pin" -- outside critical section, forces DSM +# shared memory initialization for the wait machinery. +# +# 2. "heap-force-kill-before-vm-wal" -- inside critical section, between +# the heap page FPI and VM page FPI writes. Used as a synchronization +# barrier so the test knows the heap FPI is complete. +# +# 3. "wal-insert-after-crc" -- inside XLogInsert, between +# XLogRecordAssemble (CRC done) and XLogInsertRecord (data copy). +# This fires twice: once for the heap FPI, once for the VM FPI. +# With the fix, the VM buffer lock is held during the VM FPI's +# XLogInsert, so concurrent modification cannot corrupt the CRC. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +if (!defined($ENV{enable_injection_points}) + || $ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node = PostgreSQL::Test::Cluster->new('node'); +$node->init; +$node->start; + +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} +if (!$node->check_extension('pg_surgery')) +{ + plan skip_all => 'Extension pg_surgery not installed'; +} +if (!$node->check_extension('pg_walinspect')) +{ + plan skip_all => 'Extension pg_walinspect not installed'; +} + +$node->safe_psql('postgres', q{CREATE EXTENSION injection_points}); +$node->safe_psql('postgres', q{CREATE EXTENSION pg_surgery}); +$node->safe_psql('postgres', q{CREATE EXTENSION pg_walinspect}); + +# Create a table with enough rows to span at least two heap pages. +# With a single int column (~28 bytes/tuple including header), each 8kB +# page holds ~226 tuples. 500 rows gives us pages 0, 1, and 2 -- all +# mapping to VM page 0 (which covers heap blocks 0 through 32767). +$node->safe_psql('postgres', q{ + CREATE TABLE test_vm (a int); + INSERT INTO test_vm SELECT generate_series(1, 500); +}); + +# VACUUM to set all pages all-visible. This is required for +# heap_force_kill to enter the VM modification path (PageIsAllVisible +# must be true). +$node->safe_psql('postgres', q{VACUUM test_vm}); + +# Verify that we have tuples on at least two distinct pages. +my $page0_count = $node->safe_psql('postgres', + q{SELECT count(*) FROM test_vm WHERE ctid >= '(0,1)' AND ctid < '(1,0)'}); +my $page1_count = $node->safe_psql('postgres', + q{SELECT count(*) FROM test_vm WHERE ctid >= '(1,1)' AND ctid < '(2,0)'}); +cmp_ok($page0_count, '>', 0, 'page 0 has tuples'); +cmp_ok($page1_count, '>', 0, 'page 1 has tuples'); + +# Record WAL position before the test so we can validate WAL afterward. +my $before_lsn = $node->safe_psql('postgres', + q{SELECT pg_current_wal_lsn()}); + +# Session 1: attach all three injection points (local to this PID) and +# run heap_force_kill on a tuple on page 0. +my $s1 = $node->background_psql('postgres'); +$s1->query_safe(q{ + SELECT injection_points_set_local(); + SELECT injection_points_attach('heap-force-kill-vm-pin', 'wait'); + SELECT injection_points_attach('heap-force-kill-before-vm-wal', 'wait'); + SELECT injection_points_attach('wal-insert-after-crc', 'wait'); +}); + +$s1->query_until( + qr/starting_heap_force_kill/, + q(\echo starting_heap_force_kill +SELECT heap_force_kill('test_vm'::regclass, ARRAY['(0,1)']::tid[]); +)); + +# ---- Pause 1: "heap-force-kill-vm-pin" (outside critical section) ---- +# Forces DSM initialization for the injection_wait machinery. +$node->wait_for_event('client backend', 'heap-force-kill-vm-pin'); +$node->safe_psql('postgres', + q{SELECT injection_points_wakeup('heap-force-kill-vm-pin')}); +ok(1, 'pause 1: DSM initialized via heap-force-kill-vm-pin'); + +# ---- Pause 2: "wal-insert-after-crc" (heap FPI's XLogInsert) ---- +# Session 1 entered the critical section, marked tuples dead, called +# visibilitymap_clear, re-acquired the VM lock, and is now inside +# XLogInsert for the HEAP page FPI. CRC has been computed over the +# heap page; the page is still locked via LockBufferForCleanup so no +# concurrent modification is possible. Just wake it up. +$node->wait_for_event('client backend', 'wal-insert-after-crc'); +$node->safe_psql('postgres', + q{SELECT injection_points_wakeup('wal-insert-after-crc')}); +ok(1, 'pause 2: heap FPI CRC computed, waking to continue'); + +# ---- Pause 3: "heap-force-kill-before-vm-wal" (between FPIs) ---- +# The heap FPI has been written to WAL. Session 1 is now between the +# heap FPI and the VM FPI. With the fix, the VM buffer is LOCKED +# (re-acquired after visibilitymap_clear). +$node->wait_for_event('client backend', 'heap-force-kill-before-vm-wal'); +ok(1, 'pause 3: session 1 paused between heap FPI and VM FPI'); + +# Wake session 1 from the barrier. It will proceed to +# log_newpage_buffer(vmbuf) -> XLogInsert -> XLogRecordAssemble (CRC) +# -> hit "wal-insert-after-crc" again. +$node->safe_psql('postgres', + q{SELECT injection_points_wakeup('heap-force-kill-before-vm-wal')}); + +# ---- Pause 4: "wal-insert-after-crc" (VM FPI's XLogInsert) ---- +# Session 1 has computed the CRC over the VM page in +# XLogRecordAssemble, but has NOT yet copied the data to WAL buffers +# in CopyXLogRecordToWAL. With the fix, the VM buffer is exclusively +# locked, so session 2 will block. Without the fix, session 2 can +# modify the VM page and cause a CRC mismatch. +$node->wait_for_event('client backend', 'wal-insert-after-crc'); +ok(1, 'pause 4: VM FPI CRC computed'); + +# Session 2: run heap_force_kill on page 1 (same VM page) IN THE +# BACKGROUND while session 1 is paused inside XLogInsert. This is +# the critical concurrent modification: +# +# - With the fix: session 2 blocks on LockBuffer(vmbuf) inside +# visibilitymap_clear because session 1 holds the VM lock. +# No corruption. +# +# - Without the fix: session 2 completes, modifying the VM page +# between session 1's CRC computation and data copy. +# CRC mismatch. +my $s2 = $node->background_psql('postgres'); +$s2->query_until( + qr/starting_s2/, + q(\echo starting_s2 +SELECT heap_force_kill('test_vm'::regclass, ARRAY['(1,1)']::tid[]); +\echo s2_done +)); + +# Give session 2 time to reach the VM buffer lock (or complete if +# unfixed). We cannot reliably detect blocking from Perl, so just +# sleep briefly. +use Time::HiRes qw(usleep); +usleep(500_000); + +# Now wake session 1. It will proceed to CopyXLogRecordToWAL. +# If session 2 already modified the VM page (unfixed), the CRC won't +# match. If session 2 is blocked (fixed), the page is stable. +$node->safe_psql('postgres', q{ + SELECT injection_points_detach('wal-insert-after-crc'); + SELECT injection_points_wakeup('wal-insert-after-crc'); +}); + +# Wait for both sessions to finish. +$s1->query_until(qr/heap_force_kill_done/, + q(\echo heap_force_kill_done +)); +$s1->quit; + +$s2->query_until(qr/s2_done/, ''); +$s2->quit; + +# Both tuples should be dead (not visible via sequential scan). +my $dead_count = $node->safe_psql('postgres', + q{SELECT count(*) FROM test_vm WHERE ctid IN ('(0,1)', '(1,1)')}); +is($dead_count, '0', 'both target tuples are dead'); + +# Validate WAL with pg_walinspect. With the fix (VM buffer locked +# through log_newpage_buffer), the VM FPI should have a valid CRC. +# Without the fix, this would report "incorrect resource manager data +# checksum" because session 2 modified the VM page between CRC +# computation and data copy. +# +# Force a CHECKPOINT first so all WAL is flushed and readable by +# pg_walinspect (it reads up to GetFlushRecPtr). +$node->safe_psql('postgres', q{CHECKPOINT}); + +my ($ret, $stdout, $stderr) = $node->psql('postgres', qq{ + SELECT count(*) + FROM pg_get_wal_records_info('$before_lsn', pg_current_wal_lsn()); +}); +is($ret, 0, 'pg_walinspect reads all WAL without error'); +unlike($stderr, qr/incorrect resource manager data checksum/i, + 'no CRC mismatch in WAL (VM buffer locking fix works)'); +cmp_ok($stdout, '>', 0, + 'WAL contains records from heap_force_kill operations'); + +$node->stop; + +done_testing(); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index f2e10b82b7d..8e56e289779 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -39,6 +39,7 @@ #include "replication/origin.h" #include "storage/bufmgr.h" #include "storage/proc.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" #include "utils/rel.h" @@ -530,6 +531,18 @@ XLogInsert(RmgrId rmid, uint8 info) &fpw_lsn, &num_fpi, &fpi_bytes, &topxid_included); + /* + * Injection point after CRC has been computed but before the record + * data is copied to WAL buffers. The XLogRecData chain points + * directly at shared-buffer pages; if a concurrent backend modifies + * those pages now, the CRC won't match the written bytes. + * + * Only fires when pre-loaded by the caller (e.g. pg_surgery's + * heap_force_kill) via INJECTION_POINT_LOAD before the critical + * section. + */ + INJECTION_POINT_CACHED("wal-insert-after-crc", NULL); + EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi, fpi_bytes, topxid_included); } while (!XLogRecPtrIsValid(EndPos)); -- 2.53.0