From 384ff0a5ee8ad40c7c47538f2bf4ccd46f4f20a0 Mon Sep 17 00:00:00 2001 From: Hou Zhijie Date: Tue, 17 Jan 2023 14:21:17 +0800 Subject: [PATCH] Fix the DROP DATABASE deadlocks with logical replication worker We decided to prevent cancel/die interrupts while creating the slot here because it is possible that, before the server finishes this command, a concurrent drop subscription happens which would complete without removing this slot, leading to the slot still existing until the walsender exits. But the slot will eventually get dropped at walsender exit time, and disallowing termination while executing a command over the network will cause a deadlock if a user tries to drop a database concurrently. So, we re-allow cancel/die interrupts while creating the slot and modify the test to wait for the number of slots to become zero, to avoid finding an ephemeral slot. --- src/backend/replication/logical/tablesync.c | 7 ------- src/test/subscription/t/004_sync.pl | 10 +++++++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index 38dfce7129..da030d22d6 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -1388,17 +1388,10 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) * Create a new permanent logical decoding slot. This slot will be used * for the catchup phase after COPY is done, so tell it to use the * snapshot to make the final data consistent. - * - * Prevent cancel/die interrupts while creating slot here because it is - * possible that before the server finishes this command, a concurrent - * drop subscription happens which would complete without removing this - * slot leading to a dangling slot on the server. */ - HOLD_INTERRUPTS(); walrcv_create_slot(LogRepWorkerWalRcvConn, slotname, false /* permanent */ , false /* two_phase */ , CRS_USE_SNAPSHOT, origin_startpos); - RESUME_INTERRUPTS(); /* * Setup replication origin tracking. 
The purpose of doing this before the diff --git a/src/test/subscription/t/004_sync.pl b/src/test/subscription/t/004_sync.pl index d50c43b8e3..7f47af9996 100644 --- a/src/test/subscription/t/004_sync.pl +++ b/src/test/subscription/t/004_sync.pl @@ -163,9 +163,13 @@ $result = $node_subscriber->poll_query_until('postgres', $started_query) # subscriber is stuck on data copy for constraint violation. $node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub"); -$result = $node_publisher->safe_psql('postgres', - "SELECT count(*) FROM pg_replication_slots"); -is($result, qq(0), +# When DROP SUBSCRIPTION tries to drop the tablesync slot, the slot may not +# have been created, which causes the slot to be created after the DROP +# SUBSCRIPTION finishes but eventually dropped at walsender exit time. So, to +# prevent being affected by such ephemeral tablesync slot, we wait until all +# the slots have been cleaned. +ok( $node_publisher->poll_query_until( + 'postgres', 'SELECT count(*) = 0 FROM pg_replication_slots'), 'DROP SUBSCRIPTION during error can clean up the slots on the publisher'); $node_subscriber->stop('fast'); -- 2.31.1