logical decoding build wrong snapshot with subtransactions
This issue has been reported in the <pgsql-bugs> list at the below link, but
received almost no response:
/messages/by-id/18280-4c8060178cb41750@postgresql.org
Hoping for some feedback from kernel hackers, thanks!
Hi, hackers,
I've encountered a problem with logical decoding history snapshots. The
specific error message is: "ERROR: could not map filenode "base/5/16390" to
relation OID".
If a subtransaction that modified the catalog ends before the
restart_lsn of the logical replication slot, and the commit WAL record of
its top transaction is after the restart_lsn, the WAL record related to the
subtransaction won't be decoded during logical decoding. Therefore, the
subtransaction won't be marked as having modified the catalog, resulting in
its absence from the snapshot's committed list.
The issue seems to be caused by SnapBuildXidSetCatalogChanges
(introduced in 272248a) skipping checks for subtransactions when the top
transaction is marked as containing catalog changes.
The following steps can reproduce the problem (I increased the value of
LOG_SNAPSHOT_INTERVAL_MS to avoid the impact of bgwriter writing
XLOG_RUNNING_XACTS WAL records):
session 1:
```
CREATE TABLE tbl1 (val1 integer, val2 integer);
CREATE TABLE tbl1_part (val1 integer) PARTITION BY RANGE (val1);
SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding');
BEGIN;
SAVEPOINT sp1;
CREATE TABLE tbl1_part_p1 PARTITION OF tbl1_part FOR VALUES FROM (0) TO (10);
RELEASE SAVEPOINT sp1;
```
session 2:
```
CHECKPOINT;
```
session 1:
```
CREATE TABLE tbl1_part_p2 PARTITION OF tbl1_part FOR VALUES FROM (10) TO (20);
COMMIT;
BEGIN;
TRUNCATE tbl1;
```
session 2:
```
CHECKPOINT;
SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0');
INSERT INTO tbl1_part VALUES (1);
SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0');
```
To fix this issue, it is sufficient to remove the condition check for
ReorderBufferXidHasCatalogChanges in SnapBuildXidSetCatalogChanges.
This fix may add subtransactions that didn't change the catalog to the commit
list, which seems like a false positive. However, this is acceptable since
we only use the snapshot built during decoding to read system catalogs, as
stated in 272248a's commit message.
I have verified that the patch in the attachment resolves the issues
mentioned, and I added some test cases.
I am eager to hear your suggestions on this!
Best Regards,
Fei Changhong
Alibaba Cloud Computing Ltd.
Attachments:
fix_wrong_snapshot_for_logical_decoding.patchapplication/octet-stream; charset=gb18030; name=fix_wrong_snapshot_for_logical_decoding.patchDownload
diff --git a/contrib/test_decoding/expected/catalog_change_snapshot.out b/contrib/test_decoding/expected/catalog_change_snapshot.out
index b33e49c0b1..a2eadc5418 100644
--- a/contrib/test_decoding/expected/catalog_change_snapshot.out
+++ b/contrib/test_decoding/expected/catalog_change_snapshot.out
@@ -132,3 +132,41 @@ COMMIT
stop
(1 row)
+
+starting permutation: s0_init s0_begin s0_savepoint s0_create_part1 s0_savepoint_release s1_checkpoint s0_create_part2 s0_commit s0_begin s0_truncate s1_checkpoint s1_get_changes s0_insert_part s1_get_changes s0_commit
+step s0_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding');
+?column?
+--------
+init
+(1 row)
+
+step s0_begin: BEGIN;
+step s0_savepoint: SAVEPOINT sp1;
+step s0_create_part1: CREATE TABLE tbl1_part_p1 PARTITION OF tbl1_part FOR VALUES FROM (0) TO (10);
+step s0_savepoint_release: RELEASE SAVEPOINT sp1;
+step s1_checkpoint: CHECKPOINT;
+step s0_create_part2: CREATE TABLE tbl1_part_p2 PARTITION OF tbl1_part FOR VALUES FROM (10) TO (20);
+step s0_commit: COMMIT;
+step s0_begin: BEGIN;
+step s0_truncate: TRUNCATE tbl1;
+step s1_checkpoint: CHECKPOINT;
+step s1_get_changes: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0');
+data
+----
+(0 rows)
+
+step s0_insert_part: INSERT INTO tbl1_part VALUES (1);
+step s1_get_changes: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0');
+data
+--------------------------------------------------
+BEGIN
+table public.tbl1_part_p1: INSERT: val1[integer]:1
+COMMIT
+(3 rows)
+
+step s0_commit: COMMIT;
+?column?
+--------
+stop
+(1 row)
+
diff --git a/contrib/test_decoding/specs/catalog_change_snapshot.spec b/contrib/test_decoding/specs/catalog_change_snapshot.spec
index 770dbd642d..0a17a0dd17 100644
--- a/contrib/test_decoding/specs/catalog_change_snapshot.spec
+++ b/contrib/test_decoding/specs/catalog_change_snapshot.spec
@@ -3,13 +3,16 @@
setup
{
DROP TABLE IF EXISTS tbl1;
+ DROP TABLE IF EXISTS tbl1_part;
CREATE TABLE tbl1 (val1 integer, val2 integer);
+ CREATE TABLE tbl1_part (val1 integer) PARTITION BY RANGE (val1);
CREATE TABLE user_cat (val1 integer) WITH (user_catalog_table = true);
}
teardown
{
DROP TABLE tbl1;
+ DROP TABLE tbl1_part;
DROP TABLE user_cat;
SELECT 'stop' FROM pg_drop_replication_slot('isolation_slot');
}
@@ -19,15 +22,19 @@ setup { SET synchronous_commit=on; }
step "s0_init" { SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); }
step "s0_begin" { BEGIN; }
step "s0_savepoint" { SAVEPOINT sp1; }
+step "s0_savepoint_release" { RELEASE SAVEPOINT sp1; }
step "s0_truncate" { TRUNCATE tbl1; }
step "s0_insert" { INSERT INTO tbl1 VALUES (1); }
step "s0_insert2" { INSERT INTO user_cat VALUES (1); }
+step "s0_create_part1" { CREATE TABLE tbl1_part_p1 PARTITION OF tbl1_part FOR VALUES FROM (0) TO (10); }
+step "s0_create_part2" { CREATE TABLE tbl1_part_p2 PARTITION OF tbl1_part FOR VALUES FROM (10) TO (20); }
step "s0_commit" { COMMIT; }
session "s1"
setup { SET synchronous_commit=on; }
step "s1_checkpoint" { CHECKPOINT; }
step "s1_get_changes" { SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0'); }
+step "s0_insert_part" { INSERT INTO tbl1_part VALUES (1); }
# For the transaction that TRUNCATEd the table tbl1, the last decoding decodes
# only its COMMIT record, because it starts from the RUNNING_XACTS record emitted
@@ -60,3 +67,9 @@ permutation "s0_init" "s0_begin" "s0_savepoint" "s0_insert" "s1_checkpoint" "s1_
# to skip this xact but ensure that corresponding invalidation messages
# get processed.
permutation "s0_init" "s0_begin" "s0_savepoint" "s0_insert" "s1_checkpoint" "s1_get_changes" "s0_truncate" "s0_commit" "s0_begin" "s0_insert" "s1_checkpoint" "s1_get_changes" "s0_commit" "s1_get_changes"
+
+# For the subtransaction that created the table tbl1_part_p1, the last decoding does
+# not decode any WAL records for it, because it ends before the RUNNING_XACTS record.
+# When decoding the COMMIT record of its parent transaction, the subtransaction also
+# needs to be added to the committed list because it modified the catalog.
+permutation "s0_init" "s0_begin" "s0_savepoint" "s0_create_part1" "s0_savepoint_release" "s1_checkpoint" "s0_create_part2" "s0_commit" "s0_begin" "s0_truncate" "s1_checkpoint" "s1_get_changes" "s0_insert_part" "s1_get_changes" "s0_commit"
\ No newline at end of file
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 3eaaaac6b3..91cc70e14b 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -2126,11 +2126,12 @@ SnapBuildXidSetCatalogChanges(SnapBuild *builder, TransactionId xid, int subxcnt
TransactionId *subxacts, XLogRecPtr lsn)
{
/*
- * Skip if there is no initial running xacts information or the
- * transaction is already marked as containing catalog changes.
+ * Skip if there is no initial running xacts information.
+ * Even if the transaction has been marked as containing catalog changes,
+ * it cannot be skipped because its subtransactions that modified the
+ * catalog may not be marked.
*/
- if (NInitialRunningXacts == 0 ||
- ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
+ if (NInitialRunningXacts == 0)
return;
if (bsearch(&xid, InitialRunningXacts, NInitialRunningXacts,
On Fri, Jan 19, 2024 at 1:05 PM feichanghong <feichanghong@qq.com> wrote:
This issue has been reported in the <pgsql-bugs> list at the below link, but
received almost no response:
/messages/by-id/18280-4c8060178cb41750@postgresql.org
Hoping for some feedback from kernel hackers, thanks!
Thanks for the report and analysis. I have responded to the original
thread. Let's discuss it there.
--
With Regards,
Amit Kapila.