crash during cascaded foreign key update

Started by Amit Langote, almost 5 years ago, 5 messages
#1 Amit Langote
amitlangote09@gmail.com

With HEAD (I think v12 and greater), I see $subject when trying out
the following scenario:

-- in backend 1
create table p (a int primary key);
create table f (a int references p on update cascade deferrable
initially deferred);
insert into p values (1);
begin isolation level serializable;
insert into p values (3);

-- in another backend
insert into f values (1);

-- back in backend 1
update p set a = a + 1;
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.
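
For comparison, the same kind of conflict without the cascading foreign
key fails cleanly with a serialization error rather than crashing; a
minimal sketch (the table t here is just for illustration):

-- in backend 1
create table t (a int primary key);
insert into t values (1);
begin isolation level serializable;
select * from t;  -- takes the transaction snapshot

-- in another backend
update t set a = 2;

-- back in backend 1
update t set a = 3;
ERROR:  could not serialize access due to concurrent update

So the crash seems specific to the cascaded (RI-triggered) update path.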

I see the following backtrace:

#0 0x00007f747e6e2387 in raise () from /lib64/libc.so.6
#1 0x00007f747e6e3a78 in abort () from /lib64/libc.so.6
#2 0x0000000000ae056a in ExceptionalCondition (
conditionName=0xb67c10 "!ItemPointerEquals(&oldtup.t_self,
&oldtup.t_data->t_ctid)",
errorType=0xb66d89 "FailedAssertion", fileName=0xb66e68
"heapam.c", lineNumber=3560) at assert.c:69
#3 0x00000000004eed16 in heap_update (relation=0x7f747f569590,
otid=0x7ffe6f236ec0, newtup=0x1c214b8, cid=2,
crosscheck=0x1c317f8, wait=true, tmfd=0x7ffe6f236df0,
lockmode=0x7ffe6f236dec) at heapam.c:3560
#4 0x00000000004fdb52 in heapam_tuple_update
(relation=0x7f747f569590, otid=0x7ffe6f236ec0, slot=0x1c43fc8, cid=2,
snapshot=0x1c31a30, crosscheck=0x1c317f8, wait=true,
tmfd=0x7ffe6f236df0, lockmode=0x7ffe6f236dec,
update_indexes=0x7ffe6f236deb) at heapam_handler.c:327
#5 0x000000000075a7fc in table_tuple_update (rel=0x7f747f569590,
otid=0x7ffe6f236ec0, slot=0x1c43fc8, cid=2,
snapshot=0x1c31a30, crosscheck=0x1c317f8, wait=true,
tmfd=0x7ffe6f236df0, lockmode=0x7ffe6f236dec,
update_indexes=0x7ffe6f236deb) at ../../../src/include/access/tableam.h:1509
#6 0x000000000075cd20 in ExecUpdate (mtstate=0x1c42540,
resultRelInfo=0x1c42778, tupleid=0x7ffe6f236ec0, oldtuple=0x0,
slot=0x1c43fc8, planSlot=0x1c43e78, epqstate=0x1c42638,
estate=0x1c422d0, canSetTag=true) at nodeModifyTable.c:1498
#7 0x000000000075e0a3 in ExecModifyTable (pstate=0x1c42540) at
nodeModifyTable.c:2254
#8 0x000000000072674e in ExecProcNodeFirst (node=0x1c42540) at
execProcnode.c:456
#9 0x000000000071b13b in ExecProcNode (node=0x1c42540) at
../../../src/include/executor/executor.h:247
#10 0x000000000071d8f3 in ExecutePlan (estate=0x1c422d0,
planstate=0x1c42540, use_parallel_mode=false, operation=CMD_UPDATE,
sendTuples=false, numberTuples=0, direction=ForwardScanDirection,
dest=0xcb1440 <spi_printtupDR>, execute_once=true)
at execMain.c:1531
#11 0x000000000071b75f in standard_ExecutorRun (queryDesc=0x1c4cd18,
direction=ForwardScanDirection, count=0,
execute_once=true) at execMain.c:350
#12 0x000000000071b587 in ExecutorRun (queryDesc=0x1c4cd18,
direction=ForwardScanDirection, count=0, execute_once=true)
at execMain.c:294
#13 0x0000000000777a88 in _SPI_pquery (queryDesc=0x1c4cd18,
fire_triggers=false, tcount=0) at spi.c:2729
#14 0x00000000007774fa in _SPI_execute_plan (plan=0x1bf93d0,
paramLI=0x1c402c0, snapshot=0x1036840 <SecondarySnapshotData>,
crosscheck_snapshot=0x1c317f8, read_only=false,
no_snapshots=false, fire_triggers=false, tcount=0, caller_dest=0x0,
plan_owner=0x1bc1c10) at spi.c:2500
#15 0x00000000007740a9 in SPI_execute_snapshot (plan=0x1bf93d0,
Values=0x7ffe6f237340, Nulls=0x7ffe6f237300 " ",
snapshot=0x1036840 <SecondarySnapshotData>,
crosscheck_snapshot=0x1c317f8, read_only=false, fire_triggers=false,
tcount=0)
at spi.c:693
#16 0x0000000000a52724 in ri_PerformCheck (riinfo=0x1c3f2f8,
qkey=0x7ffe6f2378a0, qplan=0x1bf93d0, fk_rel=0x7f747f569590,
pk_rel=0x7f747f564a30, oldslot=0x1c042b8, newslot=0x1c04420,
detectNewRows=true, expect_OK=9) at ri_triggers.c:2517
#17 0x0000000000a4fee5 in RI_FKey_cascade_upd (fcinfo=0x7ffe6f237a60)
at ri_triggers.c:1163
#18 0x00000000006ea114 in ExecCallTriggerFunc
(trigdata=0x7ffe6f237b00, tgindx=1, finfo=0x1bc5be0, instr=0x0,
per_tuple_context=0x1c06760) at trigger.c:2141
#19 0x00000000006ed216 in AfterTriggerExecute (estate=0x1bc52f0,
event=0x1c196c0, relInfo=0x1bc5798, trigdesc=0x1bc59d0,
finfo=0x1bc5bb0, instr=0x0, per_tuple_context=0x1c06760,
trig_tuple_slot1=0x0, trig_tuple_slot2=0x0) at trigger.c:4030
#20 0x00000000006ed6e5 in afterTriggerInvokeEvents (events=0x1c31ac8,
firing_id=1, estate=0x1bc52f0, delete_ok=false)
at trigger.c:4244
#21 0x00000000006ede4c in AfterTriggerEndQuery (estate=0x1bc52f0) at
trigger.c:4581
#22 0x000000000071b90c in standard_ExecutorFinish
(queryDesc=0x1c13040) at execMain.c:425
#23 0x000000000071b803 in ExecutorFinish (queryDesc=0x1c13040) at execMain.c:393
#24 0x0000000000955ec6 in ProcessQuery (plan=0x1c51b30,
sourceText=0x1b424a0 "update p set a = a + 1;", params=0x0,
queryEnv=0x0, dest=0x1c51ca0, qc=0x7ffe6f237f20) at pquery.c:190
#25 0x0000000000957701 in PortalRunMulti (portal=0x1ba4980,
isTopLevel=true, setHoldSnapshot=false, dest=0x1c51ca0,
altdest=0x1c51ca0, qc=0x7ffe6f237f20) at pquery.c:1267
#26 0x0000000000956ca5 in PortalRun (portal=0x1ba4980,
count=9223372036854775807, isTopLevel=true, run_once=true,
dest=0x1c51ca0, altdest=0x1c51ca0, qc=0x7ffe6f237f20) at pquery.c:779
#27 0x0000000000950c2a in exec_simple_query (query_string=0x1b424a0
"update p set a = a + 1;") at postgres.c:1173
#28 0x0000000000954e06 in PostgresMain (argc=1, argv=0x7ffe6f2381b0,
dbname=0x1b6c868 "postgres", username=0x1b6c848 "amit")
at postgres.c:4327
#29 0x0000000000896188 in BackendRun (port=0x1b64130) at postmaster.c:4465
#30 0x0000000000895b0e in BackendStartup (port=0x1b64130) at postmaster.c:4187
#31 0x0000000000892174 in ServerLoop () at postmaster.c:1736
#32 0x0000000000891a4b in PostmasterMain (argc=3, argv=0x1b3cc20) at
postmaster.c:1408
#33 0x000000000079360f in main (argc=3, argv=0x1b3cc20) at main.c:209

I haven't looked into the failure in more detail yet, other than
noting that the failing Assert was added in commit 5db6df0c0117.
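
For reference, the crosscheck block containing the failing Assert
looks like this in heap_update() (quoting heapam.c around line 3560):

	if (crosscheck != InvalidSnapshot && result == TM_Ok)
	{
		/* Perform additional check for transaction-snapshot mode RI updates */
		if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
		{
			result = TM_Updated;
			Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
		}
	}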

--
Amit Langote
EDB: http://www.enterprisedb.com

#2 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Amit Langote (#1)
Re: crash during cascaded foreign key update

Amit Langote <amitlangote09@gmail.com> writes:

With HEAD (I think v12 and greater), I see $subject when trying out
the following scenario:

I wonder if this is related to

https://www.postgresql.org/message-id/89429.1584443208@antos

which we've still not done anything about.

regards, tom lane

#3 Ranier Vilela
ranier.vf@gmail.com
In reply to: Tom Lane (#2)
re: crash during cascaded foreign key update

#0 0x00007f747e6e2387 in raise () from /lib64/libc.so.6
#1 0x00007f747e6e3a78 in abort () from /lib64/libc.so.6
#2 0x0000000000ae056a in ExceptionalCondition (
conditionName=0xb67c10 "!ItemPointerEquals(&oldtup.t_self,
&oldtup.t_data->t_ctid)",
errorType=0xb66d89 "FailedAssertion", fileName=0xb66e68
"heapam.c", lineNumber=3560) at assert.c:69
#3 0x00000000004eed16 in heap_update (relation=0x7f747f569590,
otid=0x7ffe6f236ec0, newtup=0x1c214b8, cid=2,
crosscheck=0x1c317f8, wait=true, tmfd=0x7ffe6f236df0,
lockmode=0x7ffe6f236dec) at heapam.c:3560

I have this report from a static analysis tool:
heapam.c (9379):
Dereferencing of a potential null pointer 'oldtup.t_data'

regards,
Ranier Vilela

#4 Amit Langote
amitlangote09@gmail.com
In reply to: Tom Lane (#2)
Re: crash during cascaded foreign key update

On Tue, Mar 16, 2021 at 11:17 PM Tom Lane <tgl@sss.pgh.pa.us> wrote:

Amit Langote <amitlangote09@gmail.com> writes:

With HEAD (I think v12 and greater), I see $subject when trying out
the following scenario:

I wonder if this is related to

https://www.postgresql.org/message-id/89429.1584443208@antos

which we've still not done anything about.

Ah, indeed the same issue. Will read through that discussion first, thanks.

--
Amit Langote
EDB: http://www.enterprisedb.com

#5 Amit Langote
amitlangote09@gmail.com
In reply to: Amit Langote (#4)
1 attachment(s)
Re: crash during cascaded foreign key update

On Wed, Mar 17, 2021 at 11:01 AM Amit Langote <amitlangote09@gmail.com> wrote:

On Tue, Mar 16, 2021 at 11:17 PM Tom Lane <tgl@sss.pgh.pa.us> wrote:

Amit Langote <amitlangote09@gmail.com> writes:

With HEAD (I think v12 and greater), I see $subject when trying out
the following scenario:

Actually, the crash is reproducible in all supported versions (9.6
and later), although the failing Assert is different in branches
older than 12. The underlying issue is nevertheless the same.

I wonder if this is related to

https://www.postgresql.org/message-id/89429.1584443208@antos

which we've still not done anything about.

Ah, indeed the same issue. Will read through that discussion first, thanks.

So, it appears that there's no live bug that actually manifests in
real-world use cases, even though the failing Assert shows that the
crosscheck snapshot handling code has grown inconsistent with its
surrounding code since it was first added in commit 55d85f42a89,
which, if I read correctly, also seems to be the main conclusion of
the linked thread. I found the 2nd email in that thread very helpful
for understanding the problem.

As for a solution, how about making heap_update() and heap_delete()
themselves report the error immediately when a tuple fails the
crosscheck snapshot visibility test, instead of leaving it to the
caller, which would definitely report the error in this case AFAICS?
If we do that, we don't have to bother with figuring out sane result
codes for the crosscheck snapshot failure case. It also sounds like
that would be the easiest solution to back-patch.
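
With that approach, the failing UPDATE from the original report would
end with a clean error rather than an abort, along the lines of:

update p set a = a + 1;
ERROR:  could not serialize access due to concurrent update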

I've attached a patch for that, which also adds isolation tests for these cases.

--
Amit Langote
EDB: http://www.enterprisedb.com

Attachments:

v1-0001-Tweak-handling-of-serialization-failures-during-c.patch (application/octet-stream)
From ea414e6b16be36f28b4150bf1e9605f89b7edb15 Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Thu, 18 Mar 2021 14:01:27 +0900
Subject: [PATCH v1] Tweak handling of serialization failures during cascaded
 RI update/delete

The current coding for signaling a serializability failure during
cascaded update or delete of a row not visible to the start-of-the-
transaction snapshot involves a hack whereby the table AM forcefully
returns a status code that basically tells the executor that the row
has been concurrently updated such that the executor can issue an
error if needed.  However, the blocks of code that generate other
information to return about the failing row cannot do so in all cases
of the aforementioned hack, which can manifest as Assert failures in
those blocks.

To fix, simply error out immediately in the table AM itself rather
than in the caller, which avoids the gymnastics of returning the
correct failure information.

This also adds isolation tests for some affected cases.
---
 src/backend/access/heap/heapam.c              | 13 +++++----
 .../isolation/expected/fk-serializable.out    | 19 ++++++++++++
 src/test/isolation/isolation_schedule         |  1 +
 src/test/isolation/specs/fk-serializable.spec | 29 +++++++++++++++++++
 4 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 src/test/isolation/expected/fk-serializable.out
 create mode 100644 src/test/isolation/specs/fk-serializable.spec

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 7cb87f4a3b..baf6e2444e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2925,8 +2925,11 @@ l1:
 	if (crosscheck != InvalidSnapshot && result == TM_Ok)
 	{
 		/* Perform additional check for transaction-snapshot mode RI updates */
+		Assert(IsolationUsesXactSnapshot());
 		if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
-			result = TM_Updated;
+			ereport(ERROR,
+					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+					 errmsg("could not serialize access due to concurrent update")));
 	}
 
 	if (result != TM_Ok)
@@ -3554,11 +3557,11 @@ l2:
 	if (crosscheck != InvalidSnapshot && result == TM_Ok)
 	{
 		/* Perform additional check for transaction-snapshot mode RI updates */
+		Assert(IsolationUsesXactSnapshot());
 		if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
-		{
-			result = TM_Updated;
-			Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
-		}
+			ereport(ERROR,
+					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+					 errmsg("could not serialize access due to concurrent update")));
 	}
 
 	if (result != TM_Ok)
diff --git a/src/test/isolation/expected/fk-serializable.out b/src/test/isolation/expected/fk-serializable.out
new file mode 100644
index 0000000000..b7bc22a973
--- /dev/null
+++ b/src/test/isolation/expected/fk-serializable.out
@@ -0,0 +1,19 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1spk s2ifk s1dpk s1c
+step s1spk: TABLE fk;
+a              
+
+step s2ifk: INSERT INTO fk VALUES (1);
+step s1dpk: DELETE FROM pk;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s1spk s2ifk s1upk s1c
+step s1spk: TABLE fk;
+a              
+
+step s2ifk: INSERT INTO fk VALUES (1);
+step s1upk: UPDATE pk SET a = a + 1;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 5d6b79e66e..f97e45f0dc 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -29,6 +29,7 @@ test: fk-deadlock
 test: fk-deadlock2
 test: fk-partitioned-1
 test: fk-partitioned-2
+test: fk-serializable
 test: eval-plan-qual
 test: eval-plan-qual-trigger
 test: lock-update-delete
diff --git a/src/test/isolation/specs/fk-serializable.spec b/src/test/isolation/specs/fk-serializable.spec
new file mode 100644
index 0000000000..dfca06fe3d
--- /dev/null
+++ b/src/test/isolation/specs/fk-serializable.spec
@@ -0,0 +1,29 @@
+
+
+setup
+{
+ CREATE TABLE pk (a int PRIMARY KEY);
+ CREATE TABLE fk (a int REFERENCES pk ON DELETE CASCADE ON UPDATE CASCADE);
+ INSERT INTO pk VALUES (1);
+}
+
+teardown
+{
+ DROP TABLE fk, pk;
+}
+
+session "s1"
+
+setup	{ BEGIN ISOLATION LEVEL SERIALIZABLE; }
+step "s1spk" { TABLE fk; }
+step "s1dpk" { DELETE FROM pk; }
+step "s1upk" { UPDATE pk SET a = a + 1; }
+step "s1c"	{ COMMIT; }
+
+
+session "s2"
+
+step "s2ifk" { INSERT INTO fk VALUES (1); }
+
+permutation "s1spk" "s2ifk" "s1dpk" "s1c"
+permutation "s1spk" "s2ifk" "s1upk" "s1c"
-- 
2.24.1