diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index c4c841c..63fd001 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -615,7 +615,9 @@ ExecInsert(ModifyTableState *mtstate, * foreign table, tupleid is invalid; the FDW has to figure out * which row to delete using data from the planSlot. oldtuple is * passed to foreign table triggers; it is NULL when the foreign - * table has no relevant triggers. + * table has no relevant triggers. When this DELETE is a part of + * an UPDATE of partition-key, then the slot returned by + * EvalPlanQual() is passed back using output parameter epqslot. * * Returns RETURNING result if any, otherwise NULL. * ---------------------------------------------------------------- @@ -628,6 +630,7 @@ ExecDelete(ModifyTableState *mtstate, EPQState *epqstate, EState *estate, bool *tupleDeleted, + TupleTableSlot **epqslot, bool processReturning, bool canSetTag, bool changingPart) @@ -775,19 +778,37 @@ ldelete:; if (!ItemPointerEquals(tupleid, &hufd.ctid)) { - TupleTableSlot *epqslot; + TupleTableSlot *my_epqslot; - epqslot = EvalPlanQual(estate, + my_epqslot = EvalPlanQual(estate, epqstate, resultRelationDesc, resultRelInfo->ri_RangeTableIndex, LockTupleExclusive, &hufd.ctid, hufd.xmax); - if (!TupIsNull(epqslot)) + if (!TupIsNull(my_epqslot)) { *tupleid = hufd.ctid; - goto ldelete; + + /* + * If this is part of an UPDATE of partition-key, the + * epq tuple will contain the changes from this + * transaction over and above the updates done by the + * other transaction. The caller should now use this + * tuple as its NEW tuple, rather than the earlier NEW + * tuple. + */ + if (epqslot) + { + *epqslot = my_epqslot; + return NULL; + } + else + { + /* Normal DELETE: So delete the re-fetched row. */ + goto ldelete; + } } } /* tuple already deleted; nothing to do */ @@ -1058,6 +1079,7 @@ lreplace:; { bool tuple_deleted; TupleTableSlot *ret_slot; + TupleTableSlot *epqslot = NULL; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; int map_index; TupleConversionMap *tupconv_map; @@ -1087,7 +1109,7 @@ lreplace:; * processing. We want to return rows from INSERT. */ ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, - estate, &tuple_deleted, false, + estate, &tuple_deleted, &epqslot, false, false /* canSetTag */ , true /* changingPart */ ); /* @@ -1111,7 +1133,23 @@ lreplace:; * resurrect it. */ if (!tuple_deleted) - return NULL; + { + /* + * epqslot will be typically NULL. But when ExecDelete() finds + * that another transaction has concurrently updated the same + * row, it re-fetches the row, skips the delete, and epqslot is + * set to the re-fetched tuple slot. In that case, we need to + * do all the checks again. + */ + if (TupIsNull(epqslot)) + return NULL; + else + { + slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot); + tuple = ExecMaterializeSlot(slot); + goto lreplace; + } + } /* * Updates set the transition capture map only when a new subplan @@ -2145,7 +2183,7 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, &node->mt_epqstate, estate, - NULL, true, node->canSetTag, + NULL, NULL, true, node->canSetTag, false /* changingPart */ ); break; default: diff --git a/src/test/isolation/expected/partition-key-update-1.out b/src/test/isolation/expected/partition-key-update-1.out index 37fe6a7..3c23c69 100644 --- a/src/test/isolation/expected/partition-key-update-1.out +++ b/src/test/isolation/expected/partition-key-update-1.out @@ -55,6 +55,19 @@ step s1u2: <... completed> error in steps s2c s1u2: ERROR: tuple to be locked was already moved to another partition due to concurrent update step s1c: COMMIT; +starting permutation: s1b s2b s2u3 s1u3 s2c s1c s1s +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2u3: UPDATE foo SET b = b || ' update2' WHERE a = 1; +step s1u3: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE a=1; +step s2c: COMMIT; +step s1u3: <... completed> +step s1c: COMMIT; +step s1s: SELECT * FROM foo ORDER BY 1; +a b + +2 ABC update2 update1 + starting permutation: s1b s2b s1u3pc s2i s1c s2c step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; diff --git a/src/test/isolation/specs/partition-key-update-1.spec b/src/test/isolation/specs/partition-key-update-1.spec index 8393f47..eee0c42 100644 --- a/src/test/isolation/specs/partition-key-update-1.spec +++ b/src/test/isolation/specs/partition-key-update-1.spec @@ -1,5 +1,9 @@ -# Test that an error if thrown if the target row has been moved to a -# different partition by a concurrent session. +# Test concurrency scenarios when a row is being moved to a different partition: +# +# 1. An error is thrown if the target row has been moved to a different +# partition by a concurrent session. +# 2. Row that ends up in a new partition should contain changes made by +# a concurrent transaction. setup { @@ -50,8 +54,10 @@ session "s1" step "s1b" { BEGIN ISOLATION LEVEL READ COMMITTED; } step "s1u" { UPDATE foo SET a=2 WHERE a=1; } step "s1u2" { UPDATE footrg SET b='EFG' WHERE a=1; } +step "s1u3" { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE a=1; } step "s1u3pc" { UPDATE foo_range_parted SET a=11 WHERE a=7; } step "s1u3npc" { UPDATE foo_range_parted SET b='XYZ' WHERE a=7; } +step "s1s" { SELECT * FROM foo ORDER BY 1; } step "s1c" { COMMIT; } step "s1r" { ROLLBACK; } @@ -59,6 +65,7 @@ session "s2" step "s2b" { BEGIN ISOLATION LEVEL READ COMMITTED; } step "s2u" { UPDATE foo SET b='EFG' WHERE a=1; } step "s2u2" { UPDATE footrg SET b='XYZ' WHERE a=1; } +step "s2u3" { UPDATE foo SET b = b || ' update2' WHERE a = 1; } step "s2i" { INSERT INTO bar VALUES(7); } step "s2d" { DELETE FROM foo WHERE a=1; } step "s2c" { COMMIT; } @@ -73,6 +80,11 @@ permutation "s1b" "s2b" "s1u2" "s1c" "s2u2" "s2c" permutation "s1b" "s2b" "s1u2" "s2u2" "s1c" "s2c" permutation "s1b" "s2b" "s2u2" "s1u2" "s2c" "s1c" +# Transaction A is moving a row into another partition, but is waiting for +# another transaction B that is updating the original row. The row that ends up +# in the new partition should contain the changes made by transaction B. +permutation "s1b" "s2b" "s2u3" "s1u3" "s2c" "s1c" "s1s" + # Concurrency error from ExecLockRows # test waiting for moved row itself permutation "s1b" "s2b" "s1u3pc" "s2i" "s1c" "s2c"