From e522298608362461c8348ed883633668ad9ff187 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Mon, 15 Dec 2025 18:44:23 +0200
Subject: [PATCH v3 1/1] Fix bug in following update chain when locking a heap
 tuple

Author: Jasper Smit <jasper.smit@servicenow.com>
Discussion: https://www.postgresql.org/message-id/CAOG%2BRQ74x0q%3DkgBBQ%3DmezuvOeZBfSxM1qu_o0V28bwDz3dHxLw%40mail.gmail.com
---
 src/backend/access/heap/heapam.c              | 47 ++++++++----
 src/test/modules/injection_points/Makefile    |  3 +-
 .../expected/heap_lock_update.out             | 73 +++++++++++++++++++
 src/test/modules/injection_points/meson.build |  1 +
 .../specs/heap_lock_update.spec               | 71 ++++++++++++++++++
 5 files changed, 179 insertions(+), 16 deletions(-)
 create mode 100644 src/test/modules/injection_points/expected/heap_lock_update.out
 create mode 100644 src/test/modules/injection_points/specs/heap_lock_update.spec

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 225f9829f22..53b217ea50b 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -85,8 +85,11 @@ static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
 									  LockTupleMode mode, bool is_update,
 									  TransactionId *result_xmax, uint16 *result_infomask,
 									  uint16 *result_infomask2);
-static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
-										 const ItemPointerData *ctid, TransactionId xid,
+static TM_Result heap_lock_updated_tuple(Relation rel,
+										 uint16 prior_infomask,
+										 TransactionId prior_rawxmax,
+										 const ItemPointerData *prior_ctid,
+										 TransactionId xid,
 										 LockTupleMode mode);
 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
 								   uint16 *new_infomask2);
@@ -4816,11 +4819,12 @@ l3:
 				 * If there are updates, follow the update chain; bail out if
 				 * that cannot be done.
 				 */
-				if (follow_updates && updated)
+				if (follow_updates && updated && !ItemPointerEquals(&tuple->t_self, &t_ctid))
 				{
 					TM_Result	res;
 
-					res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+					res = heap_lock_updated_tuple(relation,
+												  infomask, xwait, &t_ctid,
 												  GetCurrentTransactionId(),
 												  mode);
 					if (res != TM_Ok)
@@ -5063,11 +5067,13 @@ l3:
 			}
 
 			/* if there are updates, follow the update chain */
-			if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+			if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
+				!ItemPointerEquals(&tuple->t_self, &t_ctid))
 			{
 				TM_Result	res;
 
-				res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+				res = heap_lock_updated_tuple(relation,
+											  infomask, xwait, &t_ctid,
 											  GetCurrentTransactionId(),
 											  mode);
 				if (res != TM_Ok)
@@ -5721,7 +5727,8 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
  * version as well.
  */
 static TM_Result
-heap_lock_updated_tuple_rec(Relation rel, const ItemPointerData *tid, TransactionId xid,
+heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax,
+							const ItemPointerData *tid, TransactionId xid,
 							LockTupleMode mode)
 {
 	TM_Result	result;
@@ -5734,7 +5741,6 @@ heap_lock_updated_tuple_rec(Relation rel, const ItemPointerData *tid, Transactio
 				old_infomask2;
 	TransactionId xmax,
 				new_xmax;
-	TransactionId priorXmax = InvalidTransactionId;
 	bool		cleared_all_frozen = false;
 	bool		pinned_desired_page;
 	Buffer		vmbuffer = InvalidBuffer;
@@ -6048,7 +6054,10 @@ out_unlocked:
  *		Follow update chain when locking an updated tuple, acquiring locks (row
  *		marks) on the updated versions.
  *
- * The initial tuple is assumed to be already locked.
+ * 'priorInfomask', 'priorRawXmax' and 'priorCtid' are the corresponding
+ * fields from the initial tuple.  We will update the tuples starting from the
+ * one that 'priorCtid' points to.  Locking the initial tuple where those
+ * values came from is the caller's responsibility.
  *
  * This function doesn't check visibility, it just unconditionally marks the
  * tuple(s) as locked.  If any tuple in the updated chain is being deleted
@@ -6066,16 +6075,22 @@ out_unlocked:
  * levels, because that would lead to a serializability failure.
  */
 static TM_Result
-heap_lock_updated_tuple(Relation rel, HeapTuple tuple, const ItemPointerData *ctid,
+heap_lock_updated_tuple(Relation rel,
+						uint16 prior_infomask,
+						TransactionId prior_raw_xmax,
+						const ItemPointerData *prior_ctid,
 						TransactionId xid, LockTupleMode mode)
 {
+	INJECTION_POINT("heap_lock_updated_tuple", NULL);
+
 	/*
-	 * If the tuple has not been updated, or has moved into another partition
-	 * (effectively a delete) stop here.
+	 * If the tuple has moved into another partition (effectively a delete)
+	 * stop here.
 	 */
-	if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
-		!ItemPointerEquals(&tuple->t_self, ctid))
+	if (!ItemPointerIndicatesMovedPartitions(prior_ctid))
 	{
+		TransactionId prior_xmax;
+
 		/*
 		 * If this is the first possibly-multixact-able operation in the
 		 * current transaction, set my per-backend OldestMemberMXactId
@@ -6087,7 +6102,9 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, const ItemPointerData *ct
 		 */
 		MultiXactIdSetOldestMember();
 
-		return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
+		prior_xmax = (prior_infomask & HEAP_XMAX_IS_MULTI) ?
+			MultiXactIdGetUpdateXid(prior_raw_xmax, prior_infomask) : prior_raw_xmax;
+		return heap_lock_updated_tuple_rec(rel, prior_xmax, prior_ctid, xid, mode);
 	}
 
 	/* nothing to lock */
diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile
index bfdb3f53377..cc9be6dcdd2 100644
--- a/src/test/modules/injection_points/Makefile
+++ b/src/test/modules/injection_points/Makefile
@@ -14,7 +14,8 @@ REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
 ISOLATION = basic \
 	    inplace \
-	    syscache-update-pruned
+	    syscache-update-pruned \
+	    heap_lock_update
 
 # Temporarily disabled because of flakiness
 #ISOLATION =+
diff --git a/src/test/modules/injection_points/expected/heap_lock_update.out b/src/test/modules/injection_points/expected/heap_lock_update.out
new file mode 100644
index 00000000000..9828e2b151b
--- /dev/null
+++ b/src/test/modules/injection_points/expected/heap_lock_update.out
@@ -0,0 +1,73 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert wake
+step s1begin: begin;
+step s1update: update t set id = 1000 where id = 1;
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: abort;
+step vacuum: VACUUM (TRUNCATE off);
+step reinsert: 
+	insert into t values (453) returning ctid; -- Should be (2,1)
+	update t set id = 454 where id = 453 returning ctid;
+
+ctid 
+-----
+(2,1)
+(1 row)
+
+ctid 
+-----
+(2,2)
+(1 row)
+
+step wake: 
+    SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+    SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert_and_lock wake
+step s1begin: begin;
+step s1update: update t set id = 1000 where id = 1;
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: abort;
+step vacuum: VACUUM (TRUNCATE off);
+step reinsert_and_lock: 
+        begin;
+	insert into t values (453) returning ctid; -- Should be (2,1)
+	select ctid, * from t where id = 1 for update;
+	commit;
+	update t set id = 454 where id = 453 returning ctid;
+
+ctid 
+-----
+(2,1)
+(1 row)
+
+ctid |id
+-----+--
+(0,1)| 1
+(1 row)
+
+ctid 
+-----
+(2,2)
+(1 row)
+
+step wake: 
+    SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+    SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build
index 493e11053dc..c9186ae04d1 100644
--- a/src/test/modules/injection_points/meson.build
+++ b/src/test/modules/injection_points/meson.build
@@ -46,6 +46,7 @@ tests += {
       'basic',
       'inplace',
       'syscache-update-pruned',
+      'heap_lock_update',
       # temporarily disabled because of flakiness
       # 'index-concurrently-upsert',
       # 'index-concurrently-upsert-predicate',
diff --git a/src/test/modules/injection_points/specs/heap_lock_update.spec b/src/test/modules/injection_points/specs/heap_lock_update.spec
new file mode 100644
index 00000000000..93887312815
--- /dev/null
+++ b/src/test/modules/injection_points/specs/heap_lock_update.spec
@@ -0,0 +1,71 @@
+# Test race condition in tuple locking
+#
+# This is a reproducer for the bug reported at:
+# https://www.postgresql.org/message-id/CAOG%2BRQ74x0q%3DkgBBQ%3DmezuvOeZBfSxM1qu_o0V28bwDz3dHxLw%40mail.gmail.com
+
+setup
+{
+    CREATE EXTENSION injection_points;
+
+    create table t (id int);
+    insert into t (select generate_series(1, 452));
+}
+teardown
+{
+    drop table t;
+    DROP EXTENSION injection_points;
+}
+
+session s1
+step s1begin	{ begin; }
+step s1update   { update t set id = 1000 where id = 1; }
+step s1abort	{ abort; }
+step vacuum	{ VACUUM (TRUNCATE off); }
+step reinsert   {
+	insert into t values (453) returning ctid; -- Should be (2,1)
+	update t set id = 454 where id = 453 returning ctid;
+}
+
+# Same as 'reinsert', but we also stamp the original tuple with the
+# same 'xmax' as the re-inserted one.
+step reinsert_and_lock {
+        begin;
+	insert into t values (453) returning ctid; -- Should be (2,1)
+	select ctid, * from t where id = 1 for update;
+	commit;
+	update t set id = 454 where id = 453 returning ctid;
+}
+
+step wake	{
+    SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+    SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+}
+
+session s2
+setup	{
+	SELECT FROM injection_points_set_local();
+	SELECT FROM injection_points_attach('heap_lock_updated_tuple', 'wait');
+}
+step s2lock	{ select * from t where id = 1 for update; }
+
+# Basic case
+permutation
+	s1begin
+	s1update
+	s2lock
+	s1abort
+	vacuum
+	reinsert
+	wake(s2lock)
+
+# Variant where the re-inserted tuple is also locked by the inserting
+# transaction. This failed an earlier version of the fix during
+# development.
+permutation
+	s1begin
+	s1update
+	s2lock
+	s1abort
+	vacuum
+	reinsert_and_lock
+	wake(s2lock)
-- 
2.47.3

