From cf97a64f4091f105b94595d7027610fc4335d979 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Thu, 28 Nov 2013 19:17:21 -0300
Subject: [PATCH 5/5] Fix a couple of bugs in MultiXactId freezing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look
into a multixact to check the members against cutoff_xid.  This means
that a very old Xid could survive hidden within a multi, possibly
outliving its CLOG storage.  In the distant future, this would cause
clog lookup failures:
ERROR:  could not access status of transaction 3883960912
DETAIL:  Could not open file "pg_clog/0E78": No such file or directory.

This mostly was problematic when the updating transaction aborted, since
in that case the row wouldn't get pruned away earlier in vacuum and the
multixact could possibly survive for a long time.  In many cases, data
that is inaccessible for this reason can be brought back
heuristically.

As a second bug, heap_freeze_tuple() didn't properly handle multixacts
that need to be frozen according to cutoff_multi, but whose updater xid
is still alive.  Instead of preserving the update Xid, it just set Xmax
invalid, which leads to both old and new tuple versions becoming
visible.  This is pretty rare in practice, but a real threat
nonetheless.  Existing corrupted rows, unfortunately, cannot be repaired
in an automated fashion.

Following code analysis caused by bug report by J Smith in message
CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com
and privately by F-Secure.

Backpatch to 9.3, where freezing of MultiXactIds was introduced.

Analysis and patch by Andres Freund, with some tweaks by Álvaro.
---
 src/backend/access/heap/heapam.c       |  151 ++++++++++++++++++++++++++++----
 src/backend/access/transam/multixact.c |   11 ++-
 2 files changed, 140 insertions(+), 22 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index d5560d2..2a3f872 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -5264,6 +5264,8 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
  * so we need no external state checks to decide what to do.  (This is good
  * because this function is applied during WAL recovery, when we don't have
  * access to any such state, and can't depend on the hint bits to be set.)
+ * There is an exception we make which is to assume GetMultiXactIdMembers can
+ * be called during recovery.
  *
  * Similarly, cutoff_multi must be less than or equal to the smallest
  * MultiXactId used by any transaction currently open.
@@ -5285,8 +5287,10 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 				  MultiXactId cutoff_multi)
 {
 	bool		changed = false;
+	bool		freeze_xmax = false;
 	TransactionId xid;
 
+	/* Process xmin */
 	xid = HeapTupleHeaderGetXmin(tuple);
 	if (TransactionIdIsNormal(xid) &&
 		TransactionIdPrecedes(xid, cutoff_xid))
@@ -5303,16 +5307,97 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 	}
 
 	/*
-	 * Note that this code handles IS_MULTI Xmax values, too, but only to mark
-	 * the tuple as not updated if the multixact is below the cutoff Multixact
-	 * given; it doesn't remove dead members of a very old multixact.
+	 * Process xmax.  To thoroughly examine the current Xmax value we need to
+	 * resolve a MultiXactId to its member Xids, in case some of them are
+	 * below the given cutoff for Xids.  In that case, those values might need
+	 * freezing, too.  Also, if a multi needs freezing, we cannot simply take
+	 * it out --- if there's a live updater Xid, it needs to be kept.
 	 */
 	xid = HeapTupleHeaderGetRawXmax(tuple);
-	if ((tuple->t_infomask & HEAP_XMAX_IS_MULTI) ?
-		(MultiXactIdIsValid(xid) &&
-		 MultiXactIdPrecedes(xid, cutoff_multi)) :
-		(TransactionIdIsNormal(xid) &&
-		 TransactionIdPrecedes(xid, cutoff_xid)))
+
+	if (tuple->t_infomask & HEAP_XMAX_INVALID)
+		;
+	else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+	{
+		/* We shouldn't get an invalid multi, but check to avoid bad mojo */
+		if (!MultiXactIdIsValid(xid))
+			freeze_xmax = true;
+		else if (MultiXactIdPrecedes(xid, cutoff_multi))
+		{
+			/*
+			 * This old multi cannot possibly be running.  If it was a locker
+			 * only, it can be removed without much further thought; but if it
+			 * contained an update, we need to preserve it.
+			 */
+			if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+				freeze_xmax = true;
+			else
+			{
+				TransactionId update_xid;
+
+				update_xid = HeapTupleGetUpdateXid(tuple);
+
+				/*
+				 * The multixact has an update hidden within.  Careful: if we
+				 * were to simply set freeze_xmax to true here, we'd end up
+				 * with both the old and new tuples being visible.  Handle
+				 * this by storing the update Xid directly in Xmax and
+				 * removing the IS_MULTI bit.
+				 *
+				 * Note the update Xid cannot possibly be older than
+				 * cutoff_xid; if it were, we wouldn't be here: if committed,
+				 * the tuple would have been pruned, and if aborted, the Xmax
+				 * would have been marked Invalid by HeapTupleSatisfiesVacuum.
+				 * (Not in-progress either, because then cutoff_xid would be
+				 * newer.)
+				 */
+				Assert(!TransactionIdPrecedes(update_xid, cutoff_xid));
+				Assert(InRecovery || !TransactionIdIsInProgress(update_xid));
+				tuple->t_infomask &= ~HEAP_XMAX_BITS;
+				HeapTupleHeaderSetXmax(tuple, update_xid);
+			}
+		}
+		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+		{
+			/* newer than the cutoff, so don't touch it */
+			;
+		}
+		else
+		{
+			TransactionId	update_xid;
+
+			/*
+			 * This is a multixact which is not marked LOCK_ONLY, but which
+			 * is newer than the cutoff_multi.  If the update_xid is below the
+			 * cutoff_xid point, then we can just freeze the tuple removing
+			 * the Xmax.  This seems simple, but there are several underlying
+			 * assumptions:
+			 *
+			 * 1. There cannot possibly be any live locking members remaining
+			 * in the multixact.  This is because if they were alive, the
+			 * update's Xid would have been considered (via the lockers
+			 * snapshot's Xmin) as part of the cutoff_xid.
+			 *
+			 * 2. We don't create new MultiXacts via MultiXactIdExpand() that
+			 * include a very old aborted update Xid: in that function we
+			 * check whether the update is committed or in-progress.
+			 *
+			 * 3. A tuple marked by a multixact containing a very old
+			 * committed update Xid would have been pruned away by vacuum; we
+			 * wouldn't be freezing this tuple at all.
+			 */
+			update_xid = HeapTupleGetUpdateXid(tuple);
+			if (TransactionIdPrecedes(update_xid, cutoff_xid))
+				freeze_xmax = true;
+		}
+	}
+	else if (TransactionIdIsNormal(xid) &&
+			 TransactionIdPrecedes(xid, cutoff_xid))
+	{
+		freeze_xmax = true;
+	}
+
+	if (freeze_xmax)
 	{
 		HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
 
@@ -5644,24 +5729,54 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
 		TransactionIdPrecedes(xid, cutoff_xid))
 		return true;
 
-	if (!(tuple->t_infomask & HEAP_XMAX_INVALID))
+	if (tuple->t_infomask & HEAP_XMAX_INVALID)
+		;
+	else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+		MultiXactId multi;
+
+		multi = HeapTupleHeaderGetRawXmax(tuple);
+		if (MultiXactIdPrecedes(multi, cutoff_multi))
+			return true;
+		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
 		{
-			xid = HeapTupleHeaderGetRawXmax(tuple);
-			if (TransactionIdIsNormal(xid) &&
-				TransactionIdPrecedes(xid, cutoff_xid))
-				return true;
+			/* only-locker multis don't need internal examination */
+			;
 		}
 		else
 		{
-			MultiXactId multi;
+			MultiXactMember *members;
+			int			nmembers;
+			int			i;
 
-			multi = HeapTupleHeaderGetRawXmax(tuple);
-			if (MultiXactIdPrecedes(multi, cutoff_multi))
-				return true;
+			nmembers = GetMultiXactIdMembers(xid, &members, true);
+			for (i = 0; i < nmembers; i++)
+			{
+				TransactionId member = members[i].xid;
+
+				Assert(TransactionIdIsNormal(member));
+
+				/* we don't care about lockers */
+				if (!ISUPDATE_from_mxstatus(members[i].status))
+					continue;
+
+				if (TransactionIdPrecedes(member, cutoff_xid))
+				{
+					pfree(members);
+					return true;
+				}
+			}
+			if (members)
+				pfree(members);
 		}
 	}
+	else
+	{
+		xid = HeapTupleHeaderGetRawXmax(tuple);
+		if (TransactionIdIsNormal(xid) &&
+			TransactionIdPrecedes(xid, cutoff_xid))
+			return true;
+	}
 
 	if (tuple->t_infomask & HEAP_MOVED)
 	{
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index ca702ee..ff1621c 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -434,11 +434,14 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 	 * Determine which of the members of the MultiXactId are still of
 	 * interest. This is any running transaction, and also any transaction
 	 * that grabbed something stronger than just a lock and was committed. (An
-	 * update that aborted is of no interest here.)
+	 * update that aborted is of no interest here; and having more than one
+	 * update Xid in a multixact would cause errors elsewhere.)
 	 *
-	 * (Removing dead members is just an optimization, but a useful one. Note
-	 * we have the same race condition here as above: j could be 0 at the end
-	 * of the loop.)
+	 * Removing dead members is not just an optimization: freezing of tuples
+	 * whose Xmax are multis depends on this behavior.
+	 *
+	 * Note we have the same race condition here as above: j could be 0 at the
+	 * end of the loop.
 	 */
 	newMembers = (MultiXactMember *)
 		palloc(sizeof(MultiXactMember) * (nmembers + 1));
-- 
1.7.10.4

