>From c55c3c4a427fbf8a869260f7a49a24d3efbfd6b2 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Thu, 28 Nov 2013 19:17:21 -0300
Subject: [PATCH 4/4] Fix a couple of bugs in MultiXactId freezing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look
into a multixact to check the members against cutoff_xid.  This means
that a very old Xid could survive hidden within a multi, possibly
outliving its CLOG storage.  In the distant future, this would cause
clog lookup failures:
ERROR:  could not access status of transaction 3883960912
DETAIL:  Could not open file "pg_clog/0E78": No such file or directory.

This mostly was problematic when the updating transaction aborted, since
in that case the row wouldn't get pruned away earlier in vacuum and the
multixact could possibly survive for a long time.  In many cases, data
that is inaccessible for this reason way can be brought back
heuristically.

As a second bug, heap_freeze_tuple() didn't properly handle multixacts
that need to be frozen according to cutoff_multi, but whose updater xid
is still alive.  Instead of preserving the update Xid, it just set Xmax
invalid, which leads to both old and new tuple versions becoming
visible.  This is pretty rare in practice, but a real threat
nonetheless.  Existing corrupted rows, unfortunately, cannot be repaired
in an automated fashion.

Following code analysis caused by bug report by J Smith in message
CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com
and privately by F-Secure.

Backpatch to 9.3, where freezing of MultiXactIds was introduced.

Analysis and patch by Andres Freund, with some tweaks by Ãlvaro.
---
 src/backend/access/heap/heapam.c       |  183 ++++++++++++++++++++++++++++----
 src/backend/access/transam/multixact.c |    4 +
 2 files changed, 169 insertions(+), 18 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f35afdd..ce12f41 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -5260,6 +5260,8 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
  * so we need no external state checks to decide what to do.  (This is good
  * because this function is applied during WAL recovery, when we don't have
  * access to any such state, and can't depend on the hint bits to be set.)
+ * There is an exception we make which is to assume GetMultiXactIdMembers can
+ * be called during recovery.
  *
  * Similarly, cutoff_multi must be less than or equal to the smallest
  * MultiXactId used by any transaction currently open.
@@ -5281,8 +5283,10 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 				  MultiXactId cutoff_multi)
 {
 	bool		changed = false;
+	bool		freeze_xmax = false;
 	TransactionId xid;
 
+	/* Process xmin */
 	xid = HeapTupleHeaderGetXmin(tuple);
 	if (TransactionIdIsNormal(xid) &&
 		TransactionIdPrecedes(xid, cutoff_xid))
@@ -5299,16 +5303,130 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 	}
 
 	/*
-	 * Note that this code handles IS_MULTI Xmax values, too, but only to mark
-	 * the tuple as not updated if the multixact is below the cutoff Multixact
-	 * given; it doesn't remove dead members of a very old multixact.
+	 * Process xmax.  To thoroughly examine the current Xmax value we need to
+	 * resolve a MultiXactId to its member Xids, in case some of them are below
+	 * the given cutoff for Xids.  In that case, those values might need
+	 * freezing, too.  Also, if a multi needs freezing, we cannot simply take
+	 * it out --- if there's a live updater Xid, it needs to be kept.
 	 */
 	xid = HeapTupleHeaderGetRawXmax(tuple);
-	if ((tuple->t_infomask & HEAP_XMAX_IS_MULTI) ?
-		(MultiXactIdIsValid(xid) &&
-		 MultiXactIdPrecedes(xid, cutoff_multi)) :
-		(TransactionIdIsNormal(xid) &&
-		 TransactionIdPrecedes(xid, cutoff_xid)))
+
+	if (tuple->t_infomask & HEAP_XMAX_INVALID)
+		;
+	else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+	{
+		/* We shouldn't get an invalid multi, but check to avoid bad mojo */
+		if (!MultiXactIdIsValid(xid))
+			freeze_xmax = true;
+		else if (MultiXactIdPrecedes(xid, cutoff_multi))
+		{
+			/*
+			 * An old multi.  If it was a locker only, it can be removed
+			 * without much further thought; but if it contained an update,
+			 * we need to examine its status.
+			 */
+			if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+				freeze_xmax = true;
+			else
+			{
+				TransactionId	update_xid;
+
+				/*
+				 * FIXME this calls TransactionIdDidAbort() internally,
+				 * falsifying the claim in the comment at the top ...
+				 */
+				update_xid = HeapTupleGetUpdateXid(tuple);
+
+				/*
+				 * XXX we rely here on HeapTupleGetUpdateXid returning
+				 * Invalid for aborted updates.
+				 */
+				if (!TransactionIdIsValid(update_xid))
+					freeze_xmax = true;
+				else
+				{
+					/*
+					 * A committed or in-progress update.  Careful: if we were
+					 * to simply set freeze_max to true here, we'd end up with
+					 * both the old and new tuples being visible.  Handle this
+					 * by storing the update Xid directly in Xmax and removing
+					 * the IS_MULTI bit.
+					 */
+					Assert(InRecovery || !TransactionIdIsInProgress(cutoff_multi));
+					tuple->t_infomask &= ~HEAP_XMAX_BITS;
+					HeapTupleHeaderSetXmax(tuple, update_xid);
+				}
+			}
+		}
+		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+		{
+			/* newer than the cutoff, so don't touch it */
+			;
+		}
+		else
+		{
+			MultiXactMember *members;
+			int			nmembers;
+			int			i;
+
+			/*
+			 * Need to check whether any members of the multi are below
+			 * cutoff_xid.  Otherwise, an Xid could survive truncation of its
+			 * CLOG segment, and we'd get lookup failures when this multi is
+			 * resolved in the future.
+			 */
+			nmembers = GetMultiXactIdMembers(xid, &members, true);
+			for (i = 0; i < nmembers; i++)
+			{
+				Assert(TransactionIdIsNormal(members[i].xid));
+
+				/*
+				 * Ignore pure lockers; those are protected against being read
+				 * in dangerous situations via OldestVisibleMXactId.
+				 */
+				if (!ISUPDATE_from_mxstatus(members[i].status))
+					continue;
+
+				/*
+				 * Found the update in this multi.  If it's newer than the
+				 * cutoff for Xids, we needn't do anything; but if it's older,
+				 * then we need to get rid of it.  Note that the update cannot
+				 * possibly be a committed transaction (because the tuple
+				 * should have been removed long ago in that case) or an
+				 * in-progress transaction (because it couldn't be older than
+				 * the cutoff_xid).  So it must be an aborted update, and thus
+				 * it can be removed.
+				 *
+				 * FIXME -- what if it's a committed update which has recent
+				 * new locker transaction?  The tuple wouldn't have been
+				 * removed in that case (HeapTupleSatisfiesVacuum returns
+				 * RECENTLY_DEAD).
+				 */
+				if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
+				{
+					/* only one updater is allowed */
+					Assert(!freeze_xmax);
+
+					/*
+					 * If below the cutoff_xid, it should have been pruned away
+					 * earlier.
+					 */
+					Assert(InRecovery || TransactionIdDidAbort(members[i].xid));
+
+					freeze_xmax = true;
+				}
+			}
+			if (members)
+				pfree(members);
+		}
+	}
+	else if (TransactionIdIsNormal(xid) &&
+			 TransactionIdPrecedes(xid, cutoff_xid))
+	{
+		freeze_xmax = true;
+	}
+
+	if (freeze_xmax)
 	{
 		HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
 
@@ -5645,24 +5763,53 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
 		TransactionIdPrecedes(xid, cutoff_xid))
 		return true;
 
-	if (!(tuple->t_infomask & HEAP_XMAX_INVALID))
+	if (tuple->t_infomask & HEAP_XMAX_INVALID)
+		;
+	else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+		MultiXactId multi;
+
+		multi = HeapTupleHeaderGetRawXmax(tuple);
+		if (MultiXactIdPrecedes(multi, cutoff_multi))
+			return true;
+		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
 		{
-			xid = HeapTupleHeaderGetRawXmax(tuple);
-			if (TransactionIdIsNormal(xid) &&
-				TransactionIdPrecedes(xid, cutoff_xid))
-				return true;
+			/* only-locker multis don't need internal examination */
+			;
 		}
 		else
 		{
-			MultiXactId multi;
+			MultiXactMember *members;
+			int			nmembers;
+			int			i;
 
-			multi = HeapTupleHeaderGetRawXmax(tuple);
-			if (MultiXactIdPrecedes(multi, cutoff_multi))
-				return true;
+			nmembers = GetMultiXactIdMembers(xid, &members, true);
+			for (i = 0; i < nmembers; i++)
+			{
+				TransactionId member = members[i].xid;
+				Assert(TransactionIdIsNormal(member));
+
+				/* we don't care about lockers */
+				if (ISUPDATE_from_mxstatus(members[i].status))
+					continue;
+
+				if (TransactionIdPrecedes(member, cutoff_xid))
+				{
+					pfree(members);
+					return true;
+				}
+			}
+			if (members)
+				pfree(members);
 		}
 	}
+	else
+	{
+		xid = HeapTupleHeaderGetRawXmax(tuple);
+		if (TransactionIdIsNormal(xid) &&
+			TransactionIdPrecedes(xid, cutoff_xid))
+			return true;
+	}
 
 	if (tuple->t_infomask & HEAP_MOVED)
 	{
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index c389bf3..518c22d 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -445,6 +445,10 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 
 	for (i = 0, j = 0; i < nmembers; i++)
 	{
+		/*
+		 * FIXME: is it possible that we copy over too old updater xids
+		 * here?
+		 */
 		if (TransactionIdIsInProgress(members[i].xid) ||
 			((members[i].status > MultiXactStatusForUpdate) &&
 			 TransactionIdDidCommit(members[i].xid)))
-- 
1.7.10.4

