diff --git a/contrib/pgrowlocks/Makefile b/contrib/pgrowlocks/Makefile
index f56389b..fe80423 100644
--- a/contrib/pgrowlocks/Makefile
+++ b/contrib/pgrowlocks/Makefile
@@ -4,7 +4,7 @@ MODULE_big	= pgrowlocks
 OBJS		= pgrowlocks.o
 
 EXTENSION = pgrowlocks
-DATA = pgrowlocks--1.0.sql pgrowlocks--unpackaged--1.0.sql
+DATA = pgrowlocks--1.1.sql pgrowlocks--1.0--1.1.sql pgrowlocks--unpackaged--1.0.sql
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql b/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql
new file mode 100644
index 0000000..70f20c7
--- /dev/null
+++ b/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql
@@ -0,0 +1,18 @@
+/* contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pgrowlocks" to load this file. \quit
+
+ALTER EXTENSION pgrowlocks DROP FUNCTION pgrowlocks(text);
+DROP FUNCTION pgrowlocks(text);
+CREATE FUNCTION pgrowlocks(IN relname text,
+    OUT locked_row TID,		-- row TID
+    OUT lock_type TEXT,		-- lock type
+    OUT locker XID,		-- locking XID
+    OUT multi bool,		-- multi XID?
+    OUT xids xid[],		-- multi XIDs
+    OUT modes text[],		-- multi XID statuses
+    OUT pids INTEGER[])		-- locker's process id
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgrowlocks'
+LANGUAGE C STRICT;
diff --git a/contrib/pgrowlocks/pgrowlocks--1.0.sql b/contrib/pgrowlocks/pgrowlocks--1.0.sql
deleted file mode 100644
index a909b74..0000000
--- a/contrib/pgrowlocks/pgrowlocks--1.0.sql
+++ /dev/null
@@ -1,15 +0,0 @@
-/* contrib/pgrowlocks/pgrowlocks--1.0.sql */
-
--- complain if script is sourced in psql, rather than via CREATE EXTENSION
-\echo Use "CREATE EXTENSION pgrowlocks" to load this file. \quit
-
-CREATE FUNCTION pgrowlocks(IN relname text,
-    OUT locked_row TID,		-- row TID
-    OUT lock_type TEXT,		-- lock type
-    OUT locker XID,		-- locking XID
-    OUT multi bool,		-- multi XID?
-    OUT xids xid[],		-- multi XIDs
-    OUT pids INTEGER[])		-- locker's process id
-RETURNS SETOF record
-AS 'MODULE_PATHNAME', 'pgrowlocks'
-LANGUAGE C STRICT;
diff --git a/contrib/pgrowlocks/pgrowlocks--1.1.sql b/contrib/pgrowlocks/pgrowlocks--1.1.sql
new file mode 100644
index 0000000..924d80f
--- /dev/null
+++ b/contrib/pgrowlocks/pgrowlocks--1.1.sql
@@ -0,0 +1,16 @@
+/* contrib/pgrowlocks/pgrowlocks--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pgrowlocks" to load this file. \quit
+
+CREATE FUNCTION pgrowlocks(IN relname text,
+    OUT locked_row TID,		-- row TID
+    OUT lock_type TEXT,		-- lock type
+    OUT locker XID,		-- locking XID
+    OUT multi bool,		-- multi XID?
+    OUT xids xid[],		-- multi XIDs
+    OUT modes text[],		-- multi XID statuses
+    OUT pids INTEGER[])		-- locker's process id
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgrowlocks'
+LANGUAGE C STRICT;
diff --git a/contrib/pgrowlocks/pgrowlocks.c b/contrib/pgrowlocks/pgrowlocks.c
index 20beed2..170547a 100644
--- a/contrib/pgrowlocks/pgrowlocks.c
+++ b/contrib/pgrowlocks/pgrowlocks.c
@@ -59,6 +59,14 @@ typedef struct
 	int			ncolumns;
 } MyData;
 
+#define		Atnum_tid		0
+#define		Atnum_type		1
+#define		Atnum_xmax		2
+#define		Atnum_ismulti	3
+#define		Atnum_xids		4
+#define		Atnum_modes		5
+#define		Atnum_pids		6
+
 Datum
 pgrowlocks(PG_FUNCTION_ARGS)
 {
@@ -124,72 +132,96 @@ pgrowlocks(PG_FUNCTION_ARGS)
 									 GetCurrentCommandId(false),
 									 scan->rs_cbuf) == HeapTupleBeingUpdated)
 		{
-
 			char	  **values;
-			int			i;
 
 			values = (char **) palloc(mydata->ncolumns * sizeof(char *));
 
-			i = 0;
-			values[i++] = (char *) DirectFunctionCall1(tidout, PointerGetDatum(&tuple->t_self));
+			values[Atnum_tid] = (char *) DirectFunctionCall1(tidout, PointerGetDatum(&tuple->t_self));
 
-			if (tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK)
-				values[i++] = pstrdup("Shared");
-			else
-				values[i++] = pstrdup("Exclusive");
-			values[i] = palloc(NCHARS * sizeof(char));
-			snprintf(values[i++], NCHARS, "%d", HeapTupleHeaderGetXmax(tuple->t_data));
+			values[Atnum_type] = palloc(36);
+			values[Atnum_type][0] = '\0';
+			if (tuple->t_data->t_infomask & HEAP_XMAX_KEYSHR_LOCK)
+				strcat(values[Atnum_type], "KeyShare ");
+			if (tuple->t_data->t_infomask & HEAP_XMAX_EXCL_LOCK)
+				strcat(values[Atnum_type], "Exclusive ");
+			if (tuple->t_data->t_infomask & HEAP_XMAX_IS_NOT_UPDATE)
+				strcat(values[Atnum_type], "IsNotUpdate ");
+
+			values[Atnum_xmax] = palloc(NCHARS * sizeof(char));
+			snprintf(values[Atnum_xmax], NCHARS, "%d", HeapTupleHeaderGetXmax(tuple->t_data));
 			if (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)
 			{
-				TransactionId *xids;
-				int			nxids;
+				MultiXactMember *members;
+				int			nmembers;
 				int			j;
-				int			isValidXid = 0;		/* any valid xid ever exists? */
+				bool		isValidXid = false;		/* any valid xid ever exists? */
 
-				values[i++] = pstrdup("true");
-				nxids = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple->t_data), &xids);
-				if (nxids == -1)
-				{
+				values[Atnum_ismulti] = pstrdup("true");
+
+				nmembers = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple->t_data), &members);
+				if (nmembers == -1)
 					elog(ERROR, "GetMultiXactIdMembers returns error");
-				}
 
-				values[i] = palloc(NCHARS * nxids);
-				values[i + 1] = palloc(NCHARS * nxids);
-				strcpy(values[i], "{");
-				strcpy(values[i + 1], "{");
+				values[Atnum_xids] = palloc(NCHARS * nmembers);
+				values[Atnum_modes] = palloc(NCHARS * nmembers);
+				values[Atnum_pids] = palloc(NCHARS * nmembers);
 
-				for (j = 0; j < nxids; j++)
+				strcpy(values[Atnum_xids], "{");
+				strcpy(values[Atnum_modes], "{");
+				strcpy(values[Atnum_pids], "{");
+
+				for (j = 0; j < nmembers; j++)
 				{
 					char		buf[NCHARS];
 
-					if (TransactionIdIsInProgress(xids[j]))
+					if (isValidXid)
 					{
-						if (isValidXid)
-						{
-							strcat(values[i], ",");
-							strcat(values[i + 1], ",");
-						}
-						snprintf(buf, NCHARS, "%d", xids[j]);
-						strcat(values[i], buf);
-						snprintf(buf, NCHARS, "%d", BackendXidGetPid(xids[j]));
-						strcat(values[i + 1], buf);
-
-						isValidXid = 1;
+						strcat(values[Atnum_xids], ",");
+						strcat(values[Atnum_modes], ",");
+						strcat(values[Atnum_pids], ",");
+					}
+					snprintf(buf, NCHARS, "%d", members[j].xid);
+					strcat(values[Atnum_xids], buf);
+					switch (members[j].status)
+					{
+						case MultiXactStatusKeyUpdate:
+							snprintf(buf, NCHARS, "keyupd");
+							break;
+						case MultiXactStatusUpdate:
+							snprintf(buf, NCHARS, "upd");
+							break;
+						case MultiXactStatusForUpdate:
+							snprintf(buf, NCHARS, "forupd");
+							break;
+						case MultiXactStatusForShare:
+							snprintf(buf, NCHARS, "shr");
+							break;
+						case MultiXactStatusForKeyShare:
+							snprintf(buf, NCHARS, "keyshr");
+							break;
 					}
+					strcat(values[Atnum_modes], buf);
+					snprintf(buf, NCHARS, "%d", BackendXidGetPid(members[j].xid));
+					strcat(values[Atnum_pids], buf);
+
+					isValidXid = true;
 				}
 
-				strcat(values[i], "}");
-				strcat(values[i + 1], "}");
-				i++;
+				strcat(values[Atnum_xids], "}");
+				strcat(values[Atnum_modes], "}");
+				strcat(values[Atnum_pids], "}");
 			}
 			else
 			{
-				values[i++] = pstrdup("false");
-				values[i] = palloc(NCHARS * sizeof(char));
-				snprintf(values[i++], NCHARS, "{%d}", HeapTupleHeaderGetXmax(tuple->t_data));
+				values[Atnum_ismulti] = pstrdup("false");
+
+				values[Atnum_xids] = palloc(NCHARS * sizeof(char));
+				snprintf(values[Atnum_xids], NCHARS, "{%d}", HeapTupleHeaderGetXmax(tuple->t_data));
+
+				values[Atnum_modes] = NULL;
 
-				values[i] = palloc(NCHARS * sizeof(char));
-				snprintf(values[i++], NCHARS, "{%d}", BackendXidGetPid(HeapTupleHeaderGetXmax(tuple->t_data)));
+				values[Atnum_pids] = palloc(NCHARS * sizeof(char));
+				snprintf(values[Atnum_pids], NCHARS, "{%d}", BackendXidGetPid(HeapTupleHeaderGetXmax(tuple->t_data)));
 			}
 
 			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
@@ -200,10 +232,10 @@ pgrowlocks(PG_FUNCTION_ARGS)
 			/* make the tuple into a datum */
 			result = HeapTupleGetDatum(tuple);
 
-			/* Clean up */
-			for (i = 0; i < mydata->ncolumns; i++)
-				pfree(values[i]);
-			pfree(values);
+			/*
+			 * no need to pfree what we allocated; it's on a short-lived memory
+			 * context anyway
+			 */
 
 			SRF_RETURN_NEXT(funcctx, result);
 		}
diff --git a/contrib/pgrowlocks/pgrowlocks.control b/contrib/pgrowlocks/pgrowlocks.control
index a6ba164..dfa587d 100644
--- a/contrib/pgrowlocks/pgrowlocks.control
+++ b/contrib/pgrowlocks/pgrowlocks.control
@@ -1,5 +1,5 @@
 # pgrowlocks extension
 comment = 'show row-level locking information'
-default_version = '1.0'
+default_version = '1.1'
 module_pathname = '$libdir/pgrowlocks'
 relocatable = true
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b2d1901..42d14a2 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -74,6 +74,7 @@
 bool		synchronize_seqscans = true;
 
 
+static LOCKMODE get_lockmode_for_tuplelock(LockTupleMode mode);
 static HeapScanDesc heap_beginscan_internal(Relation relation,
 						Snapshot snapshot,
 						int nkeys, ScanKey key,
@@ -84,6 +85,7 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
 				bool all_visible_cleared, bool new_all_visible_cleared);
 static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
 					   HeapTuple oldtup, HeapTuple newtup);
+static uint16 GetMultiXactIdHintBits(MultiXactId multi);
 
 
 /* ----------------------------------------------------------------
@@ -1620,7 +1622,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				   ItemPointerGetBlockNumber(tid));
 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
 			at_chain_start = false;
-			prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
+			prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
 		}
 		else
 			break;				/* end of chain */
@@ -1743,7 +1745,7 @@ heap_get_latest_tid(Relation relation,
 		 * tuple.  Check for XMIN match.
 		 */
 		if (TransactionIdIsValid(priorXmax) &&
-		  !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
+			!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
 		{
 			UnlockReleaseBuffer(buffer);
 			break;
@@ -1761,7 +1763,8 @@ heap_get_latest_tid(Relation relation,
 		/*
 		 * If there's a valid t_ctid link, follow it, else we're done.
 		 */
-		if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) ||
+		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+			HeapTupleHeaderIsLocked(tp.t_data) ||
 			ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
 		{
 			UnlockReleaseBuffer(buffer);
@@ -1769,7 +1772,7 @@ heap_get_latest_tid(Relation relation,
 		}
 
 		ctid = tp.t_data->t_ctid;
-		priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
+		priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
 		UnlockReleaseBuffer(buffer);
 	}							/* end of loop */
 }
@@ -2085,10 +2088,11 @@ simple_heap_insert(Relation relation, HeapTuple tup)
  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
  * (the last only possible if wait == false).
  *
- * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
+ * In the failure cases, the routine returns the tuple's t_ctid and the
+ * updating Xid (resolving a possible MultiXact, if necessary).
  * If t_ctid is the same as tid, the tuple was deleted; if different, the
  * tuple was updated, and t_ctid is the location of the replacement tuple.
- * (t_xmax is needed to verify that the replacement tuple matches.)
+ * (xmax is needed to verify that the replacement tuple matches.)
  */
 HTSU_Result
 heap_delete(Relation relation, ItemPointer tid,
@@ -2174,20 +2178,22 @@ l1:
 		 */
 		if (!have_tuple_lock)
 		{
-			LockTuple(relation, &(tp.t_self), ExclusiveLock);
+			LockTuple(relation, &(tp.t_self),
+					  get_lockmode_for_tuplelock(LockTupleKeyUpdate));
 			have_tuple_lock = true;
 		}
 
 		/*
 		 * Sleep until concurrent transaction ends.  Note that we don't care
-		 * if the locker has an exclusive or shared lock, because we need
-		 * exclusive.
+		 * which lock mode the locker has, because we need the strongest one.
 		 */
 
 		if (infomask & HEAP_XMAX_IS_MULTI)
 		{
+			int		remain;
+
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait);
+			MultiXactIdWait((MultiXactId) xwait, MultiXactStatusKeyUpdate, &remain);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2234,8 +2240,8 @@ l1:
 		 * We may overwrite if previous xmax aborted, or if it committed but
 		 * only locked the tuple without updating it.
 		 */
-		if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
-									 HEAP_IS_LOCKED))
+		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+			HeapTupleHeaderIsLocked(tp.t_data))
 			result = HeapTupleMayBeUpdated;
 		else
 			result = HeapTupleUpdated;
@@ -2255,10 +2261,11 @@ l1:
 			   result == HeapTupleBeingUpdated);
 		Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
 		*ctid = tp.t_data->t_ctid;
-		*update_xmax = HeapTupleHeaderGetXmax(tp.t_data);
+		*update_xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
 		UnlockReleaseBuffer(buffer);
 		if (have_tuple_lock)
-			UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+			UnlockTuple(relation, &(tp.t_self),
+						get_lockmode_for_tuplelock(LockTupleKeyUpdate));
 		if (vmbuffer != InvalidBuffer)
 			ReleaseBuffer(vmbuffer);
 		return result;
@@ -2296,7 +2303,7 @@ l1:
 	tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
 							   HEAP_XMAX_INVALID |
 							   HEAP_XMAX_IS_MULTI |
-							   HEAP_IS_LOCKED |
+							   HEAP_LOCK_BITS |
 							   HEAP_MOVED);
 	HeapTupleHeaderClearHotUpdated(tp.t_data);
 	HeapTupleHeaderSetXmax(tp.t_data, xid);
@@ -2368,7 +2375,8 @@ l1:
 	 * Release the lmgr tuple lock, if we had it.
 	 */
 	if (have_tuple_lock)
-		UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+		UnlockTuple(relation, &(tp.t_self),
+					get_lockmode_for_tuplelock(LockTupleKeyUpdate));
 
 	pgstat_count_heap_delete(relation);
 
@@ -2442,10 +2450,11 @@ simple_heap_delete(Relation relation, ItemPointer tid)
  * update was done.  However, any TOAST changes in the new tuple's
  * data are not reflected into *newtup.
  *
- * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
+ * In the failure cases, the routine returns the tuple's t_ctid and the
+ * updating Xid (resolving a possible MultiXact, if necessary).
  * If t_ctid is the same as otid, the tuple was deleted; if different, the
  * tuple was updated, and t_ctid is the location of the replacement tuple.
- * (t_xmax is needed to verify that the replacement tuple matches.)
+ * (xmax is needed to verify that the replacement tuple matches.)
  */
 HTSU_Result
 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
@@ -2455,11 +2464,14 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	HTSU_Result result;
 	TransactionId xid = GetCurrentTransactionId();
 	Bitmapset  *hot_attrs;
+	Bitmapset  *key_attrs;
 	ItemId		lp;
 	HeapTupleData oldtup;
 	HeapTuple	heaptup;
 	Page		page;
 	BlockNumber	block;
+	LockTupleMode tuplock;
+	MultiXactStatus mxact_status;
 	Buffer		buffer,
 				newbuf,
 				vmbuffer = InvalidBuffer,
@@ -2471,8 +2483,14 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	bool		have_tuple_lock = false;
 	bool		iscombo;
 	bool		use_hot_update = false;
+	bool		key_intact;
 	bool		all_visible_cleared = false;
 	bool		all_visible_cleared_new = false;
+	bool			keep_xmax_multi = false;
+	TransactionId	keep_xmax = InvalidTransactionId;
+	TransactionId	keep_xmax_old = InvalidTransactionId;
+	uint16		keep_xmax_infomask = 0;
+	uint16		keep_xmax_old_infomask = 0;
 
 	Assert(ItemPointerIsValid(otid));
 
@@ -2488,7 +2506,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	 * Note that we get a copy here, so we need not worry about relcache flush
 	 * happening midway through.
 	 */
-	hot_attrs = RelationGetIndexAttrBitmap(relation);
+	hot_attrs = RelationGetIndexAttrBitmap(relation, false);
+	key_attrs = RelationGetIndexAttrBitmap(relation, true);
 
 	block = ItemPointerGetBlockNumber(otid);
 	buffer = ReadBuffer(relation, block);
@@ -2513,6 +2532,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	oldtup.t_self = *otid;
 
 	/*
+	 * If we're not updating any "key" column, we can grab a milder lock type.
+	 * This allows for more concurrency when we are running simultaneously with
+	 * foreign key checks.
+	 */
+	if (HeapSatisfiesHOTUpdate(relation, key_attrs, &oldtup, newtup))
+	{
+		tuplock = LockTupleUpdate;
+		mxact_status = MultiXactStatusUpdate;
+		key_intact = true;
+	}
+	else
+	{
+		tuplock = LockTupleKeyUpdate;
+		mxact_status = MultiXactStatusKeyUpdate;
+		key_intact = false;
+	}
+
+	/*
 	 * Note: beyond this point, use oldtup not otid to refer to old tuple.
 	 * otid may very well point at newtup->t_self, which we will overwrite
 	 * with the new tuple's location, so there's great risk of confusion if we
@@ -2522,6 +2559,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 l2:
 	result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer);
 
+	/* see below about the "no wait" case */
+	Assert(result != HeapTupleBeingUpdated || wait);
+
 	if (result == HeapTupleInvisible)
 	{
 		UnlockReleaseBuffer(buffer);
@@ -2529,8 +2569,21 @@ l2:
 	}
 	else if (result == HeapTupleBeingUpdated && wait)
 	{
-		TransactionId xwait;
+		TransactionId	xwait;
 		uint16		infomask;
+		bool		none_remain = false;
+
+		/*
+		 * XXX note that we don't consider the "no wait" case here.  This
+		 * isn't a problem currently because no caller uses that case, but it
+		 * should be fixed if such a caller is introduced.  It wasn't a problem
+		 * previously because this code would always wait, but now that some
+		 * tuple locks do not conflict with one of the lock modes we use, it is
+		 * possible that this case is interesting to handle specially.
+		 *
+		 * This may cause failures with third-party code that calls heap_update
+		 * directly.
+		 */
 
 		/* must copy state data before unlocking buffer */
 		xwait = HeapTupleHeaderGetXmax(oldtup.t_data);
@@ -2549,20 +2602,26 @@ l2:
 		 */
 		if (!have_tuple_lock)
 		{
-			LockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+			LockTuple(relation, &(oldtup.t_self),
+					  get_lockmode_for_tuplelock(tuplock));
 			have_tuple_lock = true;
 		}
 
 		/*
-		 * Sleep until concurrent transaction ends.  Note that we don't care
-		 * if the locker has an exclusive or shared lock, because we need
-		 * exclusive.
+		 * Now sleep on the locker.  Note that if there are only key-share
+		 * lockers and we're not updating the key columns, we will be awaken
+		 * before it is gone, so we may need to mark the new tuple with a
+		 * new MultiXactId including the original xmax and ourselves.
+		 *
+		 * XXX this comment needs to be more comprehensive
 		 */
-
 		if (infomask & HEAP_XMAX_IS_MULTI)
 		{
+			TransactionId	update_xact;
+			int				remain;
+
 			/* wait for multixact */
-			MultiXactIdWait((MultiXactId) xwait);
+			MultiXactIdWait((MultiXactId) xwait, mxact_status, &remain);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
@@ -2576,41 +2635,98 @@ l2:
 				goto l2;
 
 			/*
-			 * You might think the multixact is necessarily done here, but not
-			 * so: it could have surviving members, namely our own xact or
-			 * other subxacts of this backend.	It is legal for us to update
-			 * the tuple in either case, however (the latter case is
-			 * essentially a situation of upgrading our former shared lock to
-			 * exclusive).	We don't bother changing the on-disk hint bits
-			 * since we are about to overwrite the xmax altogether.
+			 * Note that the multixact may not be done by now.  It could have
+			 * surviving members; our own xact or other subxacts of this
+			 * backend, and also any other concurrent transaction that locked
+			 * the tuple with KeyShare if we only got TupleLockUpdate.  If this
+			 * is the case, we have to be careful to mark the updated tuple
+			 * with the surviving members in Xmax.
+			 *
+			 * Note that there could have been another update in the MultiXact.
+			 * In that case, we need to check whether it committed or aborted.
+			 * If it aborted we are safe to update it again; otherwise there is
+			 * an update conflict that must be handled below.
+			 *
+			 * In the LockTupleKeyUpdate case, we still need to preserve the
+			 * surviving members: those would include the tuple locks we had
+			 * before this one, which are important to keep in case this
+			 * subxact aborts.
 			 */
+			update_xact = InvalidTransactionId;
+			if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_NOT_UPDATE))
+				update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
+
+			/* there was no UPDATE in the MultiXact; or it aborted. */
+			if (update_xact == InvalidTransactionId ||
+				TransactionIdDidAbort(update_xact))
+			{
+				/*
+				 * if the multixact still has live members, we need to preserve
+				 * it by creating a new multixact.  If all members are gone, we
+				 * can simply update the tuple by setting ourselves in Xmax.
+				 */
+				if (remain > 0)
+				{
+					keep_xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
+					keep_xmax_multi =
+						(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) != 0;
+				}
+				else
+				{
+					/*
+					 * We could set the HEAP_XMAX_INVALID bit here instead of
+					 * using a separate boolean flag.  However, since we're going
+					 * to set up a new xmax below, this would waste time
+					 * setting up the buffer's dirty bit.
+					 */
+					none_remain = false;
+				}
+			}
 		}
 		else
 		{
-			/* wait for regular transaction to end */
-			XactLockTableWait(xwait);
-			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
 			/*
-			 * xwait is done, but if xwait had just locked the tuple then some
-			 * other xact could update this tuple before we get to this point.
-			 * Check for xmax change, and start over if so.
+			 * If it's just a key-share locker, and we're not changing the
+			 * key columns, we don't need to wait for it to wait; but we
+			 * need to preserve it as locker.
 			 */
-			if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-				!TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
-									 xwait))
-				goto l2;
+			if ((oldtup.t_data->t_infomask & HEAP_XMAX_KEYSHR_LOCK) &&
+				key_intact)
+			{
+				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+				keep_xmax = xwait;
+				keep_xmax_multi = false;
+			}
+			else
+			{
+				/* wait for regular transaction to end */
+				XactLockTableWait(xwait);
+				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
-			/* Otherwise check if it committed or aborted */
-			UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
+				/*
+				 * xwait is done, but if xwait had just locked the tuple then some
+				 * other xact could update this tuple before we get to this point.
+				 * Check for xmax change, and start over if so.
+				 */
+				if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+					!TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
+										 xwait))
+					goto l2;
+
+				/* Otherwise check if it committed or aborted */
+				UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
+			}
 		}
 
 		/*
 		 * We may overwrite if previous xmax aborted, or if it committed but
-		 * only locked the tuple without updating it.
+		 * only locked the tuple without updating it, or if we are going to
+		 * keep it around in Xmax.
 		 */
-		if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID |
-										 HEAP_IS_LOCKED))
+		if (TransactionIdIsValid(keep_xmax) ||
+			none_remain ||
+			(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+			HeapTupleHeaderIsLocked(oldtup.t_data))
 			result = HeapTupleMayBeUpdated;
 		else
 			result = HeapTupleUpdated;
@@ -2630,13 +2746,15 @@ l2:
 			   result == HeapTupleBeingUpdated);
 		Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
 		*ctid = oldtup.t_data->t_ctid;
-		*update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
+		*update_xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
 		UnlockReleaseBuffer(buffer);
 		if (have_tuple_lock)
-			UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+			UnlockTuple(relation, &(oldtup.t_self),
+						get_lockmode_for_tuplelock(tuplock));
 		if (vmbuffer != InvalidBuffer)
 			ReleaseBuffer(vmbuffer);
 		bms_free(hot_attrs);
+		bms_free(key_attrs);
 		return result;
 	}
 
@@ -2645,7 +2763,7 @@ l2:
 	 * visible while we were busy locking the buffer, or during some subsequent
 	 * window during which we had it unlocked, we'll have to unlock and
 	 * re-lock, to avoid holding the buffer lock across an I/O.  That's a bit
-	 * unfortunate, esepecially since we'll now have to recheck whether the
+	 * unfortunate, especially since we'll now have to recheck whether the
 	 * tuple has been locked or updated under us, but hopefully it won't
 	 * happen very often.
 	 */
@@ -2678,13 +2796,54 @@ l2:
 		Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
 	}
 
+	/*
+	 * If the tuple we're updating is locked, we need to preserve this in the
+	 * new tuple's Xmax as well as in the old tuple.  Prepare the new xmax
+	 * value for these uses.
+	 *
+	 * Note there cannot be an xmax to save if we're changing key columns; in
+	 * this case, the wait above should have only returned when the locking
+	 * transactions finished.
+	 */
+	if (TransactionIdIsValid(keep_xmax))
+	{
+		if (keep_xmax_multi)
+		{
+			keep_xmax_old = MultiXactIdExpand(keep_xmax,
+											  xid, MultiXactStatusUpdate);
+			keep_xmax_infomask = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_IS_MULTI;
+		}
+		else
+		{
+			/* not a multi? must be a KEY SHARE locker */
+			keep_xmax_old = MultiXactIdCreate(keep_xmax, MultiXactStatusForKeyShare,
+											  xid, MultiXactStatusUpdate);
+			keep_xmax_infomask = HEAP_XMAX_KEYSHR_LOCK;
+		}
+		keep_xmax_old_infomask = HEAP_XMAX_IS_MULTI | HEAP_XMAX_KEYSHR_LOCK;
+		/* FIXME -- need more infomask bits? */
+	}
+
+	/*
+	 * Prepare the new tuple with the appropriate initial values of Xmin and
+	 * Xmax, as well as initial infomask bits.
+	 */
 	newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
 	newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
-	newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
+	newtup->t_data->t_infomask |= HEAP_UPDATED;
 	HeapTupleHeaderSetXmin(newtup->t_data, xid);
 	HeapTupleHeaderSetCmin(newtup->t_data, cid);
-	HeapTupleHeaderSetXmax(newtup->t_data, 0);	/* for cleanliness */
 	newtup->t_tableOid = RelationGetRelid(relation);
+	if (TransactionIdIsValid(keep_xmax))
+	{
+		newtup->t_data->t_infomask |= keep_xmax_infomask;
+		HeapTupleHeaderSetXmax(newtup->t_data, keep_xmax);
+	}
+	else
+	{
+		newtup->t_data->t_infomask |= HEAP_XMAX_INVALID;
+		HeapTupleHeaderSetXmax(newtup->t_data, 0);	/* for cleanliness */
+	}
 
 	/*
 	 * Replace cid with a combo cid if necessary.  Note that we already put
@@ -2725,11 +2884,20 @@ l2:
 		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
 									   HEAP_XMAX_INVALID |
 									   HEAP_XMAX_IS_MULTI |
-									   HEAP_IS_LOCKED |
+									   HEAP_LOCK_BITS |
 									   HEAP_MOVED);
+		oldtup.t_data->t_infomask2 &= ~HEAP_UPDATE_KEY_INTACT;
 		HeapTupleClearHotUpdated(&oldtup);
 		/* ... and store info about transaction updating this tuple */
-		HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+		if (TransactionIdIsValid(keep_xmax_old))
+		{
+			HeapTupleHeaderSetXmax(oldtup.t_data, keep_xmax_old);
+			oldtup.t_data->t_infomask |= keep_xmax_old_infomask;
+		}
+		else
+			HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+		if (key_intact)
+			oldtup.t_data->t_infomask2 |= HEAP_UPDATE_KEY_INTACT;
 		HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
 		/* temporarily make it look not-updated */
 		oldtup.t_data->t_ctid = oldtup.t_self;
@@ -2883,10 +3051,19 @@ l2:
 		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
 									   HEAP_XMAX_INVALID |
 									   HEAP_XMAX_IS_MULTI |
-									   HEAP_IS_LOCKED |
+									   HEAP_LOCK_BITS |
 									   HEAP_MOVED);
+		oldtup.t_data->t_infomask2 &= ~HEAP_UPDATE_KEY_INTACT;
 		/* ... and store info about transaction updating this tuple */
-		HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+		if (TransactionIdIsValid(keep_xmax_old))
+		{
+			HeapTupleHeaderSetXmax(oldtup.t_data, keep_xmax_old);
+			oldtup.t_data->t_infomask |= keep_xmax_old_infomask;
+		}
+		else
+			HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+		if (key_intact)
+			oldtup.t_data->t_infomask2 |= HEAP_UPDATE_KEY_INTACT;
 		HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
 	}
 
@@ -2959,7 +3136,8 @@ l2:
 	 * Release the lmgr tuple lock, if we had it.
 	 */
 	if (have_tuple_lock)
-		UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+		UnlockTuple(relation, &(oldtup.t_self),
+					get_lockmode_for_tuplelock(tuplock));
 
 	pgstat_count_heap_update(relation, use_hot_update);
 
@@ -2974,6 +3152,7 @@ l2:
 	}
 
 	bms_free(hot_attrs);
+	bms_free(key_attrs);
 
 	return HeapTupleMayBeUpdated;
 }
@@ -3129,6 +3308,54 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
 }
 
 /*
+ * Return the appropriate LOCKMODE to acquire by LockTuple corresponding to the
+ * given lock tuple mode.
+ *
+ * These heavyweight lock modes have been chosen because they exactly mimic
+ * the lock conflict behavior that our tuple lock modes need to have.
+ */
+static LOCKMODE
+get_lockmode_for_tuplelock(LockTupleMode mode)
+{
+	switch (mode)
+	{
+		case LockTupleKeyShare:
+			return AccessShareLock;
+		case LockTupleShare:
+			return RowShareLock;
+		case LockTupleUpdate:
+			return ExclusiveLock;
+		case LockTupleKeyUpdate:
+			return AccessExclusiveLock;
+		default:
+			elog(ERROR, "invalid lock tuple mode %d", mode);
+			return 0;	/* keep compiler quiet */
+	}
+}
+
+/*
+ * Return the MultiXactStatus corresponding to the given tuple lock mode.
+ */
+static MultiXactStatus
+get_mxact_status_for_tuplelock(LockTupleMode mode)
+{
+	switch (mode)
+	{
+		case LockTupleKeyShare:
+			return MultiXactStatusForKeyShare;
+		case LockTupleShare:
+			return MultiXactStatusForShare;
+		case LockTupleUpdate:
+			return MultiXactStatusForUpdate;
+		case LockTupleKeyUpdate:
+			return MultiXactStatusUpdate;
+		default:
+			elog(ERROR, "invalid lock tuple mode %d", mode);
+			return 0;	/* keep compiler quiet */
+	}
+}
+
+/*
  *	heap_lock_tuple - lock a tuple in shared or exclusive mode
  *
  * Note that this acquires a buffer pin, which the caller must release.
@@ -3152,10 +3379,11 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
  *	HeapTupleSelfUpdated: lock failed because tuple updated by self
  *	HeapTupleUpdated: lock failed because tuple updated by other xact
  *
- * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
+ * In the failure cases, the routine returns the tuple's t_ctid and the
+ * updating Xid (resolving a possible MultiXact, if necessary).
  * If t_ctid is the same as t_self, the tuple was deleted; if different, the
  * tuple was updated, and t_ctid is the location of the replacement tuple.
- * (t_xmax is needed to verify that the replacement tuple matches.)
+ * (xmax is needed to verify that the replacement tuple matches.)
  *
  *
  * NOTES: because the shared-memory lock table is of finite size, but users
@@ -3201,13 +3429,13 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer,
 	Page		page;
 	TransactionId xid;
 	TransactionId xmax;
+	TransactionId keep_xmax = InvalidTransactionId;
+	bool		keep_xmax_multi = false;
+	bool		none_remains = false;
 	uint16		old_infomask;
 	uint16		new_infomask;
-	LOCKMODE	tuple_lock_type;
 	bool		have_tuple_lock = false;
 
-	tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock;
-
 	*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
 	LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
@@ -3220,6 +3448,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer,
 	tuple->t_tableOid = RelationGetRelid(relation);
 
 l3:
+	/* shouldn't get back here if we already set keep_xmax */
+	Assert(keep_xmax == InvalidTransactionId);
+
 	result = HeapTupleSatisfiesUpdate(tuple->t_data, cid, *buffer);
 
 	if (result == HeapTupleInvisible)
@@ -3231,30 +3462,70 @@ l3:
 	{
 		TransactionId xwait;
 		uint16		infomask;
+		uint16		infomask2;
+		bool		require_sleep;
 
 		/* must copy state data before unlocking buffer */
 		xwait = HeapTupleHeaderGetXmax(tuple->t_data);
 		infomask = tuple->t_data->t_infomask;
+		infomask2 = tuple->t_data->t_infomask2;
 
 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
 
 		/*
-		 * If we wish to acquire share lock, and the tuple is already
-		 * share-locked by a multixact that includes any subtransaction of the
-		 * current top transaction, then we effectively hold the desired lock
-		 * already.  We *must* succeed without trying to take the tuple lock,
-		 * else we will deadlock against anyone waiting to acquire exclusive
-		 * lock.  We don't need to make any state changes in this case.
+		 * If we wish to acquire share or key lock, and the tuple is already
+		 * key or share locked by a multixact that includes any subtransaction
+		 * of the current top transaction, then we effectively hold the desired
+		 * lock already (except if we own key share lock and now desire share
+		 * lock).  We *must* succeed without trying to take the tuple lock,
+		 * else we will deadlock against anyone wanting to acquire a stronger
+		 * lock.
+		 *
+		 * FIXME -- we don't do the below currently, but I think we should:
+		 *
+		 * We update the Xmax with a new MultiXactId to include the new lock
+		 * mode in this case.
+		 *
+		 * Note that since we want to alter the Xmax, we need to re-acquire the
+		 * buffer lock.  The xmax could have changed in the meantime, so we
+		 * recheck it in that case, but we keep the buffer lock while doing it
+		 * to prevent starvation.  The second time around we know we must be
+		 * part of the MultiXactId in any case, which is why we don't need to
+		 * go back to recheck HeapTupleSatisfiesUpdate.  Also, after we
+		 * re-acquire lock, the MultiXact is likely to (but not necessarily) be
+		 * the same that we see here, so it should be in multixact's cache and
+		 * thus quick to obtain.
 		 */
-		if (mode == LockTupleShared &&
-			(infomask & HEAP_XMAX_IS_MULTI) &&
-			MultiXactIdIsCurrent((MultiXactId) xwait))
+		if ((infomask & HEAP_XMAX_IS_MULTI) &&
+			((mode == LockTupleShare) || (mode == LockTupleKeyShare)))
 		{
-			Assert(infomask & HEAP_XMAX_SHARED_LOCK);
-			/* Probably can't hold tuple lock here, but may as well check */
-			if (have_tuple_lock)
-				UnlockTuple(relation, tid, tuple_lock_type);
-			return HeapTupleMayBeUpdated;
+			int		i;
+			int		nmembers;
+			MultiXactMember *members;
+
+			nmembers = GetMultiXactIdMembers(xwait, &members);
+
+			for (i = 0; i < nmembers; i++)
+			{
+				if (TransactionIdIsCurrentTransactionId(members[i].xid))
+				{
+					if ((mode == LockTupleKeyShare) ||
+						((mode == LockTupleShare) &&
+						 (members[i].status >= MultiXactStatusForShare)))
+					{
+						if (have_tuple_lock)
+							UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode));
+						/*
+						 * FIXME -- here we should lock buffer, update xmax,
+						 * release buffer
+						 */
+						pfree(members);
+						return HeapTupleMayBeUpdated;
+					}
+				}
+			}
+
+			pfree(members);
 		}
 
 		/*
@@ -3270,105 +3541,240 @@ l3:
 		{
 			if (nowait)
 			{
-				if (!ConditionalLockTuple(relation, tid, tuple_lock_type))
+				if (!ConditionalLockTuple(relation, tid, get_lockmode_for_tuplelock(mode)))
 					ereport(ERROR,
 							(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
 					errmsg("could not obtain lock on row in relation \"%s\"",
 						   RelationGetRelationName(relation))));
 			}
 			else
-				LockTuple(relation, tid, tuple_lock_type);
+				LockTuple(relation, tid, get_lockmode_for_tuplelock(mode));
 			have_tuple_lock = true;
 		}
 
-		if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK))
+		/*
+		 * If we're requesting KeyShare, and there's no update present, we
+		 * don't need to wait for locking transaction(s) to finish.  Even if
+		 * there is an update, we can still continue if the key hasn't been
+		 * modified.
+		 */
+		require_sleep = true;
+		if ((mode == LockTupleKeyShare) &&
+			(HeapTupleHeaderInfomaskIsLocked(infomask) ||
+			 infomask2 & HEAP_UPDATE_KEY_INTACT))
 		{
-			/*
-			 * Acquiring sharelock when there's at least one sharelocker
-			 * already.  We need not wait for him/them to complete.
-			 */
 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
-			 * Make sure it's still a shared lock, else start over.  (It's OK
-			 * if the ownership of the shared lock has changed, though.)
+			 * Make sure it's still an appropriate lock, else start over.
 			 */
-			if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK))
+			if (!(HeapTupleHeaderIsLocked(tuple->t_data) ||
+				  (tuple->t_data->t_infomask2 & HEAP_UPDATE_KEY_INTACT)))
 				goto l3;
+			require_sleep = false;
+			/* acquire fresh values -- XXX do we need to restart if xmax changed? */
+			keep_xmax = HeapTupleHeaderGetXmax(tuple->t_data);
+			keep_xmax_multi = (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) != 0;
 		}
-		else if (infomask & HEAP_XMAX_IS_MULTI)
-		{
-			/* wait for multixact to end */
-			if (nowait)
-			{
-				if (!ConditionalMultiXactIdWait((MultiXactId) xwait))
-					ereport(ERROR,
-							(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
-					errmsg("could not obtain lock on row in relation \"%s\"",
-						   RelationGetRelationName(relation))));
-			}
-			else
-				MultiXactIdWait((MultiXactId) xwait);
 
+		/*
+		 * If we're requesting Share, we need to ensure there's no update
+		 * and no exclusive lock present.
+		 */
+		if (mode == LockTupleShare &&
+			(infomask & (HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_IS_NOT_UPDATE)) &&
+			!(infomask & HEAP_XMAX_EXCL_LOCK))
+		{
 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
 			/*
-			 * If xwait had just locked the tuple then some other xact could
-			 * update this tuple before we get to this point. Check for xmax
-			 * change, and start over if so.
+			 * make sure it's still an appropriate lock, else start over.
 			 */
-			if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-				!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
-									 xwait))
+			if (!(tuple->t_data->t_infomask &
+				  (HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_IS_NOT_UPDATE)) ||
+				(tuple->t_data->t_infomask & HEAP_XMAX_EXCL_LOCK))
 				goto l3;
+			require_sleep = false;
+			/* acquire fresh values */
+			keep_xmax = HeapTupleHeaderGetXmax(tuple->t_data);
+			keep_xmax_multi = (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) != 0;
+		}
 
-			/*
-			 * You might think the multixact is necessarily done here, but not
-			 * so: it could have surviving members, namely our own xact or
-			 * other subxacts of this backend.	It is legal for us to lock the
-			 * tuple in either case, however.  We don't bother changing the
-			 * on-disk hint bits since we are about to overwrite the xmax
-			 * altogether.
-			 */
+
+		/*
+		 * If our lock is Update, we might also be able to skip the sleep; for
+		 * this to be true, we need to ensure that there's no other lock type
+		 * than KeyShare.
+		 */
+		if (mode == LockTupleUpdate)
+		{
+			if (infomask & HEAP_XMAX_IS_MULTI)
+			{
+				int		nmembers;
+				MultiXactMember *members;
+
+				/*
+				 * This needs to be done the slow way: there might be
+				 * MultiXactStatusForShare locks hiding in there, and there's
+				 * no way to tell from just the hint bits.
+				 */
+				nmembers = GetMultiXactIdMembers(xwait, &members);
+				if (nmembers == 0)
+				{
+					require_sleep = false;
+					/*
+					 * No need to keep the previous xmax here. Unlikely to
+					 * happen anyway.
+					 */
+				}
+				else
+				{
+					int		i;
+					bool	allowed = true;
+
+					for (i = 0; i < nmembers; i++)
+					{
+						if (members[i].status != MultiXactStatusForKeyShare)
+						{
+							allowed = false;
+							break;
+						}
+					}
+					if (allowed)
+					{
+						/*
+						 * if the xmax changed under us in the meantime, start
+						 * over.
+						 */
+						LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+						if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+							!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
+												 xwait))
+							goto l3;
+						/* otherwise, we're good */
+						require_sleep = false;
+						keep_xmax = xwait;
+						keep_xmax_multi = true;
+					}
+				}
+			}
+			else if (infomask & HEAP_XMAX_KEYSHR_LOCK)
+			{
+				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+				/* if the xmax changed in the meantime, start over */
+				if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+					!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
+										 xwait))
+					goto l3;
+				/* otherwise, we're good */
+				require_sleep = false;
+				keep_xmax = xwait;
+				keep_xmax_multi = false;
+			}
 		}
-		else
+
+		/*
+		 * By here, we either require to wait for the locking transaction or
+		 * multixact, or have already acquired the buffer exclusive lock.
+		 */
+
+		if (require_sleep)
 		{
-			/* wait for regular transaction to end */
-			if (nowait)
+			if (infomask & HEAP_XMAX_IS_MULTI)
 			{
-				if (!ConditionalXactLockTableWait(xwait))
-					ereport(ERROR,
-							(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
-					errmsg("could not obtain lock on row in relation \"%s\"",
-						   RelationGetRelationName(relation))));
+				MultiXactStatus status = get_mxact_status_for_tuplelock(mode);
+				int		remain;
+
+				/* We only ever lock tuples, never update them */
+				if (status >= MultiXactStatusUpdate)
+					elog(ERROR, "invalid lock mode in heap_tuple_lock");
+
+				/* wait for multixact to end */
+				if (nowait)
+				{
+					if (!ConditionalMultiXactIdWait((MultiXactId) xwait, status, &remain))
+						ereport(ERROR,
+								(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+								 errmsg("could not obtain lock on row in relation \"%s\"",
+										RelationGetRelationName(relation))));
+				}
+				else
+					MultiXactIdWait((MultiXactId) xwait, status, &remain);
+
+				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+				/*
+				 * If xwait had just locked the tuple then some other xact could
+				 * update this tuple before we get to this point. Check for xmax
+				 * change, and start over if so.
+				 */
+				if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+					!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
+										 xwait))
+					goto l3;
+
+				/*
+				 * Of course, the multixact might not be done here: if we're requesting
+				 * a light lock mode, other transactions with light locks could still
+				 * be alive, as well as locks owned by our own xact or other
+				 * subxacts of this backend.  We need to preserve the surviving
+				 * MultiXact members.  Note that it isn't absolutely necessary
+				 * in the latter case, but doing so is simpler.
+				 */
+				if (remain > 0)
+				{
+					keep_xmax = xwait;
+					keep_xmax_multi = true;
+				}
+				else
+					none_remains = true;
 			}
 			else
-				XactLockTableWait(xwait);
+			{
+				/* wait for regular transaction to end */
+				if (nowait)
+				{
+					if (!ConditionalXactLockTableWait(xwait))
+						ereport(ERROR,
+								(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+								 errmsg("could not obtain lock on row in relation \"%s\"",
+										RelationGetRelationName(relation))));
+				}
+				else
+					XactLockTableWait(xwait);
 
-			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
-			/*
-			 * xwait is done, but if xwait had just locked the tuple then some
-			 * other xact could update this tuple before we get to this point.
-			 * Check for xmax change, and start over if so.
-			 */
-			if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-				!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
-									 xwait))
-				goto l3;
+				/*
+				 * xwait is done, but if xwait had just locked the tuple then
+				 * some other xact could update this tuple before we get to
+				 * this point.  Check for xmax change, and start over if so.
+				 */
+				if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+					!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
+										 xwait))
+					goto l3;
 
-			/* Otherwise check if it committed or aborted */
-			UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
+				/*
+				 * Otherwise check if it committed or aborted.  Note we cannot
+				 * be here if the tuple was only locked by somebody who didn't
+				 * conflict with us; that should have been handled above.  So
+				 * that transaction must necessarily be gone by now.
+				 */
+				UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
+			}
 		}
 
 		/*
 		 * We may lock if previous xmax aborted, or if it committed but only
-		 * locked the tuple without updating it.  The case where we didn't
-		 * wait because we are joining an existing shared lock is correctly
-		 * handled, too.
+		 * locked the tuple without updating it; or if we didn't have to wait
+		 * at all for whatever reason.
 		 */
-		if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
-										 HEAP_IS_LOCKED))
+		if (!require_sleep ||
+			(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
+			HeapTupleHeaderIsLocked(tuple->t_data) ||
+			none_remains)
 			result = HeapTupleMayBeUpdated;
 		else
 			result = HeapTupleUpdated;
@@ -3379,10 +3785,10 @@ l3:
 		Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
 		Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
 		*ctid = tuple->t_data->t_ctid;
-		*update_xmax = HeapTupleHeaderGetXmax(tuple->t_data);
+		*update_xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
 		if (have_tuple_lock)
-			UnlockTuple(relation, tid, tuple_lock_type);
+			UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode));
 		return result;
 	}
 
@@ -3394,8 +3800,10 @@ l3:
 	 * for cases where it is a plain TransactionId.
 	 *
 	 * Note in particular that this covers the case where we already hold
-	 * exclusive lock on the tuple and the caller only wants shared lock. It
-	 * would certainly not do to give up the exclusive lock.
+	 * exclusive lock on the tuple and the caller only wants key share or share
+	 * lock. It would certainly not do to give up the exclusive lock.  Note
+	 * there's no explicit test for a share lock only; this was already covered
+	 * above, because it's only representable by a MultiXactId.
 	 */
 	xmax = HeapTupleHeaderGetXmax(tuple->t_data);
 	old_infomask = tuple->t_data->t_infomask;
@@ -3403,15 +3811,15 @@ l3:
 	if (!(old_infomask & (HEAP_XMAX_INVALID |
 						  HEAP_XMAX_COMMITTED |
 						  HEAP_XMAX_IS_MULTI)) &&
-		(mode == LockTupleShared ?
-		 (old_infomask & HEAP_IS_LOCKED) :
-		 (old_infomask & HEAP_XMAX_EXCL_LOCK)) &&
-		TransactionIdIsCurrentTransactionId(xmax))
+		(mode == LockTupleKeyShare ?
+		 (old_infomask & (HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK)) :
+		 (old_infomask & HEAP_XMAX_EXCL_LOCK) &&
+		TransactionIdIsCurrentTransactionId(xmax)))
 	{
 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
 		/* Probably can't hold tuple lock here, but may as well check */
 		if (have_tuple_lock)
-			UnlockTuple(relation, tid, tuple_lock_type);
+			UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode));
 		return HeapTupleMayBeUpdated;
 	}
 
@@ -3425,22 +3833,69 @@ l3:
 	new_infomask = old_infomask & ~(HEAP_XMAX_COMMITTED |
 									HEAP_XMAX_INVALID |
 									HEAP_XMAX_IS_MULTI |
-									HEAP_IS_LOCKED |
+									HEAP_LOCK_BITS |
 									HEAP_MOVED);
 
-	if (mode == LockTupleShared)
+	/*
+	 * if we have keep_xmax, this is easy to compute -- just create a new mxact
+	 * including our new xid plus whatever there was on Xmax previously.
+	 */
+	if (TransactionIdIsValid(keep_xmax))
 	{
-		/*
-		 * If this is the first acquisition of a shared lock in the current
-		 * transaction, set my per-backend OldestMemberMXactId setting. We can
-		 * be certain that the transaction will never become a member of any
-		 * older MultiXactIds than that.  (We have to do this even if we end
-		 * up just using our own TransactionId below, since some other backend
-		 * could incorporate our XID into a MultiXact immediately afterwards.)
-		 */
-		MultiXactIdSetOldestMember();
+		if (keep_xmax_multi)
+		{
+			/*
+			 * MultiXactIdExpand takes care to remove members that are no
+			 * longer current.
+			 */
+			xid = MultiXactIdExpand((MultiXactId) keep_xmax, xid,
+									get_mxact_status_for_tuplelock(mode));
+			new_infomask |= GetMultiXactIdHintBits(xid);
+		}
+		else if (TransactionIdIsInProgress(keep_xmax))
+		{
+			MultiXactStatus		existing_lock_mode;
 
-		new_infomask |= HEAP_XMAX_SHARED_LOCK;
+			if (old_infomask & HEAP_XMAX_EXCL_LOCK)
+				existing_lock_mode = MultiXactStatusForUpdate;
+			else if (old_infomask & HEAP_XMAX_KEYSHR_LOCK)
+				existing_lock_mode = MultiXactStatusForKeyShare;
+			else
+				/* must be a shared lock */
+				existing_lock_mode = MultiXactStatusForShare;
+
+			xid = MultiXactIdCreate(keep_xmax, existing_lock_mode,
+									xid, get_mxact_status_for_tuplelock(mode));
+			new_infomask |= GetMultiXactIdHintBits(xid);
+		}
+		else
+		{
+			/*
+			 * Not multi, not in progress.  Use only our own Xid.
+			 */
+			switch (mode)
+			{
+				case LockTupleKeyShare:
+					new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
+					break;
+				case LockTupleShare:
+					/* need a multixact here in any case */
+					xid = MultiXactIdCreateSingleton(xid, MultiXactStatusForShare);
+					new_infomask |= GetMultiXactIdHintBits(xid);
+					break;
+				case LockTupleUpdate:
+					new_infomask |= HEAP_XMAX_EXCL_LOCK;
+					break;
+				default:
+					elog(ERROR, "invalid lock mode");
+			}
+		}
+	}
+	else
+	{
+		MultiXactStatus		new_mxact_status;
+
+		new_mxact_status = get_mxact_status_for_tuplelock(mode);
 
 		/*
 		 * Check to see if we need a MultiXactId because there are multiple
@@ -3465,8 +3920,9 @@ l3:
 				 * If the XMAX is already a MultiXactId, then we need to
 				 * expand it to include our own TransactionId.
 				 */
-				xid = MultiXactIdExpand((MultiXactId) xmax, xid);
-				new_infomask |= HEAP_XMAX_IS_MULTI;
+				xid = MultiXactIdExpand((MultiXactId) xmax, xid, new_mxact_status);
+				new_infomask |= GetMultiXactIdHintBits(xid);
+				/* FIXME -- we need to add bits to the infomask here! */
 			}
 			else if (TransactionIdIsInProgress(xmax))
 			{
@@ -3475,8 +3931,30 @@ l3:
 				 * create a new MultiXactId that includes both the old locker
 				 * and our own TransactionId.
 				 */
-				xid = MultiXactIdCreate(xmax, xid);
-				new_infomask |= HEAP_XMAX_IS_MULTI;
+				MultiXactStatus status;
+
+				if (old_infomask & HEAP_XMAX_EXCL_LOCK)
+					status = MultiXactStatusForUpdate;
+				else if (old_infomask & HEAP_XMAX_KEYSHR_LOCK)
+					status = MultiXactStatusForKeyShare;
+				else
+				{
+					status = 0;		/* keep compiler quiet */
+					elog(ERROR, "no lock bit found on old infomask %u", old_infomask);
+				}
+
+				xid = MultiXactIdCreate(xmax, status, xid, new_mxact_status);
+				new_infomask |= GetMultiXactIdHintBits(xid);
+				/* FIXME -- we need to add bits to the infomask here! */
+			}
+			else if (mode == LockTupleShare)
+			{
+				/*
+				 * There's no hint bit for FOR SHARE, so we need a multixact
+				 * here no matter what.
+				 */
+				xid = MultiXactIdCreateSingleton(xid, new_mxact_status);
+				new_infomask |= GetMultiXactIdHintBits(xid);
 			}
 			else
 			{
@@ -3486,6 +3964,22 @@ l3:
 				 * TransactionIdIsInProgress() got to run.	Treat it like
 				 * there's no locker in the tuple.
 				 */
+				switch (mode)
+				{
+					case LockTupleKeyShare:
+						new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
+						break;
+					case LockTupleShare:
+						/* need a multixact here in any case */
+						xid = MultiXactIdCreateSingleton(xid, MultiXactStatusForShare);
+						new_infomask |= GetMultiXactIdHintBits(xid);
+						break;
+					case LockTupleUpdate:
+						new_infomask |= HEAP_XMAX_EXCL_LOCK;
+						break;
+					default:
+						elog(ERROR, "invalid lock mode");
+				}
 			}
 		}
 		else
@@ -3494,13 +3988,24 @@ l3:
 			 * There was no previous locker, so just insert our own
 			 * TransactionId.
 			 */
+			switch (mode)
+			{
+				case LockTupleKeyShare:
+					new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
+					break;
+				case LockTupleShare:
+					/* need a multixact here in any case */
+					xid = MultiXactIdCreateSingleton(xid, MultiXactStatusForShare);
+					new_infomask |= GetMultiXactIdHintBits(xid);
+					break;
+				case LockTupleUpdate:
+					new_infomask |= HEAP_XMAX_EXCL_LOCK;
+					break;
+				default:
+					elog(ERROR, "invalid lock mode");
+			}
 		}
 	}
-	else
-	{
-		/* We want an exclusive lock on the tuple */
-		new_infomask |= HEAP_XMAX_EXCL_LOCK;
-	}
 
 	START_CRIT_SECTION();
 
@@ -3508,12 +4013,14 @@ l3:
 	 * Store transaction information of xact locking the tuple.
 	 *
 	 * Note: Cmax is meaningless in this context, so don't set it; this avoids
-	 * possibly generating a useless combo CID.
+	 * possibly generating a useless combo CID.  FIXME -- it's not useless
+	 * if a multixact contains an update.
 	 */
 	tuple->t_data->t_infomask = new_infomask;
 	HeapTupleHeaderClearHotUpdated(tuple->t_data);
 	HeapTupleHeaderSetXmax(tuple->t_data, xid);
 	/* Make sure there is no forward chain link in t_ctid */
+	/* FIXME -- this needs some thought */
 	tuple->t_data->t_ctid = *tid;
 
 	MarkBufferDirty(*buffer);
@@ -3539,8 +4046,17 @@ l3:
 		xlrec.target.node = relation->rd_node;
 		xlrec.target.tid = tuple->t_self;
 		xlrec.locking_xid = xid;
-		xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0);
-		xlrec.shared_lock = (mode == LockTupleShared);
+		xlrec.infobits_set =
+			(((new_infomask & HEAP_XMAX_IS_MULTI) != 0) ?
+			 XLHL_XMAX_IS_MULTI : 0) |
+			(((new_infomask & HEAP_XMAX_IS_NOT_UPDATE) != 0) ?
+			 XLHL_XMAX_IS_NOT_UPDATE : 0) |
+			(((new_infomask & HEAP_XMAX_EXCL_LOCK) != 0) ?
+			 XLHL_XMAX_EXCL_LOCK : 0) |
+			(((new_infomask & HEAP_XMAX_KEYSHR_LOCK) != 0) ?
+			 XLHL_XMAX_KEYSHR_LOCK : 0) |
+			(((tuple->t_data->t_infomask2 & HEAP_UPDATE_KEY_INTACT) != 0) ?
+			 XLHL_XMAX_KEYSHR_LOCK : 0);
 		rdata[0].data = (char *) &xlrec;
 		rdata[0].len = SizeOfHeapLock;
 		rdata[0].buffer = InvalidBuffer;
@@ -3572,7 +4088,7 @@ l3:
 	 * release the lmgr tuple lock, if we had it.
 	 */
 	if (have_tuple_lock)
-		UnlockTuple(relation, tid, tuple_lock_type);
+		UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode));
 
 	return HeapTupleMayBeUpdated;
 }
@@ -3789,6 +4305,8 @@ recheck_xmax:
 		 * extremely low-probability scenario with minimal downside even if
 		 * it does happen, so for now we don't do the extra bookkeeping that
 		 * would be needed to clean out MultiXactIds.
+		 *
+		 * FIXME -- today is that day.  Figure this out.
 		 *----------
 		 */
 	}
@@ -3841,6 +4359,105 @@ recheck_xvac:
 	return changed;
 }
 
+/*
+ * For a given MultiXactId, return the hint bits that should be set in the
+ * tuple's infomask.
+ *
+ * Normally this should be called for a multixact that was just created, and
+ * so is on our local cache, so the GetMembers call is fast.
+ */
+static uint16
+GetMultiXactIdHintBits(MultiXactId multi)
+{
+	int		nmembers;
+	MultiXactMember	*members;
+	int		i;
+	uint16	bits = HEAP_XMAX_IS_MULTI;
+	bool	has_update = false;
+
+	nmembers = GetMultiXactIdMembers(multi, &members);
+
+	for (i = 0; i < nmembers; i++)
+	{
+		Assert(members[i].status != MultiXactStatusKeyUpdate);
+		switch (members[i].status)
+		{
+			case MultiXactStatusForKeyShare:
+				bits |= HEAP_XMAX_KEYSHR_LOCK;
+				break;
+			case MultiXactStatusForShare:
+				break;
+			case MultiXactStatusForUpdate:
+				Assert(!has_update);
+				bits |= HEAP_XMAX_EXCL_LOCK;
+				break;
+			case MultiXactStatusUpdate:
+				Assert(!(bits & HEAP_XMAX_EXCL_LOCK));
+				has_update = true;
+				break;
+			case MultiXactStatusKeyUpdate:
+				elog(ERROR, "invalid multixact value");
+				break;
+		}
+	}
+	if (!has_update)
+		bits |= HEAP_XMAX_IS_NOT_UPDATE;
+
+	return bits;
+}
+
+/*
+ * HeapTupleGetUpdateXid
+ *
+ * Given a tuple with a multixact Xmax, and which does not have the
+ * HEAP_XMAX_IS_NOT_UPDATE bit set, obtain and return the Xid of the updating
+ * transaction.
+ *
+ * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
+ * checking the hint bits.
+ */
+TransactionId
+HeapTupleGetUpdateXid(HeapTupleHeader tuple)
+{
+	TransactionId	update_xact = InvalidTransactionId;
+	MultiXactMember	*members;
+	int				nmembers;
+
+	Assert(!(tuple->t_infomask & HEAP_XMAX_IS_NOT_UPDATE));
+	Assert(tuple->t_infomask & HEAP_XMAX_IS_MULTI);
+
+	nmembers = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple), &members);
+
+	if (nmembers > 0)
+	{
+		int		i;
+
+		for (i = 0; i < nmembers; i++)
+		{
+			/* KEY SHARE lockers are okay -- ignore it */
+			if (members[i].status == MultiXactStatusForKeyShare)
+				continue;
+			/*
+			 * SHARE lockers are okay, though since they normally conflict with
+			 * UPDATE, they are not expected unless they come from the same
+			 * xact as the update.
+			 */
+			if (members[i].status == MultiXactStatusForShare ||
+				members[i].status == MultiXactStatusForUpdate)
+				continue;
+			/* there should be at most one updater */
+			Assert(update_xact == InvalidTransactionId);
+			Assert(members[i].status == MultiXactStatusUpdate);
+			update_xact = members[i].xid;
+#ifndef USE_ASSERT_CHECKING
+			break;
+#endif
+		}
+	}
+
+	return update_xact;
+}
+
 
 /* ----------------
  *		heap_markpos	- mark scan position
@@ -3919,6 +4536,7 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
 									   TransactionId *latestRemovedXid)
 {
 	TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+	/* FIXME -- change this? */
 	TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
 	TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
 
@@ -4606,7 +5224,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
 						  HEAP_XMAX_INVALID |
 						  HEAP_XMAX_IS_MULTI |
-						  HEAP_IS_LOCKED |
+						  HEAP_LOCK_BITS |
 						  HEAP_MOVED);
 	HeapTupleHeaderClearHotUpdated(htup);
 	HeapTupleHeaderSetXmax(htup, record->xl_xid);
@@ -4813,7 +5431,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
 						  HEAP_XMAX_INVALID |
 						  HEAP_XMAX_IS_MULTI |
-						  HEAP_IS_LOCKED |
+						  HEAP_LOCK_BITS |
 						  HEAP_MOVED);
 	if (hot_update)
 		HeapTupleHeaderSetHotUpdated(htup);
@@ -4991,14 +5609,18 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
 	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
 						  HEAP_XMAX_INVALID |
 						  HEAP_XMAX_IS_MULTI |
-						  HEAP_IS_LOCKED |
+						  HEAP_LOCK_BITS |
 						  HEAP_MOVED);
-	if (xlrec->xid_is_mxact)
+	if (xlrec->infobits_set & XLHL_XMAX_IS_MULTI)
 		htup->t_infomask |= HEAP_XMAX_IS_MULTI;
-	if (xlrec->shared_lock)
-		htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
-	else
+	if (xlrec->infobits_set & XLHL_XMAX_IS_NOT_UPDATE)
+		htup->t_infomask |= HEAP_XMAX_IS_NOT_UPDATE;
+	if (xlrec->infobits_set & XLHL_XMAX_EXCL_LOCK)
 		htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
+	if (xlrec->infobits_set & XLHL_XMAX_KEYSHR_LOCK)
+		htup->t_infomask |= HEAP_XMAX_KEYSHR_LOCK;
+	if (xlrec->infobits_set & XLHL_UPDATE_KEY_INTACT)
+		htup->t_infomask2 |= HEAP_UPDATE_KEY_INTACT;
 	HeapTupleHeaderClearHotUpdated(htup);
 	HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
 	HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
@@ -5202,16 +5824,19 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
 	{
 		xl_heap_lock *xlrec = (xl_heap_lock *) rec;
 
-		if (xlrec->shared_lock)
-			appendStringInfo(buf, "shared_lock: ");
-		else
-			appendStringInfo(buf, "exclusive_lock: ");
-		if (xlrec->xid_is_mxact)
-			appendStringInfo(buf, "mxid ");
-		else
-			appendStringInfo(buf, "xid ");
-		appendStringInfo(buf, "%u ", xlrec->locking_xid);
+		appendStringInfo(buf, "lock %u: ", xlrec->locking_xid);
 		out_target(buf, &(xlrec->target));
+		appendStringInfoChar(buf, ' ');
+		if (xlrec->infobits_set & XLHL_XMAX_IS_MULTI)
+			appendStringInfo(buf, "XMAX_IS_MULTI ");
+		if (xlrec->infobits_set & XLHL_XMAX_IS_NOT_UPDATE)
+			appendStringInfo(buf, "XMAX_IS_NOT_UPDATE ");
+		if (xlrec->infobits_set & XLHL_XMAX_EXCL_LOCK)
+			appendStringInfo(buf, "XMAX_EXCL_LOCK ");
+		if (xlrec->infobits_set & XLHL_XMAX_KEYSHR_LOCK)
+			appendStringInfo(buf, "XMAX_KEYSHR_LOCK ");
+		if (xlrec->infobits_set & XLHL_UPDATE_KEY_INTACT)
+			appendStringInfo(buf, "UPDATE_KEY_INTACT ");
 	}
 	else if (info == XLOG_HEAP_INPLACE)
 	{
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index e561409..3469ebe 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -352,15 +352,15 @@ rewrite_heap_tuple(RewriteState state,
 	/*
 	 * If the tuple has been updated, check the old-to-new mapping hash table.
 	 */
-	if (!(old_tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
-										   HEAP_IS_LOCKED)) &&
+	if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
+		  HeapTupleHeaderIsLocked(old_tuple->t_data)) &&
 		!(ItemPointerEquals(&(old_tuple->t_self),
 							&(old_tuple->t_data->t_ctid))))
 	{
 		OldToNewMapping mapping;
 
 		memset(&hashkey, 0, sizeof(hashkey));
-		hashkey.xmin = HeapTupleHeaderGetXmax(old_tuple->t_data);
+		hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data);
 		hashkey.tid = old_tuple->t_data->t_ctid;
 
 		mapping = (OldToNewMapping)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index c1c8ba5..a4dc146 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -4,7 +4,7 @@
  *		PostgreSQL multi-transaction-log manager
  *
  * The pg_multixact manager is a pg_clog-like manager that stores an array
- * of TransactionIds for each MultiXactId.	It is a fundamental part of the
+ * of MultiXactMember for each MultiXactId.	It is a fundamental part of the
  * shared-row-lock implementation.	A share-locked tuple stores a
  * MultiXactId in its Xmax, and a transaction that needs to wait for the
  * tuple to be unlocked can sleep on the potentially-several TransactionIds
@@ -48,6 +48,8 @@
  */
 #include "postgres.h"
 
+#include <unistd.h>
+
 #include "access/multixact.h"
 #include "access/slru.h"
 #include "access/transam.h"
@@ -60,6 +62,7 @@
 #include "storage/procarray.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -75,19 +78,58 @@
  * (see MultiXact{Offset,Member}PagePrecedes).
  */
 
-/* We need four bytes per offset and also four bytes per member */
+/* We need four bytes per offset */
 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
-#define MULTIXACT_MEMBERS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
 
 #define MultiXactIdToOffsetPage(xid) \
 	((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
 #define MultiXactIdToOffsetEntry(xid) \
 	((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
 
-#define MXOffsetToMemberPage(xid) \
-	((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
-#define MXOffsetToMemberEntry(xid) \
-	((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
+/*
+ * The situation for members is a bit more complex: we need to store two
+ * additional flag bits for each TransactionId.  To do this without getting
+ * into alignment issues, we store four bytes of flags (so 16 bit pairs), and
+ * then the corresponding 16 Xids.  Each such 17-word (68-byte) set we call a
+ * "group", and are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep
+ * 120 groups per page.  This wastes 32 bytes per page, but that's OK --
+ * simplicity (and performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offset, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need two bits per xact, so four xacts fit in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT			2
+#define MXACT_MEMBER_FLAGS_PER_BYTE			4
+#define MXACT_MEMBER_XACT_BITMASK	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP		4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP	\
+	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE	\
+	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/* page in which a member is to be found */
+#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
+
+/* Location (byte offset within page) of flag word for a given member */
+#define MXOffsetToFlagsOffset(xid) \
+	((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
+	  (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
+	 (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
+#define MXOffsetToFlagsBitShift(xid) \
+	(((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
+	 MXACT_MEMBER_BITS_PER_XACT)
+
+/* Location (byte offset within page) of TransactionId of given member */
+#define MXOffsetToMemberOffset(xid) \
+	(MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \
+	 ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
 
 
 /*
@@ -114,60 +156,51 @@ typedef struct MultiXactStateData
 	/* next-to-be-assigned offset */
 	MultiXactOffset nextOffset;
 
-	/* the Offset SLRU area was last truncated at this MultiXactId */
-	MultiXactId lastTruncationPoint;
+	/* truncation info for the oldest segment in the offset SLRU area */
+	TransactionId	truncateXid;
+	uint32			truncateXidEpoch;
 
 	/*
-	 * Per-backend data starts here.  We have two arrays stored in the area
-	 * immediately following the MultiXactStateData struct. Each is indexed by
-	 * BackendId.
-	 *
-	 * In both arrays, there's a slot for all normal backends (1..MaxBackends)
-	 * followed by a slot for max_prepared_xacts prepared transactions. Valid
-	 * BackendIds start from 1; element zero of each array is never used.
-	 *
-	 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
-	 * transaction(s) could possibly be a member of, or InvalidMultiXactId
-	 * when the backend has no live transaction that could possibly be a
-	 * member of a MultiXact.  Each backend sets its entry to the current
-	 * nextMXact counter just before first acquiring a shared lock in a given
-	 * transaction, and clears it at transaction end. (This works because only
-	 * during or after acquiring a shared lock could an XID possibly become a
-	 * member of a MultiXact, and that MultiXact would have to be created
-	 * during or after the lock acquisition.)
-	 *
-	 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
-	 * current transaction(s) think is potentially live, or InvalidMultiXactId
-	 * when not in a transaction or not in a transaction that's paid any
-	 * attention to MultiXacts yet.  This is computed when first needed in a
-	 * given transaction, and cleared at transaction end.  We can compute it
-	 * as the minimum of the valid OldestMemberMXactId[] entries at the time
-	 * we compute it (using nextMXact if none are valid).  Each backend is
-	 * required not to attempt to access any SLRU data for MultiXactIds older
-	 * than its own OldestVisibleMXactId[] setting; this is necessary because
-	 * the checkpointer could truncate away such data at any instant.
-	 *
-	 * The checkpointer can compute the safe truncation point as the oldest
-	 * valid value among all the OldestMemberMXactId[] and
-	 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
-	 * Clearly, it is not possible for any later-computed OldestVisibleMXactId
-	 * value to be older than this, and so there is no risk of truncating data
-	 * that is still needed.
+	 * oldest multixact that is still on disk.  Anything older than this should
+	 * not be consulted.
 	 */
-	MultiXactId perBackendXactIds[1];	/* VARIABLE LENGTH ARRAY */
+	MultiXactId		oldestMultiXactId;
 } MultiXactStateData;
 
+/* Pointer to the state data in shared memory */
+static MultiXactStateData *MultiXactState;
+
+#define firstPageOf(segment) ((segment) * SLRU_PAGES_PER_SEGMENT)
+
 /*
- * Last element of OldestMemberMXactID and OldestVisibleMXactId arrays.
- * Valid elements are (1..MaxOldestSlot); element 0 is never used.
+ * structs to pass data around in our private SlruScanDirectory callback for
+ * the offset truncation support code.
  */
-#define MaxOldestSlot	(MaxBackends + max_prepared_xacts)
+typedef struct SegmentInfo
+{
+	int				segno;			/* segment number */
+	TransactionId	truncateXid;	/* after this Xid is frozen, the previous
+									 * segment can be removed */
+	uint32			truncateXidEpoch;	/* epoch of above Xid */
+	MultiXactOffset	firstOffset;	/* first valid offset in segment */
+} SegmentInfo;
 
-/* Pointers to the state data in shared memory */
-static MultiXactStateData *MultiXactState;
-static MultiXactId *OldestMemberMXactId;
-static MultiXactId *OldestVisibleMXactId;
+typedef struct TruncateCbData
+{
+	int				remaining_alloc;
+	int				remaining_used;
+	SegmentInfo	   *remaining;
+} TruncateCbData;
 
+/*
+ * MultiXactZeroOffsetPage xlog record
+ */
+typedef struct MxactZeroOffPg
+{
+	int				pageno;
+	TransactionId	truncateXid;
+	TransactionId	truncateXidEpoch;
+} MxactZeroOffPg;
 
 /*
  * Definitions for the backend-local MultiXactId cache.
@@ -180,7 +213,8 @@ static MultiXactId *OldestVisibleMXactId;
  * so they will be uninteresting by the time our next transaction starts.
  * (XXX not clear that this is correct --- other members of the MultiXact
  * could hang around longer than we did.  However, it's not clear what a
- * better policy for flushing old cache entries would be.)
+ * better policy for flushing old cache entries would be.)  FIXME actually
+ * this is plain wrong now that multixact's may contain update Xids.
  *
  * We allocate the cache entries in a memory context that is deleted at
  * transaction end, so we don't need to do retail freeing of entries.
@@ -189,44 +223,72 @@ typedef struct mXactCacheEnt
 {
 	struct mXactCacheEnt *next;
 	MultiXactId multi;
-	int			nxids;
-	TransactionId xids[1];		/* VARIABLE LENGTH ARRAY */
+	int			nmembers;
+	MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
 } mXactCacheEnt;
 
 static mXactCacheEnt *MXactCache = NULL;
 static MemoryContext MXactContext = NULL;
 
+/* status conflict table */
+static const bool MultiXactConflicts[5][5] =
+{
+	{	/* ForKeyShare */
+		false, false, false, false, true
+	},
+	{	/* ForShare */
+		false, false, true, true, true
+	},
+	{	/* ForUpdate */
+		false, true, true, true, true
+	},
+	{	/* Update */
+		false, true, true, true, true
+	},
+	{	/* KeyUpdate */
+		true, true, true, true, true
+	}
+};
+
+#define MultiXactStatusConflict(status1, status2) \
+	MultiXactConflicts[status1][status2]
 
+
+#define MULTIXACT_DEBUG
 #ifdef MULTIXACT_DEBUG
 #define debug_elog2(a,b) elog(a,b)
 #define debug_elog3(a,b,c) elog(a,b,c)
 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
+#define debug_elog7(a,b,c,d,e,f,g) elog(a,b,c,d,e,f,g)
 #else
 #define debug_elog2(a,b)
 #define debug_elog3(a,b,c)
 #define debug_elog4(a,b,c,d)
 #define debug_elog5(a,b,c,d,e)
+#define debug_elog7(a,b,c,d,e,f,g)
 #endif
 
 /* internal MultiXactId management */
-static void MultiXactIdSetOldestVisible(void);
-static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids);
+static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members);
 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
-				   int nxids, TransactionId *xids);
-static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset);
+				   int nmembers, MultiXactMember *members);
+static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
+static MultiXactId HandleMxactOffsetCornerCases(MultiXactId multi);
 
 /* MultiXact cache management */
-static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids);
-static int	mXactCacheGetById(MultiXactId multi, TransactionId **xids);
-static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids);
+static int mxactMemberComparator(const void *arg1, const void *arg2);
+static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
+static int	mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
+static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members);
 
 #ifdef MULTIXACT_DEBUG
-static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids);
+static char *mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members);
 #endif
 
 /* management of SLRU infrastructure */
-static int	ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
+static int	ZeroMultiXactOffsetPage(int pageno, bool writeXlog,
+						TransactionId truncateXid, uint32 truncateXidEpoch);
 static int	ZeroMultiXactMemberPage(int pageno, bool writeXlog);
 static bool MultiXactOffsetPagePrecedes(int page1, int page2);
 static bool MultiXactMemberPagePrecedes(int page1, int page2);
@@ -235,29 +297,59 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
 						MultiXactOffset offset2);
 static void ExtendMultiXactOffset(MultiXactId multi);
 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
-static void TruncateMultiXact(void);
-static void WriteMZeroPageXlogRec(int pageno, uint8 info);
+static void fillSegmentInfoData(SlruCtl ctl, SegmentInfo *segment);
+static int	compareTruncateXidEpoch(const void *a, const void *b);
+static void WriteMZeroOffsetPageXlogRec(int pageno, TransactionId truncateXid,
+							uint32 truncateXidEpoch);
+static void WriteMZeroMemberPageXlogRec(int pageno);
 
 
 /*
+ * MultiXactIdCreateSingleton
+ * 		Construct a MultiXactId representing a single transaction.
+ *
+ * NB - we don't worry about our local MultiXactId cache here, because that
+ * is handled by the lower-level routines.
+ */
+MultiXactId
+MultiXactIdCreateSingleton(TransactionId xid, MultiXactStatus status)
+{
+	MultiXactId	newMulti;
+	MultiXactMember	member[1];
+
+	AssertArg(TransactionIdIsValid(xid));
+
+	member[0].xid = xid;
+	member[0].status = status;
+
+	newMulti = CreateMultiXactId(1, member);
+
+	debug_elog4(DEBUG2, "Create: returning %u for %u",
+			   newMulti, xid);
+
+	return newMulti;
+}
+
+/*
  * MultiXactIdCreate
  *		Construct a MultiXactId representing two TransactionIds.
  *
- * The two XIDs must be different.
+ * The two XIDs must be different, or be requesting different lock modes.
  *
  * NB - we don't worry about our local MultiXactId cache here, because that
  * is handled by the lower-level routines.
  */
 MultiXactId
-MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
+MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
+				  TransactionId xid2, MultiXactStatus status2)
 {
 	MultiXactId newMulti;
-	TransactionId xids[2];
+	MultiXactMember members[2];
 
 	AssertArg(TransactionIdIsValid(xid1));
 	AssertArg(TransactionIdIsValid(xid2));
 
-	Assert(!TransactionIdEquals(xid1, xid2));
+	Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
 
 	/*
 	 * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
@@ -265,11 +357,14 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
 	 * caller just did a check on xid1, so it'd be wasted effort.
 	 */
 
-	xids[0] = xid1;
-	xids[1] = xid2;
+	members[0].xid = xid1;
+	members[0].status = status1;
+	members[1].xid = xid2;
+	members[1].status = status2;
 
-	newMulti = CreateMultiXactId(2, xids);
+	newMulti = CreateMultiXactId(2, members);
 
+	/* XXX -- need better debug? */
 	debug_elog5(DEBUG2, "Create: returning %u for %u, %u",
 				newMulti, xid1, xid2);
 
@@ -280,8 +375,8 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
  * MultiXactIdExpand
  *		Add a TransactionId to a pre-existing MultiXactId.
  *
- * If the TransactionId is already a member of the passed MultiXactId,
- * just return it as-is.
+ * If the TransactionId is already a member of the passed MultiXactId with the
+ * same status, just return it as-is.
  *
  * Note that we do NOT actually modify the membership of a pre-existing
  * MultiXactId; instead we create a new one.  This is necessary to avoid
@@ -291,11 +386,11 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
  * is handled by the lower-level routines.
  */
 MultiXactId
-MultiXactIdExpand(MultiXactId multi, TransactionId xid)
+MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 {
 	MultiXactId newMulti;
-	TransactionId *members;
-	TransactionId *newMembers;
+	MultiXactMember *members;
+	MultiXactMember *newMembers;
 	int			nmembers;
 	int			i;
 	int			j;
@@ -310,6 +405,8 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
 
 	if (nmembers < 0)
 	{
+		MultiXactMember		member;
+
 		/*
 		 * The MultiXactId is obsolete.  This can only happen if all the
 		 * MultiXactId members stop running between the caller checking and
@@ -317,7 +414,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
 		 * caller, but it would complicate the API and it's unlikely to happen
 		 * too often, so just deal with it by creating a singleton MultiXact.
 		 */
-		newMulti = CreateMultiXactId(1, &xid);
+		member.xid = xid;
+		member.status = status;
+		newMulti = CreateMultiXactId(1, &member);
 
 		debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
 					multi, newMulti);
@@ -325,12 +424,13 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
 	}
 
 	/*
-	 * If the TransactionId is already a member of the MultiXactId, just
-	 * return the existing MultiXactId.
+	 * If the TransactionId is already a member of the MultiXactId with the
+	 * same status, just return the existing MultiXactId.
 	 */
 	for (i = 0; i < nmembers; i++)
 	{
-		if (TransactionIdEquals(members[i], xid))
+		if (TransactionIdEquals(members[i].xid, xid) &&
+			(members[i].status == status))
 		{
 			debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
 						xid, multi);
@@ -345,16 +445,20 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
 	 * optimization, but a useful one.	Note we have the same race condition
 	 * here as above: j could be 0 at the end of the loop.)
 	 */
-	newMembers = (TransactionId *)
-		palloc(sizeof(TransactionId) * (nmembers + 1));
+	newMembers = (MultiXactMember *)
+		palloc(sizeof(MultiXactMember) * (nmembers + 1));
 
 	for (i = 0, j = 0; i < nmembers; i++)
 	{
-		if (TransactionIdIsInProgress(members[i]))
-			newMembers[j++] = members[i];
+		if (TransactionIdIsInProgress(members[i].xid))
+		{
+			newMembers[j].xid = members[i].xid;
+			newMembers[j++].status = members[i].status;
+		}
 	}
 
-	newMembers[j++] = xid;
+	newMembers[j].xid = xid;
+	newMembers[j++].status = status;
 	newMulti = CreateMultiXactId(j, newMembers);
 
 	pfree(members);
@@ -376,7 +480,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
 bool
 MultiXactIdIsRunning(MultiXactId multi)
 {
-	TransactionId *members;
+	MultiXactMember *members;
 	int			nmembers;
 	int			i;
 
@@ -397,7 +501,7 @@ MultiXactIdIsRunning(MultiXactId multi)
 	 */
 	for (i = 0; i < nmembers; i++)
 	{
-		if (TransactionIdIsCurrentTransactionId(members[i]))
+		if (TransactionIdIsCurrentTransactionId(members[i].xid))
 		{
 			debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
 			pfree(members);
@@ -412,10 +516,10 @@ MultiXactIdIsRunning(MultiXactId multi)
 	 */
 	for (i = 0; i < nmembers; i++)
 	{
-		if (TransactionIdIsInProgress(members[i]))
+		if (TransactionIdIsInProgress(members[i].xid))
 		{
 			debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
-						i, members[i]);
+						i, members[i].xid);
 			pfree(members);
 			return true;
 		}
@@ -429,145 +533,6 @@ MultiXactIdIsRunning(MultiXactId multi)
 }
 
 /*
- * MultiXactIdIsCurrent
- *		Returns true if the current transaction is a member of the MultiXactId.
- *
- * We return true if any live subtransaction of the current top-level
- * transaction is a member.  This is appropriate for the same reason that a
- * lock held by any such subtransaction is globally equivalent to a lock
- * held by the current subtransaction: no such lock could be released without
- * aborting this subtransaction, and hence releasing its locks.  So it's not
- * necessary to add the current subxact to the MultiXact separately.
- */
-bool
-MultiXactIdIsCurrent(MultiXactId multi)
-{
-	bool		result = false;
-	TransactionId *members;
-	int			nmembers;
-	int			i;
-
-	nmembers = GetMultiXactIdMembers(multi, &members);
-
-	if (nmembers < 0)
-		return false;
-
-	for (i = 0; i < nmembers; i++)
-	{
-		if (TransactionIdIsCurrentTransactionId(members[i]))
-		{
-			result = true;
-			break;
-		}
-	}
-
-	pfree(members);
-
-	return result;
-}
-
-/*
- * MultiXactIdSetOldestMember
- *		Save the oldest MultiXactId this transaction could be a member of.
- *
- * We set the OldestMemberMXactId for a given transaction the first time
- * it's going to acquire a shared lock.  We need to do this even if we end
- * up using a TransactionId instead of a MultiXactId, because there is a
- * chance that another transaction would add our XID to a MultiXactId.
- *
- * The value to set is the next-to-be-assigned MultiXactId, so this is meant
- * to be called just before acquiring a shared lock.
- */
-void
-MultiXactIdSetOldestMember(void)
-{
-	if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]))
-	{
-		MultiXactId nextMXact;
-
-		/*
-		 * You might think we don't need to acquire a lock here, since
-		 * fetching and storing of TransactionIds is probably atomic, but in
-		 * fact we do: suppose we pick up nextMXact and then lose the CPU for
-		 * a long time.  Someone else could advance nextMXact, and then
-		 * another someone else could compute an OldestVisibleMXactId that
-		 * would be after the value we are going to store when we get control
-		 * back.  Which would be wrong.
-		 */
-		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
-
-		/*
-		 * We have to beware of the possibility that nextMXact is in the
-		 * wrapped-around state.  We don't fix the counter itself here, but we
-		 * must be sure to store a valid value in our array entry.
-		 */
-		nextMXact = MultiXactState->nextMXact;
-		if (nextMXact < FirstMultiXactId)
-			nextMXact = FirstMultiXactId;
-
-		OldestMemberMXactId[MyBackendId] = nextMXact;
-
-		LWLockRelease(MultiXactGenLock);
-
-		debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
-					MyBackendId, nextMXact);
-	}
-}
-
-/*
- * MultiXactIdSetOldestVisible
- *		Save the oldest MultiXactId this transaction considers possibly live.
- *
- * We set the OldestVisibleMXactId for a given transaction the first time
- * it's going to inspect any MultiXactId.  Once we have set this, we are
- * guaranteed that the checkpointer won't truncate off SLRU data for
- * MultiXactIds at or after our OldestVisibleMXactId.
- *
- * The value to set is the oldest of nextMXact and all the valid per-backend
- * OldestMemberMXactId[] entries.  Because of the locking we do, we can be
- * certain that no subsequent call to MultiXactIdSetOldestMember can set
- * an OldestMemberMXactId[] entry older than what we compute here.	Therefore
- * there is no live transaction, now or later, that can be a member of any
- * MultiXactId older than the OldestVisibleMXactId we compute here.
- */
-static void
-MultiXactIdSetOldestVisible(void)
-{
-	if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId]))
-	{
-		MultiXactId oldestMXact;
-		int			i;
-
-		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
-
-		/*
-		 * We have to beware of the possibility that nextMXact is in the
-		 * wrapped-around state.  We don't fix the counter itself here, but we
-		 * must be sure to store a valid value in our array entry.
-		 */
-		oldestMXact = MultiXactState->nextMXact;
-		if (oldestMXact < FirstMultiXactId)
-			oldestMXact = FirstMultiXactId;
-
-		for (i = 1; i <= MaxOldestSlot; i++)
-		{
-			MultiXactId thisoldest = OldestMemberMXactId[i];
-
-			if (MultiXactIdIsValid(thisoldest) &&
-				MultiXactIdPrecedes(thisoldest, oldestMXact))
-				oldestMXact = thisoldest;
-		}
-
-		OldestVisibleMXactId[MyBackendId] = oldestMXact;
-
-		LWLockRelease(MultiXactGenLock);
-
-		debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
-					MyBackendId, oldestMXact);
-	}
-}
-
-/*
  * MultiXactIdWait
  *		Sleep on a MultiXactId.
  *
@@ -576,17 +541,24 @@ MultiXactIdSetOldestVisible(void)
  * this would not merely be useless but would lead to Assert failure inside
  * XactLockTableWait.  By the time this returns, it is certain that all
  * transactions *of other backends* that were members of the MultiXactId
- * are dead (and no new ones can have been added, since it is not legal
- * to add members to an existing MultiXactId).
+ * that conflict with the requested status are dead (and no new ones can have
+ * been added, since it is not legal to add members to an existing
+ * MultiXactId).
+ *
+ * We return the number of members that we did not test for.  This is dubbed
+ * "remaining" as in "the number of members that remaing running", but this is
+ * slightly incorrect, because lockers whose status did not conflict with ours
+ * are not even considered and so might have gone away anyway.
  *
  * But by the time we finish sleeping, someone else may have changed the Xmax
  * of the containing tuple, so the caller needs to iterate on us somehow.
  */
 void
-MultiXactIdWait(MultiXactId multi)
+MultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining)
 {
-	TransactionId *members;
+	MultiXactMember *members;
 	int			nmembers;
+	int			remain = 0;
 
 	nmembers = GetMultiXactIdMembers(multi, &members);
 
@@ -596,28 +568,37 @@ MultiXactIdWait(MultiXactId multi)
 
 		for (i = 0; i < nmembers; i++)
 		{
-			TransactionId member = members[i];
-
 			debug_elog4(DEBUG2, "MultiXactIdWait: waiting for %d (%u)",
-						i, member);
-			if (!TransactionIdIsCurrentTransactionId(member))
-				XactLockTableWait(member);
-		}
+						i, members[i].xid);
+			if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
+				!MultiXactStatusConflict(members[i].status, status))
+			{
+				remain++;
+				continue;
+			}
 
-		pfree(members);
+			XactLockTableWait(members[i].xid);
+		}
 	}
+
+	*remaining = remain;
 }
 
 /*
  * ConditionalMultiXactIdWait
  *		As above, but only lock if we can get the lock without blocking.
+ *
+ * Note that in case we return false, the number of remaining members is
+ * not to be trusted.
  */
 bool
-ConditionalMultiXactIdWait(MultiXactId multi)
+ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+						   int *remaining)
 {
 	bool		result = true;
-	TransactionId *members;
+	MultiXactMember *members;
 	int			nmembers;
+	int			remain = 0;
 
 	nmembers = GetMultiXactIdMembers(multi, &members);
 
@@ -627,21 +608,26 @@ ConditionalMultiXactIdWait(MultiXactId multi)
 
 		for (i = 0; i < nmembers; i++)
 		{
-			TransactionId member = members[i];
+			TransactionId member = members[i].xid;
 
 			debug_elog4(DEBUG2, "ConditionalMultiXactIdWait: trying %d (%u)",
 						i, member);
-			if (!TransactionIdIsCurrentTransactionId(member))
+			if (TransactionIdIsCurrentTransactionId(member) ||
+				!MultiXactStatusConflict(members[i].status, status))
 			{
-				result = ConditionalXactLockTableWait(member);
-				if (!result)
-					break;
+				remain++;
+				continue;
 			}
+			result = ConditionalXactLockTableWait(member);
+			if (!result)
+				break;
 		}
 
 		pfree(members);
 	}
 
+	*remaining = remain;
+
 	return result;
 }
 
@@ -652,10 +638,10 @@ ConditionalMultiXactIdWait(MultiXactId multi)
  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
  * given TransactionIds as members.  Returns the newly created MultiXactId.
  *
- * NB: the passed xids[] array will be sorted in-place.
+ * NB: the passed members[] array will be sorted in-place.
  */
 static MultiXactId
-CreateMultiXactId(int nxids, TransactionId *xids)
+CreateMultiXactId(int nmembers, MultiXactMember *members)
 {
 	MultiXactId multi;
 	MultiXactOffset offset;
@@ -663,7 +649,7 @@ CreateMultiXactId(int nxids, TransactionId *xids)
 	xl_multixact_create xlrec;
 
 	debug_elog3(DEBUG2, "Create: %s",
-				mxid_to_string(InvalidMultiXactId, nxids, xids));
+				mxid_to_string(InvalidMultiXactId, nmembers, members));
 
 	/*
 	 * See if the same set of XIDs already exists in our cache; if so, just
@@ -675,7 +661,7 @@ CreateMultiXactId(int nxids, TransactionId *xids)
 	 * corner cases where someone else added us to a MultiXact without our
 	 * knowledge, but it's not worth checking for.)
 	 */
-	multi = mXactCacheGetBySet(nxids, xids);
+	multi = mXactCacheGetBySet(nmembers, members);
 	if (MultiXactIdIsValid(multi))
 	{
 		debug_elog2(DEBUG2, "Create: in cache!");
@@ -687,7 +673,7 @@ CreateMultiXactId(int nxids, TransactionId *xids)
 	 * in the OFFSETs and MEMBERs files.  NB: this routine does
 	 * START_CRIT_SECTION().
 	 */
-	multi = GetNewMultiXactId(nxids, &offset);
+	multi = GetNewMultiXactId(nmembers, &offset);
 
 	/*
 	 * Make an XLOG entry describing the new MXID.
@@ -704,27 +690,32 @@ CreateMultiXactId(int nxids, TransactionId *xids)
 	 */
 	xlrec.mid = multi;
 	xlrec.moff = offset;
-	xlrec.nxids = nxids;
+	xlrec.nmembers = nmembers;
 
+	/*
+	 * XXX Note: there's a lot of padding space in MultiXactMember.  We could
+	 * find a more compact representation of this Xlog record -- perhaps all the
+	 * status flags in one XLogRecData, then all the xids in another one?
+	 */
 	rdata[0].data = (char *) (&xlrec);
 	rdata[0].len = MinSizeOfMultiXactCreate;
 	rdata[0].buffer = InvalidBuffer;
 	rdata[0].next = &(rdata[1]);
-	rdata[1].data = (char *) xids;
-	rdata[1].len = nxids * sizeof(TransactionId);
+	rdata[1].data = (char *) members;
+	rdata[1].len = nmembers * sizeof(MultiXactMember);
 	rdata[1].buffer = InvalidBuffer;
 	rdata[1].next = NULL;
 
 	(void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata);
 
 	/* Now enter the information into the OFFSETs and MEMBERs logs */
-	RecordNewMultiXact(multi, offset, nxids, xids);
+	RecordNewMultiXact(multi, offset, nmembers, members);
 
 	/* Done with critical section */
 	END_CRIT_SECTION();
 
 	/* Store the new MultiXactId in the local cache, too */
-	mXactCachePut(multi, nxids, xids);
+	mXactCachePut(multi, nmembers, members);
 
 	debug_elog2(DEBUG2, "Create: all done");
 
@@ -739,7 +730,7 @@ CreateMultiXactId(int nxids, TransactionId *xids)
  */
 static void
 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
-				   int nxids, TransactionId *xids)
+				   int nmembers, MultiXactMember *members)
 {
 	int			pageno;
 	int			prev_pageno;
@@ -775,12 +766,22 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 
 	prev_pageno = -1;
 
-	for (i = 0; i < nxids; i++, offset++)
+	for (i = 0; i < nmembers; i++, offset++)
 	{
 		TransactionId *memberptr;
+		uint32	   *flagsptr;
+		uint32		flagsval;
+		int			bshift;
+		int			flagsoff;
+		int			memberoff;
+
+		/* this status value is not representable on disk */
+		Assert(members[i].status < MultiXactStatusKeyUpdate);
 
 		pageno = MXOffsetToMemberPage(offset);
-		entryno = MXOffsetToMemberEntry(offset);
+		memberoff = MXOffsetToMemberOffset(offset);
+		flagsoff = MXOffsetToFlagsOffset(offset);
+		bshift = MXOffsetToFlagsBitShift(offset);
 
 		if (pageno != prev_pageno)
 		{
@@ -789,10 +790,17 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 		}
 
 		memberptr = (TransactionId *)
-			MultiXactMemberCtl->shared->page_buffer[slotno];
-		memberptr += entryno;
+			(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 
-		*memberptr = xids[i];
+		*memberptr = members[i].xid;
+
+		flagsptr = (uint32 *)
+			(MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+
+		flagsval = *flagsptr;
+		flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+		flagsval |= (members[i].status << bshift);
+		*flagsptr = flagsval;
 
 		MultiXactMemberCtl->shared->page_dirty[slotno] = true;
 	}
@@ -816,21 +824,18 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
  * caller must end the critical section after writing SLRU data.
  */
 static MultiXactId
-GetNewMultiXactId(int nxids, MultiXactOffset *offset)
+GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 {
 	MultiXactId result;
 	MultiXactOffset nextOffset;
 
-	debug_elog3(DEBUG2, "GetNew: for %d xids", nxids);
-
-	/* MultiXactIdSetOldestMember() must have been called already */
-	Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
+	debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
 
 	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 
-	/* Handle wraparound of the nextMXact counter */
-	if (MultiXactState->nextMXact < FirstMultiXactId)
-		MultiXactState->nextMXact = FirstMultiXactId;
+	/* Handle corner cases of the nextMXact counter */
+	MultiXactState->nextMXact =
+		HandleMxactOffsetCornerCases(MultiXactState->nextMXact);
 
 	/*
 	 * Assign the MXID, and make sure there is room for it in the file.
@@ -848,12 +853,12 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
 	if (nextOffset == 0)
 	{
 		*offset = 1;
-		nxids++;				/* allocate member slot 0 too */
+		nmembers++;				/* allocate member slot 0 too */
 	}
 	else
 		*offset = nextOffset;
 
-	ExtendMultiXactMember(nextOffset, nxids);
+	ExtendMultiXactMember(nextOffset, nmembers);
 
 	/*
 	 * Critical section from here until caller has written the data into the
@@ -870,13 +875,14 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
 	 *
 	 * We don't care about MultiXactId wraparound here; it will be handled by
 	 * the next iteration.	But note that nextMXact may be InvalidMultiXactId
-	 * after this routine exits, so anyone else looking at the variable must
-	 * be prepared to deal with that.  Similarly, nextOffset may be zero, but
-	 * we won't use that as the actual start offset of the next multixact.
+	 * or the first value on a segment-beggining page after this routine exits,
+	 * so anyone else looking at the variable must be prepared to deal with
+	 * either case.  Similarly, nextOffset may be zero, but we won't use that
+	 * as the actual start offset of the next multixact.
 	 */
 	(MultiXactState->nextMXact)++;
 
-	MultiXactState->nextOffset += nxids;
+	MultiXactState->nextOffset += nmembers;
 
 	LWLockRelease(MultiXactGenLock);
 
@@ -885,15 +891,37 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
 }
 
 /*
+ * HandleMxactOffsetCornerCases
+ * 		Properly handle corner cases of MultiXactId enumeration
+ *
+ * This function takes a MultiXactId and returns a value that's actually a
+ * valid multi, that is, it skips the first two values of any segment-
+ * beginning page, which are used to store the truncateXid and
+ * truncateXidEpoch.
+ */
+static MultiXactId
+HandleMxactOffsetCornerCases(MultiXactId multi)
+{
+	if (multi < FirstMultiXactId)
+		return FirstMultiXactId;
+
+	if (MultiXactIdToOffsetEntry(multi) == 0 &&
+		multi % SLRU_PAGES_PER_SEGMENT == 0)
+		return multi + 2;
+
+	return multi;
+}
+
+/*
  * GetMultiXactIdMembers
- *		Returns the set of TransactionIds that make up a MultiXactId
+ *		Returns the set of MultiXactMembers that make up a MultiXactId
  *
  * We return -1 if the MultiXactId is too old to possibly have any members
  * still running; in that case we have not actually looked them up, and
- * *xids is not set.
+ * *members is not set.
  */
 int
-GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
+GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members)
 {
 	int			pageno;
 	int			prev_pageno;
@@ -904,64 +932,61 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
 	int			length;
 	int			truelength;
 	int			i;
+	MultiXactId oldestMXact;
 	MultiXactId nextMXact;
 	MultiXactId tmpMXact;
 	MultiXactOffset nextOffset;
-	TransactionId *ptr;
+	MultiXactMember *ptr;
 
 	debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
 
 	Assert(MultiXactIdIsValid(multi));
 
 	/* See if the MultiXactId is in the local cache */
-	length = mXactCacheGetById(multi, xids);
+	length = mXactCacheGetById(multi, members);
 	if (length >= 0)
 	{
 		debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
-					mxid_to_string(multi, length, *xids));
+					mxid_to_string(multi, length, *members));
 		return length;
 	}
 
-	/* Set our OldestVisibleMXactId[] entry if we didn't already */
-	MultiXactIdSetOldestVisible();
-
 	/*
 	 * We check known limits on MultiXact before resorting to the SLRU area.
 	 *
-	 * An ID older than our OldestVisibleMXactId[] entry can't possibly still
-	 * be running, and we'd run the risk of trying to read already-truncated
-	 * SLRU data if we did try to examine it.
+	 * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
+	 * useful; it should have already been frozen by vacuum.  We've truncated
+	 * the on-disk structures anyway, so we return empty if such a value is
+	 * queried.
 	 *
 	 * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
 	 * seen, it implies undetected ID wraparound has occurred.	We just
 	 * silently assume that such an ID is no longer running.
 	 *
 	 * Shared lock is enough here since we aren't modifying any global state.
-	 * Also, we can examine our own OldestVisibleMXactId without the lock,
-	 * since no one else is allowed to change it.
-	 */
-	if (MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId]))
-	{
-		debug_elog2(DEBUG2, "GetMembers: it's too old");
-		*xids = NULL;
-		return -1;
-	}
-
-	/*
+	 *
 	 * Acquire the shared lock just long enough to grab the current counter
 	 * values.	We may need both nextMXact and nextOffset; see below.
 	 */
 	LWLockAcquire(MultiXactGenLock, LW_SHARED);
 
+	oldestMXact = MultiXactState->oldestMultiXactId;
 	nextMXact = MultiXactState->nextMXact;
 	nextOffset = MultiXactState->nextOffset;
 
 	LWLockRelease(MultiXactGenLock);
 
+	if (MultiXactIdPrecedes(multi, oldestMXact))
+	{
+		debug_elog2(DEBUG2, "GetMembers: it's too old");
+		*members = NULL;
+		return -1;
+	}
+
 	if (!MultiXactIdPrecedes(multi, nextMXact))
 	{
 		debug_elog2(DEBUG2, "GetMembers: it's too new!");
-		*xids = NULL;
+		*members = NULL;
 		return -1;
 	}
 
@@ -1026,9 +1051,8 @@ retry:
 	{
 		MultiXactOffset nextMXOffset;
 
-		/* handle wraparound if needed */
-		if (tmpMXact < FirstMultiXactId)
-			tmpMXact = FirstMultiXactId;
+		/* Handle corner cases if needed */
+		tmpMXact = HandleMxactOffsetCornerCases(tmpMXact);
 
 		prev_pageno = pageno;
 
@@ -1055,8 +1079,8 @@ retry:
 
 	LWLockRelease(MultiXactOffsetControlLock);
 
-	ptr = (TransactionId *) palloc(length * sizeof(TransactionId));
-	*xids = ptr;
+	ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
+	*members = ptr;
 
 	/* Now get the members themselves. */
 	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
@@ -1066,9 +1090,13 @@ retry:
 	for (i = 0; i < length; i++, offset++)
 	{
 		TransactionId *xactptr;
+		uint32	   *flagsptr;
+		int			flagsoff;
+		int			bshift;
+		int			memberoff;
 
 		pageno = MXOffsetToMemberPage(offset);
-		entryno = MXOffsetToMemberEntry(offset);
+		memberoff = MXOffsetToMemberOffset(offset);
 
 		if (pageno != prev_pageno)
 		{
@@ -1077,8 +1105,7 @@ retry:
 		}
 
 		xactptr = (TransactionId *)
-			MultiXactMemberCtl->shared->page_buffer[slotno];
-		xactptr += entryno;
+			(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 
 		if (!TransactionIdIsValid(*xactptr))
 		{
@@ -1087,7 +1114,13 @@ retry:
 			continue;
 		}
 
-		ptr[truelength++] = *xactptr;
+		flagsoff = MXOffsetToFlagsOffset(offset);
+		bshift = MXOffsetToFlagsBitShift(offset);
+		flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+
+		ptr[truelength].xid = *xactptr;
+		ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+		truelength++;
 	}
 
 	LWLockRelease(MultiXactMemberControlLock);
@@ -1103,6 +1136,30 @@ retry:
 }
 
 /*
+ * mxactMemberComparator
+ *		qsort comparison function for MultiXactMember
+ *
+ * We can't use wraparound comparison for XIDs because that does not respect
+ * the triangle inequality!  Any old sort order will do.
+ */
+static int
+mxactMemberComparator(const void *arg1, const void *arg2)
+{
+	MultiXactMember member1 = *(const MultiXactMember *) arg1;
+	MultiXactMember member2 = *(const MultiXactMember *) arg2;
+
+	if (member1.xid > member2.xid)
+		return 1;
+	if (member1.xid < member2.xid)
+		return -1;
+	if (member1.status > member2.status)
+		return 1;
+	if (member1.status < member2.status)
+		return -1;
+	return 0;
+}
+
+/*
  * mXactCacheGetBySet
  *		returns a MultiXactId from the cache based on the set of
  *		TransactionIds that compose it, or InvalidMultiXactId if
@@ -1113,26 +1170,27 @@ retry:
  * for the majority of tuples, thus keeping MultiXactId usage low (saving
  * both I/O and wraparound issues).
  *
- * NB: the passed xids[] array will be sorted in-place.
+ * NB: the passed members array will be sorted in-place.
  */
 static MultiXactId
-mXactCacheGetBySet(int nxids, TransactionId *xids)
+mXactCacheGetBySet(int nmembers, MultiXactMember *members)
 {
 	mXactCacheEnt *entry;
 
 	debug_elog3(DEBUG2, "CacheGet: looking for %s",
-				mxid_to_string(InvalidMultiXactId, nxids, xids));
+				mxid_to_string(InvalidMultiXactId, nmembers, members));
 
 	/* sort the array so comparison is easy */
-	qsort(xids, nxids, sizeof(TransactionId), xidComparator);
+	qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
 
 	for (entry = MXactCache; entry != NULL; entry = entry->next)
 	{
-		if (entry->nxids != nxids)
+		if (entry->nmembers != nmembers)
 			continue;
 
 		/* We assume the cache entries are sorted */
-		if (memcmp(xids, entry->xids, nxids * sizeof(TransactionId)) == 0)
+		/* XXX we assume the unused bits in "status" are zeroed */
+		if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
 		{
 			debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
 			return entry->multi;
@@ -1145,14 +1203,14 @@ mXactCacheGetBySet(int nxids, TransactionId *xids)
 
 /*
  * mXactCacheGetById
- *		returns the composing TransactionId set from the cache for a
+ *		returns the composing MultiXactMember set from the cache for a
  *		given MultiXactId, if present.
  *
  * If successful, *xids is set to the address of a palloc'd copy of the
- * TransactionId set.  Return value is number of members, or -1 on failure.
+ * MultiXactMember set.  Return value is number of members, or -1 on failure.
  */
 static int
-mXactCacheGetById(MultiXactId multi, TransactionId **xids)
+mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
 {
 	mXactCacheEnt *entry;
 
@@ -1162,18 +1220,18 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids)
 	{
 		if (entry->multi == multi)
 		{
-			TransactionId *ptr;
+			MultiXactMember *ptr;
 			Size		size;
 
-			size = sizeof(TransactionId) * entry->nxids;
-			ptr = (TransactionId *) palloc(size);
-			*xids = ptr;
+			size = sizeof(MultiXactMember) * entry->nmembers;
+			ptr = (MultiXactMember *) palloc(size);
+			*members = ptr;
 
-			memcpy(ptr, entry->xids, size);
+			memcpy(ptr, entry->members, size);
 
 			debug_elog3(DEBUG2, "CacheGet: found %s",
-						mxid_to_string(multi, entry->nxids, entry->xids));
-			return entry->nxids;
+						mxid_to_string(multi, entry->nmembers, entry->members));
+			return entry->nmembers;
 		}
 	}
 
@@ -1186,12 +1244,12 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids)
  *		Add a new MultiXactId and its composing set into the local cache.
  */
 static void
-mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
+mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
 {
 	mXactCacheEnt *entry;
 
 	debug_elog3(DEBUG2, "CachePut: storing %s",
-				mxid_to_string(multi, nxids, xids));
+				mxid_to_string(multi, nmembers, members));
 
 	if (MXactContext == NULL)
 	{
@@ -1206,15 +1264,15 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
 
 	entry = (mXactCacheEnt *)
 		MemoryContextAlloc(MXactContext,
-						   offsetof(mXactCacheEnt, xids) +
-						   nxids * sizeof(TransactionId));
+						   offsetof(mXactCacheEnt, members) +
+						   nmembers * sizeof(MultiXactMember));
 
 	entry->multi = multi;
-	entry->nxids = nxids;
-	memcpy(entry->xids, xids, nxids * sizeof(TransactionId));
+	entry->nmembers = nmembers;
+	memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
 
 	/* mXactCacheGetBySet assumes the entries are sorted, so sort them */
-	qsort(entry->xids, nxids, sizeof(TransactionId), xidComparator);
+	qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
 
 	entry->next = MXactCache;
 	MXactCache = entry;
@@ -1222,15 +1280,38 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
 
 #ifdef MULTIXACT_DEBUG
 static char *
-mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids)
+mxstatus_to_string(MultiXactStatus status)
 {
-	char	   *str = palloc(15 * (nxids + 1) + 4);
+	switch (status)
+	{
+		case MultiXactStatusForKeyShare:
+			return "keysh";
+		case MultiXactStatusForShare:
+			return "sh";
+		case MultiXactStatusForUpdate:
+			return "forupd";
+		case MultiXactStatusUpdate:
+			return "upd";
+		case MultiXactStatusKeyUpdate:
+			return "keyup";
+		default:
+			elog(ERROR, "unrecognized multixact status %d", status);
+			return "";
+	}
+}
+
+static char *
+mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
+{
+	char	   *str = palloc(15 * (nmembers + 1) + 4);
 	int			i;
 
-	snprintf(str, 47, "%u %d[%u", multi, nxids, xids[0]);
+	snprintf(str, 47, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
+			 mxstatus_to_string(members[0].status));
 
-	for (i = 1; i < nxids; i++)
-		snprintf(str + strlen(str), 17, ", %u", xids[i]);
+	for (i = 1; i < nmembers; i++)
+		snprintf(str + strlen(str), 17, ", %u (%s)", members[i].xid,
+				 mxstatus_to_string(members[i].status));
 
 	strcat(str, "]");
 	return str;
@@ -1247,16 +1328,6 @@ void
 AtEOXact_MultiXact(void)
 {
 	/*
-	 * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
-	 * which should only be valid while within a transaction.
-	 *
-	 * We assume that storing a MultiXactId is atomic and so we need not take
-	 * MultiXactGenLock to do this.
-	 */
-	OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
-	OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
-
-	/*
 	 * Discard the local MultiXactId cache.  Since MXactContext was created as
 	 * a child of TopTransactionContext, we needn't delete it explicitly.
 	 */
@@ -1267,18 +1338,11 @@ AtEOXact_MultiXact(void)
 /*
  * AtPrepare_MultiXact
  *		Save multixact state at 2PC tranasction prepare
- *
- * In this phase, we only store our OldestMemberMXactId value in the two-phase
- * state file.
  */
 void
 AtPrepare_MultiXact(void)
 {
-	MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId];
-
-	if (MultiXactIdIsValid(myOldestMember))
-		RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
-							   &myOldestMember, sizeof(MultiXactId));
+	/* nothing to do */
 }
 
 /*
@@ -1288,41 +1352,6 @@ AtPrepare_MultiXact(void)
 void
 PostPrepare_MultiXact(TransactionId xid)
 {
-	MultiXactId myOldestMember;
-
-	/*
-	 * Transfer our OldestMemberMXactId value to the slot reserved for the
-	 * prepared transaction.
-	 */
-	myOldestMember = OldestMemberMXactId[MyBackendId];
-	if (MultiXactIdIsValid(myOldestMember))
-	{
-		BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid);
-
-		/*
-		 * Even though storing MultiXactId is atomic, acquire lock to make
-		 * sure others see both changes, not just the reset of the slot of the
-		 * current backend. Using a volatile pointer might suffice, but this
-		 * isn't a hot spot.
-		 */
-		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
-
-		OldestMemberMXactId[dummyBackendId] = myOldestMember;
-		OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
-
-		LWLockRelease(MultiXactGenLock);
-	}
-
-	/*
-	 * We don't need to transfer OldestVisibleMXactId value, because the
-	 * transaction is not going to be looking at any more multixacts once it's
-	 * prepared.
-	 *
-	 * We assume that storing a MultiXactId is atomic and so we need not take
-	 * MultiXactGenLock to do this.
-	 */
-	OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
-
 	/*
 	 * Discard the local MultiXactId cache like in AtEOX_MultiXact
 	 */
@@ -1338,17 +1367,7 @@ void
 multixact_twophase_recover(TransactionId xid, uint16 info,
 						   void *recdata, uint32 len)
 {
-	BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid);
-	MultiXactId oldestMember;
-
-	/*
-	 * Get the oldest member XID from the state file record, and set it in the
-	 * OldestMemberMXactId slot reserved for this prepared transaction.
-	 */
-	Assert(len == sizeof(MultiXactId));
-	oldestMember = *((MultiXactId *) recdata);
-
-	OldestMemberMXactId[dummyBackendId] = oldestMember;
+	/* nothing to do */
 }
 
 /*
@@ -1359,11 +1378,7 @@ void
 multixact_twophase_postcommit(TransactionId xid, uint16 info,
 							  void *recdata, uint32 len)
 {
-	BackendId	dummyBackendId = TwoPhaseGetDummyBackendId(xid);
-
-	Assert(len == sizeof(MultiXactId));
-
-	OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId;
+	/* nothing to do */
 }
 
 /*
@@ -1374,7 +1389,7 @@ void
 multixact_twophase_postabort(TransactionId xid, uint16 info,
 							 void *recdata, uint32 len)
 {
-	multixact_twophase_postcommit(xid, info, recdata, len);
+	/* nothing to do */
 }
 
 /*
@@ -1387,11 +1402,7 @@ MultiXactShmemSize(void)
 {
 	Size		size;
 
-#define SHARED_MULTIXACT_STATE_SIZE \
-	add_size(sizeof(MultiXactStateData), \
-			 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
-
-	size = SHARED_MULTIXACT_STATE_SIZE;
+	size = sizeof(MultiXactStateData);
 	size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0));
 	size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0));
 
@@ -1417,24 +1428,17 @@ MultiXactShmemInit(void)
 
 	/* Initialize our shared state struct */
 	MultiXactState = ShmemInitStruct("Shared MultiXact State",
-									 SHARED_MULTIXACT_STATE_SIZE,
+									 sizeof(MultiXactStateData),
 									 &found);
 	if (!IsUnderPostmaster)
 	{
 		Assert(!found);
 
 		/* Make sure we zero out the per-backend state */
-		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
+		MemSet(MultiXactState, 0, sizeof(MultiXactStateData));
 	}
 	else
 		Assert(found);
-
-	/*
-	 * Set up array pointers.  Note that perBackendXactIds[0] is wasted space
-	 * since we only use indexes 1..MaxOldestSlot in each array.
-	 */
-	OldestMemberMXactId = MultiXactState->perBackendXactIds;
-	OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
 }
 
 /*
@@ -1450,7 +1454,7 @@ BootStrapMultiXact(void)
 	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
 
 	/* Create and zero the first page of the offsets log */
-	slotno = ZeroMultiXactOffsetPage(0, false);
+	slotno = ZeroMultiXactOffsetPage(0, false, InvalidTransactionId, 0);
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(MultiXactOffsetCtl, slotno);
@@ -1474,26 +1478,40 @@ BootStrapMultiXact(void)
  * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
  * If writeXlog is TRUE, also emit an XLOG record saying we did this.
  *
+ * If truncateXid is valid, store it in the first position of the page.
+ *
  * The page is not actually written, just set up in shared memory.
  * The slot number of the new page is returned.
  *
  * Control lock must be held at entry, and will be held at exit.
  */
 static int
-ZeroMultiXactOffsetPage(int pageno, bool writeXlog)
+ZeroMultiXactOffsetPage(int pageno, bool writeXlog, TransactionId truncateXid,
+						uint32 truncateXidEpoch)
 {
 	int			slotno;
 
 	slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
 
 	if (writeXlog)
-		WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+		WriteMZeroOffsetPageXlogRec(pageno, truncateXid, truncateXidEpoch);
+
+	if (TransactionIdIsValid(truncateXid))
+	{
+		MultiXactOffset *offptr;
+
+		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+		*(offptr++) = truncateXid;
+		*offptr = truncateXidEpoch;
+
+		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	}
 
 	return slotno;
 }
 
 /*
- * Ditto, for MultiXactMember
+ * Ditto for MultiXactMember, except these don't worry about truncation info.
  */
 static int
 ZeroMultiXactMemberPage(int pageno, bool writeXlog)
@@ -1503,7 +1521,7 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog)
 	slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
 
 	if (writeXlog)
-		WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+		WriteMZeroMemberPageXlogRec(pageno);
 
 	return slotno;
 }
@@ -1525,6 +1543,7 @@ StartupMultiXact(void)
 	MultiXactOffset offset = MultiXactState->nextOffset;
 	int			pageno;
 	int			entryno;
+	int			flagsoff;
 
 	/* Clean up offsets state */
 	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
@@ -1569,28 +1588,30 @@ StartupMultiXact(void)
 	 * Zero out the remainder of the current members page.	See notes in
 	 * TrimCLOG() for motivation.
 	 */
-	entryno = MXOffsetToMemberEntry(offset);
-	if (entryno != 0)
+	flagsoff = MXOffsetToFlagsOffset(offset);
+	if (flagsoff != 0)
 	{
 		int			slotno;
 		TransactionId *xidptr;
+		int			memberoff;
 
+		memberoff = MXOffsetToMemberOffset(offset);
 		slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
-		xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno];
-		xidptr += entryno;
+		xidptr = (TransactionId *)
+			(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 
-		MemSet(xidptr, 0, BLCKSZ - (entryno * sizeof(TransactionId)));
+		MemSet(xidptr, 0, BLCKSZ - memberoff);
+
+		/*
+		 * Note: we don't need to zero out the flag bits in the remaining
+		 * members of the current group, because they are always reset before
+		 * writing.
+		 */
 
 		MultiXactMemberCtl->shared->page_dirty[slotno] = true;
 	}
 
 	LWLockRelease(MultiXactMemberControlLock);
-
-	/*
-	 * Initialize lastTruncationPoint to invalid, ensuring that the first
-	 * checkpoint will try to do truncation.
-	 */
-	MultiXactState->lastTruncationPoint = InvalidMultiXactId;
 }
 
 /*
@@ -1607,22 +1628,31 @@ ShutdownMultiXact(void)
 }
 
 /*
- * Get the next MultiXactId and offset to save in a checkpoint record
+ * Get the next MultiXactId, offset and truncate info to save in a checkpoint
+ * record
  */
 void
 MultiXactGetCheckptMulti(bool is_shutdown,
 						 MultiXactId *nextMulti,
-						 MultiXactOffset *nextMultiOffset)
+						 MultiXactOffset *nextMultiOffset,
+						 TransactionId *oldestTruncateXid,
+						 uint32 *oldestTruncateXidEpoch,
+						 MultiXactId *oldestMulti)
 {
 	LWLockAcquire(MultiXactGenLock, LW_SHARED);
 
 	*nextMulti = MultiXactState->nextMXact;
 	*nextMultiOffset = MultiXactState->nextOffset;
+	*oldestTruncateXid = MultiXactState->truncateXid;
+	*oldestTruncateXidEpoch = MultiXactState->truncateXidEpoch;
+	*oldestMulti = MultiXactState->oldestMultiXactId;
 
 	LWLockRelease(MultiXactGenLock);
 
-	debug_elog4(DEBUG2, "MultiXact: checkpoint is nextMulti %u, nextOffset %u",
-				*nextMulti, *nextMultiOffset);
+	debug_elog7(DEBUG2,
+				"MultiXact: checkpoint is nextMulti %u, nextOffset %u; truncate xid %u, epoch %u; oldest multi %u",
+				*nextMulti, *nextMultiOffset, *oldestTruncateXid,
+				*oldestTruncateXidEpoch, *oldestMulti);
 }
 
 /*
@@ -1637,17 +1667,6 @@ CheckPointMultiXact(void)
 	SimpleLruFlush(MultiXactOffsetCtl, true);
 	SimpleLruFlush(MultiXactMemberCtl, true);
 
-	/*
-	 * Truncate the SLRU files.  This could be done at any time, but
-	 * checkpoint seems a reasonable place for it.	There is one exception: if
-	 * we are called during xlog recovery, then shared->latest_page_number
-	 * isn't valid (because StartupMultiXact hasn't been called yet) and so
-	 * SimpleLruTruncate would get confused.  It seems best not to risk
-	 * removing any data during recovery anyway, so don't truncate.
-	 */
-	if (!RecoveryInProgress())
-		TruncateMultiXact();
-
 	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
 }
 
@@ -1670,7 +1689,7 @@ MultiXactSetNextMXact(MultiXactId nextMulti,
 
 /*
  * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
- * and similarly nextOffset is at least minMultiOffset
+ * and similarly nextOffset is at least minMultiOffset.
  *
  * This is used when we can determine minimum safe values from an XLog
  * record (either an on-line checkpoint or an mxact creation log entry).
@@ -1696,6 +1715,9 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti,
 /*
  * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
  *
+ * If the newly allocated page is the first page on the segment, store an
+ * appropriate truncate Xid value in the page first position.
+ *
  * NB: this is called while holding MultiXactGenLock.  We want it to be very
  * fast most of the time; even when it's not so fast, no actual I/O need
  * happen unless we're forced to write out a dirty log or xlog page to make
@@ -1705,6 +1727,8 @@ static void
 ExtendMultiXactOffset(MultiXactId multi)
 {
 	int			pageno;
+	TransactionId truncateXid;
+	uint32		truncateXidEpoch;
 
 	/*
 	 * No work except at first MultiXactId of a page.  But beware: just after
@@ -1716,12 +1740,49 @@ ExtendMultiXactOffset(MultiXactId multi)
 
 	pageno = MultiXactIdToOffsetPage(multi);
 
-	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
+	/*
+	 * Determine the truncateXid and epoch that the new segment needs, if
+	 * this is the first page of the segment.
+	 */
+	if (pageno % SLRU_PAGES_PER_SEGMENT == 0)
+	{
+		TransactionId	nextXid;
+
+		Assert(TransactionIdIsValid(RecentGlobalXmin));
+		truncateXid = RecentGlobalXmin;
+
+		GetNextXidAndEpoch(&nextXid, &truncateXidEpoch);
+		/*
+		 * nextXid is certainly logically later than RecentGlobalXmin.  So if
+		 * it's numerically less, it must have wrapped into the next epoch.
+		 */
+		if (nextXid < truncateXid)
+			truncateXidEpoch--;
+	}
+	else
+	{
+		truncateXid = InvalidTransactionId;
+		truncateXidEpoch = 0;
+	}
 
-	/* Zero the page and make an XLOG entry about it */
-	ZeroMultiXactOffsetPage(pageno, true);
+	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
 
+	/*
+	 * Zero the page, mark it with its truncate info, and make an XLOG entry
+	 * about it.
+	 */
+	ZeroMultiXactOffsetPage(pageno, true, truncateXid, truncateXidEpoch);
 	LWLockRelease(MultiXactOffsetControlLock);
+
+	/*
+	 * Finally, record the new truncation point in shared memory, if
+	 * there isn't one already.
+	 */
+	if (!TransactionIdIsValid(MultiXactState->truncateXid))
+	{
+		MultiXactState->truncateXid = truncateXid;
+		MultiXactState->truncateXidEpoch = truncateXidEpoch;
+	}
 }
 
 /*
@@ -1742,13 +1803,16 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
 	 */
 	while (nmembers > 0)
 	{
-		int			entryno;
+		int			flagsoff;
+		int			flagsbit;
+		int			difference;
 
 		/*
 		 * Only zero when at first entry of a page.
 		 */
-		entryno = MXOffsetToMemberEntry(offset);
-		if (entryno == 0)
+		flagsoff = MXOffsetToFlagsOffset(offset);
+		flagsbit = MXOffsetToFlagsBitShift(offset);
+		if (flagsoff == 0 && flagsbit == 0)
 		{
 			int			pageno;
 
@@ -1763,122 +1827,241 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
 		}
 
 		/* Advance to next page (OK if nmembers goes negative) */
-		offset += (MULTIXACT_MEMBERS_PER_PAGE - entryno);
-		nmembers -= (MULTIXACT_MEMBERS_PER_PAGE - entryno);
+		difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
+		offset += difference;
+		nmembers -= difference;
+	}
+}
+
+/*
+ * Complete a SegmentInfo with the truncate Xid and epoch, as read from its
+ * first page.
+ */
+static void
+fillSegmentInfoData(SlruCtl ctl, SegmentInfo *segment)
+{
+	int			slotno;
+	MultiXactId *offptr;
+
+	/* lock is acquired by SimpleLruReadPage_ReadOnly */
+	/* FIXME it'd be nice not to trash the entire SLRU cache while at this */
+	slotno = SimpleLruReadPage_ReadOnly(ctl, segment->segno, InvalidTransactionId);
+	offptr = (MultiXactId *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+	segment->truncateXid = *offptr;
+	offptr++;
+	segment->truncateXidEpoch = *offptr;
+	offptr++;
+	segment->firstOffset = *offptr;
+	LWLockRelease(ctl->shared->ControlLock);
+}
+
+/* SegmentInfo comparator, for qsort and bsearch */
+static int
+compareTruncateXidEpoch(const void *a, const void *b)
+{
+	const SegmentInfo *sega = (const SegmentInfo *) a;
+	const SegmentInfo *segb = (const SegmentInfo *) b;
+	uint32	epocha = sega->truncateXidEpoch;
+	uint32	epochb = segb->truncateXidEpoch;
+	TransactionId	xida = sega->truncateXid;
+	TransactionId	xidb = segb->truncateXid;
+
+	if (epocha < epochb)
+		return -1;
+	if (epocha > epochb)
+		return 1;
+	if (xida < xidb)
+		return -1;
+	if (xida > xidb)
+		return 1;
+	return 0;
+}
+
+/*
+ * SlruScanDirectory callback
+ * 		This callback is in charge of scanning all existing segments,
+ * 		to determine their respective truncation points.
+ *
+ * This does not delete any segments.
+ */
+static bool
+mxactSlruGathererCb(SlruCtl ctl, char *segname, int segpage,
+					void *data)
+{
+	TruncateCbData *truncdata = (TruncateCbData *) data;
+	SegmentInfo		seg;
+
+	/*
+	 * Keep track of the truncate Xid and other data for the caller to sort out
+	 * the new truncation point.
+	 */
+	seg.segno = segpage % SLRU_PAGES_PER_SEGMENT;
+	fillSegmentInfoData(ctl, &seg);
+
+	if (truncdata->remaining == NULL)
+	{
+		truncdata->remaining_alloc = 8;
+		truncdata->remaining_used = 0;
+		truncdata->remaining = palloc(truncdata->remaining_alloc *
+									  sizeof(SegmentInfo));
 	}
+	else if (truncdata->remaining_used == truncdata->remaining_alloc - 1)
+	{
+		truncdata->remaining_alloc *= 2;
+		truncdata->remaining = repalloc(truncdata->remaining,
+										truncdata->remaining_alloc);
+	}
+	truncdata->remaining[truncdata->remaining_used++] = seg;
+
+	return false;	/* keep going */
 }
 
 /*
  * Remove all MultiXactOffset and MultiXactMember segments before the oldest
  * ones still of interest.
  *
+ * The truncation rules for the Offset SLRU area are:
+ *
+ * 1. the current segment is never to be deleted.
+ * 2. for all the remaining segments, keep track of their respective number
+ *    and truncate Xid info.  The caller is to determine the new truncation
+ *    point from this data.
+ *
  * This is called only during checkpoints.	We assume no more than one
  * backend does this at a time.
  *
  * XXX do we have any issues with needing to checkpoint here?
  */
-static void
-TruncateMultiXact(void)
+void
+TruncateMultiXact(TransactionId frozenXid)
 {
-	MultiXactId nextMXact;
-	MultiXactOffset nextOffset;
-	MultiXactId oldestMXact;
-	MultiXactOffset oldestOffset;
+	TransactionId	currentXid;
+	uint32		frozenXidEpoch;
+	TruncateCbData	truncdata;
+	SegmentInfo *truncateSegment;
+	SegmentInfo	frozenPosition;
 	int			cutoffPage;
 	int			i;
+	TransactionId	newTruncateXid;
+	int		newTruncateXidEpoch;
 
 	/*
-	 * First, compute where we can safely truncate.  Per notes above, this is
-	 * the oldest valid value among all the OldestMemberMXactId[] and
-	 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
+	 * Quick exit #1: if the truncateXid is not valid, bail out.  We do this
+	 * check without a lock so that it's fast in the common case when there's
+	 * only one segment (which cannot be removed).  If a concurrent backend is
+	 * creating a new segment, no problem: it just means we delay removing
+	 * files until we're next called.  This assumes that storing an aligned
+	 * 32-bit value is atomic.
 	 */
-	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	if (!TransactionIdIsValid(MultiXactState->truncateXid))
+		return;
 
 	/*
-	 * We have to beware of the possibility that nextMXact is in the
-	 * wrapped-around state.  We don't fix the counter itself here, but we
-	 * must be sure to use a valid value in our calculation.
+	 * Compute the epoch corresponding to the frozenXid value we were given.
+	 *
+	 * The current Xid value must be logically newer than frozenXid, so if it's
+	 * numerically lower, it must belong to the next epoch.
 	 */
-	nextMXact = MultiXactState->nextMXact;
-	if (nextMXact < FirstMultiXactId)
-		nextMXact = FirstMultiXactId;
+	GetNextXidAndEpoch(&currentXid, &frozenXidEpoch);
+	if (currentXid < frozenXid)
+		frozenXidEpoch--;
 
-	oldestMXact = nextMXact;
-	for (i = 1; i <= MaxOldestSlot; i++)
+	/*
+	 * Quick exit #2: the oldest segment is not yet old enough to be removed.
+	 * In that case we don't need to scan the whole directory.
+	 */
+	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	Assert(frozenXidEpoch >= MultiXactState->truncateXidEpoch);
+	if ((frozenXidEpoch == MultiXactState->truncateXidEpoch) &&
+		(frozenXid < MultiXactState->truncateXid))
 	{
-		MultiXactId thisoldest;
-
-		thisoldest = OldestMemberMXactId[i];
-		if (MultiXactIdIsValid(thisoldest) &&
-			MultiXactIdPrecedes(thisoldest, oldestMXact))
-			oldestMXact = thisoldest;
-		thisoldest = OldestVisibleMXactId[i];
-		if (MultiXactIdIsValid(thisoldest) &&
-			MultiXactIdPrecedes(thisoldest, oldestMXact))
-			oldestMXact = thisoldest;
+		LWLockRelease(MultiXactGenLock);
+		return;
 	}
-
-	/* Save the current nextOffset too */
-	nextOffset = MultiXactState->nextOffset;
-
 	LWLockRelease(MultiXactGenLock);
 
-	debug_elog3(DEBUG2, "MultiXact: truncation point = %u", oldestMXact);
-
 	/*
-	 * If we already truncated at this point, do nothing.  This saves time
-	 * when no MultiXacts are getting used, which is probably not uncommon.
+	 * Have our callback scan the SLRU directory to let us determine the
+	 * truncation point.
 	 */
-	if (MultiXactState->lastTruncationPoint == oldestMXact)
-		return;
+	truncdata.remaining_used = 0;
+	truncdata.remaining_alloc = 0;
+	truncdata.remaining = NULL;
+	SlruScanDirectory(MultiXactOffsetCtl, mxactSlruGathererCb, &truncdata);
 
 	/*
-	 * We need to determine where to truncate MultiXactMember.	If we found a
-	 * valid oldest MultiXactId, read its starting offset; otherwise we use
-	 * the nextOffset value we saved above.
+	 * Determine the maximum segment whose truncateXid is less than the
+	 * truncate point.
 	 */
-	if (oldestMXact == nextMXact)
-		oldestOffset = nextOffset;
-	else
+	frozenPosition.truncateXid = frozenXid;
+	frozenPosition.truncateXidEpoch = frozenXidEpoch;
+	truncateSegment = NULL;
+	for (i = 0; i < truncdata.remaining_used; i++)
 	{
-		int			pageno;
-		int			slotno;
-		int			entryno;
-		MultiXactOffset *offptr;
+		if ((compareTruncateXidEpoch(&frozenPosition,
+									 &(truncdata.remaining[i])) > 0) &&
+			(truncateSegment->segno < truncdata.remaining[i].segno))
+		{
+			truncateSegment = &(truncdata.remaining[i]);
+		}
+	}
 
-		/* lock is acquired by SimpleLruReadPage_ReadOnly */
+	/*
+	 * Nothing to delete? This shouldn't happen, due to quick exit #2 above,
+	 * but we'd better cope.
+	 */
+	if (truncateSegment == NULL)
+		return;
 
-		pageno = MultiXactIdToOffsetPage(oldestMXact);
-		entryno = MultiXactIdToOffsetEntry(oldestMXact);
+	/* truncate MultiXactOffset */
+	SimpleLruTruncate(MultiXactOffsetCtl, firstPageOf(truncateSegment->segno));
 
-		slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, oldestMXact);
-		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
-		offptr += entryno;
-		oldestOffset = *offptr;
+	/*
+	 * And truncate MultiXactMember at the first offset used by the oldest
+	 * remaining segment.
+	 */
+	cutoffPage = MXOffsetToMemberPage(truncateSegment->firstOffset);
 
-		LWLockRelease(MultiXactOffsetControlLock);
-	}
+	SimpleLruTruncate(MultiXactMemberCtl, cutoffPage);
 
 	/*
-	 * The cutoff point is the start of the segment containing oldestMXact. We
-	 * pass the *page* containing oldestMXact to SimpleLruTruncate.
+	 * Finally, update shared memory to keep track of the next usable
+	 * truncation point, if any.  If the truncation point for offsets was the
+	 * last remaining segment, then there's no next truncation point: it will
+	 * be set when the next segment is created.  Otherwise, the second
+	 * remaining segment determines the next truncation point.
 	 */
-	cutoffPage = MultiXactIdToOffsetPage(oldestMXact);
+	newTruncateXid = InvalidTransactionId;
+	newTruncateXidEpoch = 0;
+	for (i = 0; i < truncdata.remaining_used; i++)
+	{
+		if (truncdata.remaining[i].segno == truncateSegment->segno + 1)
+		{
+			newTruncateXid = truncdata.remaining[i].truncateXid;
+			newTruncateXidEpoch = truncdata.remaining[i].truncateXidEpoch;
+			break;
+		}
+	}
 
-	SimpleLruTruncate(MultiXactOffsetCtl, cutoffPage);
+	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 
 	/*
-	 * Also truncate MultiXactMember at the previously determined offset.
+	 * FIXME there's a race condition here: somebody might have created a new
+	 * segment after we finished scanning the dir.  That scenario would leave
+	 * us with an invalid truncateXid in shared memory, which is not an easy
+	 * situation to get out of.  Needs more thought.
 	 */
-	cutoffPage = MXOffsetToMemberPage(oldestOffset);
 
-	SimpleLruTruncate(MultiXactMemberCtl, cutoffPage);
+	MultiXactState->truncateXid = newTruncateXid;
+	MultiXactState->truncateXidEpoch = newTruncateXidEpoch;
 
 	/*
-	 * Set the last known truncation point.  We don't need a lock for this
-	 * since only one backend does checkpoints at a time.
+	 * we also set the oldest visible MultiXactId to the frozenXid value we
+	 * were given; although the segments we kept may have values earlier than
+	 * that, they are not supposed to remain on disk anyway.
 	 */
-	MultiXactState->lastTruncationPoint = oldestMXact;
+	MultiXactState->oldestMultiXactId = frozenXid;
+	LWLockRelease(MultiXactGenLock);
 }
 
 /*
@@ -1947,13 +2130,29 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
 	return (diff < 0);
 }
 
+static void
+WriteMZeroOffsetPageXlogRec(int pageno, TransactionId truncateXid,
+							uint32 truncateXidEpoch)
+{
+	XLogRecData	rdata;
+	MxactZeroOffPg zerooff;
+
+	zerooff.pageno = pageno;
+	zerooff.truncateXid = truncateXid;
+	zerooff.truncateXidEpoch = truncateXidEpoch;
+
+	rdata.data = (char *) (&zerooff);
+	rdata.len = sizeof(MxactZeroOffPg);
+	rdata.buffer = InvalidBuffer;
+	rdata.next = NULL;
+	(void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE, &rdata);
+}
 
 /*
- * Write an xlog record reflecting the zeroing of either a MEMBERs or
- * OFFSETs page (info shows which)
+ * Write an xlog record reflecting the zeroing of either a MEMBERs page.
  */
 static void
-WriteMZeroPageXlogRec(int pageno, uint8 info)
+WriteMZeroMemberPageXlogRec(int pageno)
 {
 	XLogRecData rdata;
 
@@ -1961,7 +2160,7 @@ WriteMZeroPageXlogRec(int pageno, uint8 info)
 	rdata.len = sizeof(int);
 	rdata.buffer = InvalidBuffer;
 	rdata.next = NULL;
-	(void) XLogInsert(RM_MULTIXACT_ID, info, &rdata);
+	(void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_MEM_PAGE, &rdata);
 }
 
 /*
@@ -1977,18 +2176,25 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
 
 	if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
 	{
-		int			pageno;
+		MxactZeroOffPg zerooff;
 		int			slotno;
 
-		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+		memcpy(&zerooff, XLogRecGetData(record), sizeof(MxactZeroOffPg));
 
 		LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
 
-		slotno = ZeroMultiXactOffsetPage(pageno, false);
+		slotno = ZeroMultiXactOffsetPage(zerooff.pageno, false,
+										 zerooff.truncateXid,
+										 zerooff.truncateXidEpoch);
 		SimpleLruWritePage(MultiXactOffsetCtl, slotno);
 		Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
 
 		LWLockRelease(MultiXactOffsetControlLock);
+
+		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+		if (!TransactionIdIsValid(MultiXactState->truncateXid))
+			MultiXactState->truncateXid = zerooff.truncateXid;
+		LWLockRelease(MultiXactGenLock);
 	}
 	else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
 	{
@@ -2008,15 +2214,18 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
 	else if (info == XLOG_MULTIXACT_CREATE_ID)
 	{
 		xl_multixact_create *xlrec = (xl_multixact_create *) XLogRecGetData(record);
-		TransactionId *xids = xlrec->xids;
+		MultiXactMember *members = xlrec->members;
 		TransactionId max_xid;
 		int			i;
 
 		/* Store the data back into the SLRU files */
-		RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xids);
+		RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, members);
 
-		/* Make sure nextMXact/nextOffset are beyond what this record has */
-		MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids);
+		/*
+		 * Make sure nextMXact/nextOffset are beyond what this record has.
+		 * We cannot compute a truncateXid from this.
+		 */
+		MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nmembers);
 
 		/*
 		 * Make sure nextXid is beyond any XID mentioned in the record. This
@@ -2024,10 +2233,10 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
 		 * evidence in the XLOG, but let's be safe.
 		 */
 		max_xid = record->xl_xid;
-		for (i = 0; i < xlrec->nxids; i++)
+		for (i = 0; i < xlrec->nmembers; i++)
 		{
-			if (TransactionIdPrecedes(max_xid, xids[i]))
-				max_xid = xids[i];
+			if (TransactionIdPrecedes(max_xid, members[i].xid))
+				max_xid = members[i].xid;
 		}
 
 		/*
@@ -2055,10 +2264,13 @@ multixact_desc(StringInfo buf, uint8 xl_info, char *rec)
 
 	if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
 	{
-		int			pageno;
+		MxactZeroOffPg zerooff;
 
-		memcpy(&pageno, rec, sizeof(int));
-		appendStringInfo(buf, "zero offsets page: %d", pageno);
+		memcpy(&zerooff, XLogRecGetData(rec), sizeof(MxactZeroOffPg));
+		appendStringInfo(buf, "zero offsets page: %d truncate: %u/%u",
+						 zerooff.pageno,
+						 zerooff.truncateXidEpoch,
+						 zerooff.truncateXid);
 	}
 	else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
 	{
@@ -2072,10 +2284,11 @@ multixact_desc(StringInfo buf, uint8 xl_info, char *rec)
 		xl_multixact_create *xlrec = (xl_multixact_create *) rec;
 		int			i;
 
+		/* XXX describe status too? */
 		appendStringInfo(buf, "create multixact %u offset %u:",
 						 xlrec->mid, xlrec->moff);
-		for (i = 0; i < xlrec->nxids; i++)
-			appendStringInfo(buf, " %u", xlrec->xids[i]);
+		for (i = 0; i < xlrec->nmembers; i++)
+			appendStringInfo(buf, " %u", xlrec->members[i].xid);
 	}
 	else
 		appendStringInfo(buf, "UNKNOWN");
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 85f79b9..facf6f0 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7792,7 +7792,10 @@ CreateCheckPoint(int flags)
 
 	MultiXactGetCheckptMulti(shutdown,
 							 &checkPoint.nextMulti,
-							 &checkPoint.nextMultiOffset);
+							 &checkPoint.nextMultiOffset,
+							 &checkPoint.oldestSegTruncateXid,
+							 &checkPoint.oldestSegTruncateXidEpoch,
+							 &checkPoint.oldestMultiXactId);
 
 	/*
 	 * Having constructed the checkpoint record, ensure all shmem disk buffers
@@ -7930,6 +7933,15 @@ CreateCheckPoint(int flags)
 	if (!RecoveryInProgress())
 		TruncateSUBTRANS(GetOldestXmin(true, false));
 
+	/*
+	 * Also truncate pg_multixact if possible.  We can throw away all data
+	 * before the oldestXid value used by the most recent vacuum.  As with
+	 * subtrans, skip doing this during recovery, because StartupMultiXact
+	 * hasn't been called yet.
+	 */
+	if (!RecoveryInProgress())
+		TruncateMultiXact(checkPoint.oldestXid);
+
 	/* All real work is done, but log before releasing lock. */
 	if (log_checkpoints)
 		LogCheckpointEnd(false);
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 99e130c..078073a 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3001,7 +3001,7 @@ reindex_relation(Oid relid, int flags)
 
 	/* Ensure rd_indexattr is valid; see comments for RelationSetIndexList */
 	if (is_pg_class)
-		(void) RelationGetIndexAttrBitmap(rel);
+		(void) RelationGetIndexAttrBitmap(rel, false);
 
 	PG_TRY();
 	{
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 32985a4..82f1aa7 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1150,6 +1150,7 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
 					 * right.  (Note: this works out properly when the row was
 					 * both inserted and deleted in our xact.)
 					 */
+					Assert(!(targtuple.t_data->t_infomask & HEAP_XMAX_IS_MULTI));
 					if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(targtuple.t_data)))
 						deadrows += 1;
 					else
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index 54660f4..5d0cd9e 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -1090,6 +1090,7 @@ read_info(SeqTable elm, Relation rel, Buffer *buf)
 	 * bit update, ie, don't bother to WAL-log it, since we can certainly do
 	 * this again if the update gets lost.
 	 */
+	Assert(!(tuple.t_data->t_infomask & HEAP_XMAX_IS_MULTI));
 	if (HeapTupleHeaderGetXmax(tuple.t_data) != InvalidTransactionId)
 	{
 		HeapTupleHeaderSetXmax(tuple.t_data, InvalidTransactionId);
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index a6e7268..7c1586f 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -2578,7 +2578,7 @@ ltrmark:;
 		test = heap_lock_tuple(relation, &tuple, &buffer,
 							   &update_ctid, &update_xmax,
 							   estate->es_output_cid,
-							   LockTupleExclusive, false);
+							   LockTupleUpdate, false);
 		switch (test)
 		{
 			case HeapTupleSelfUpdated:
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index f42504c..37a1ca8 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -680,7 +680,7 @@ vac_update_datfrozenxid(void)
 	 * Initialize the "min" calculation with GetOldestXmin, which is a
 	 * reasonable approximation to the minimum relfrozenxid for not-yet-
 	 * committed pg_class entries for new tables; see AddNewRelationTuple().
-	 * Se we cannot produce a wrong minimum by starting with this.
+	 * So we cannot produce a wrong minimum by starting with this.
 	 */
 	newFrozenXid = GetOldestXmin(true, true);
 
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index fd7a9ed..d018a95 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -800,7 +800,7 @@ InitPlan(QueryDesc *queryDesc, int eflags)
 	}
 
 	/*
-	 * Similarly, we have to lock relations selected FOR UPDATE/FOR SHARE
+	 * Similarly, we have to lock relations selected FOR UPDATE/SHARE/KEY SHARE
 	 * before we initialize the plan tree, else we'd be risking lock upgrades.
 	 * While we are at it, build the ExecRowMark list.
 	 */
@@ -820,6 +820,7 @@ InitPlan(QueryDesc *queryDesc, int eflags)
 		{
 			case ROW_MARK_EXCLUSIVE:
 			case ROW_MARK_SHARE:
+			case ROW_MARK_KEYSHARE:
 				relid = getrelid(rc->rti, rangeTable);
 				relation = heap_open(relid, RowShareLock);
 				break;
@@ -1691,7 +1692,7 @@ EvalPlanQual(EState *estate, EPQState *epqstate,
 	/*
 	 * Get and lock the updated version of the row; if fail, return NULL.
 	 */
-	copyTuple = EvalPlanQualFetch(estate, relation, LockTupleExclusive,
+	copyTuple = EvalPlanQualFetch(estate, relation, LockTupleUpdate,
 								  tid, priorXmax);
 
 	if (copyTuple == NULL)
@@ -1929,7 +1930,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
 		/* updated, so look at the updated row */
 		tuple.t_self = tuple.t_data->t_ctid;
 		/* updated row should have xmin matching this xmax */
-		priorXmax = HeapTupleHeaderGetXmax(tuple.t_data);
+		priorXmax = HeapTupleHeaderGetUpdateXid(tuple.t_data);
 		ReleaseBuffer(buffer);
 		/* loop back to fetch next in chain */
 	}
diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c
index 0c48b6b..892fee5 100644
--- a/src/backend/executor/nodeLockRows.c
+++ b/src/backend/executor/nodeLockRows.c
@@ -111,10 +111,22 @@ lnext:
 		tuple.t_self = *((ItemPointer) DatumGetPointer(datum));
 
 		/* okay, try to lock the tuple */
-		if (erm->markType == ROW_MARK_EXCLUSIVE)
-			lockmode = LockTupleExclusive;
-		else
-			lockmode = LockTupleShared;
+		switch (erm->markType)
+		{
+			case ROW_MARK_EXCLUSIVE:
+				lockmode = LockTupleUpdate;
+				break;
+			case ROW_MARK_SHARE:
+				lockmode = LockTupleShare;
+				break;
+			case ROW_MARK_KEYSHARE:
+				lockmode = LockTupleKeyShare;
+				break;
+			default:
+				elog(ERROR, "unsupported rowmark type");
+				lockmode = LockTupleUpdate;	/* keep compiler quiet */
+				break;
+		}
 
 		test = heap_lock_tuple(erm->relation, &tuple, &buffer,
 							   &update_ctid, &update_xmax,
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 63958c3..4345e84 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -2028,7 +2028,7 @@ _copyRowMarkClause(RowMarkClause *from)
 	RowMarkClause *newnode = makeNode(RowMarkClause);
 
 	COPY_SCALAR_FIELD(rti);
-	COPY_SCALAR_FIELD(forUpdate);
+	COPY_SCALAR_FIELD(strength);
 	COPY_SCALAR_FIELD(noWait);
 	COPY_SCALAR_FIELD(pushedDown);
 
@@ -2387,7 +2387,7 @@ _copyLockingClause(LockingClause *from)
 	LockingClause *newnode = makeNode(LockingClause);
 
 	COPY_NODE_FIELD(lockedRels);
-	COPY_SCALAR_FIELD(forUpdate);
+	COPY_SCALAR_FIELD(strength);
 	COPY_SCALAR_FIELD(noWait);
 
 	return newnode;
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index f3a34a1..0f3f914 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -2300,7 +2300,7 @@ static bool
 _equalLockingClause(LockingClause *a, LockingClause *b)
 {
 	COMPARE_NODE_FIELD(lockedRels);
-	COMPARE_SCALAR_FIELD(forUpdate);
+	COMPARE_SCALAR_FIELD(strength);
 	COMPARE_SCALAR_FIELD(noWait);
 
 	return true;
@@ -2371,7 +2371,7 @@ static bool
 _equalRowMarkClause(RowMarkClause *a, RowMarkClause *b)
 {
 	COMPARE_SCALAR_FIELD(rti);
-	COMPARE_SCALAR_FIELD(forUpdate);
+	COMPARE_SCALAR_FIELD(strength);
 	COMPARE_SCALAR_FIELD(noWait);
 	COMPARE_SCALAR_FIELD(pushedDown);
 
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index f7d39ed..5340c07 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -2077,7 +2077,7 @@ _outLockingClause(StringInfo str, LockingClause *node)
 	WRITE_NODE_TYPE("LOCKINGCLAUSE");
 
 	WRITE_NODE_FIELD(lockedRels);
-	WRITE_BOOL_FIELD(forUpdate);
+	WRITE_ENUM_FIELD(strength, LockClauseStrength);
 	WRITE_BOOL_FIELD(noWait);
 }
 
@@ -2255,7 +2255,7 @@ _outRowMarkClause(StringInfo str, RowMarkClause *node)
 	WRITE_NODE_TYPE("ROWMARKCLAUSE");
 
 	WRITE_UINT_FIELD(rti);
-	WRITE_BOOL_FIELD(forUpdate);
+	WRITE_ENUM_FIELD(strength, LockClauseStrength);
 	WRITE_BOOL_FIELD(noWait);
 	WRITE_BOOL_FIELD(pushedDown);
 }
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 29a0e8f..7c08964 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -301,7 +301,7 @@ _readRowMarkClause(void)
 	READ_LOCALS(RowMarkClause);
 
 	READ_UINT_FIELD(rti);
-	READ_BOOL_FIELD(forUpdate);
+	READ_ENUM_FIELD(strength, LockClauseStrength);
 	READ_BOOL_FIELD(noWait);
 	READ_BOOL_FIELD(pushedDown);
 
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index 5b170b3..81b0be1 100644
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -564,11 +564,11 @@ make_outerjoininfo(PlannerInfo *root,
 	Assert(jointype != JOIN_RIGHT);
 
 	/*
-	 * Presently the executor cannot support FOR UPDATE/SHARE marking of rels
+	 * Presently the executor cannot support FOR UPDATE/SHARE/KEY SHARE marking of rels
 	 * appearing on the nullable side of an outer join. (It's somewhat unclear
 	 * what that would mean, anyway: what should we mark when a result row is
 	 * generated from no element of the nullable relation?)  So, complain if
-	 * any nullable rel is FOR UPDATE/SHARE.
+	 * any nullable rel is FOR UPDATE/SHARE/KEY SHARE.
 	 *
 	 * You might be wondering why this test isn't made far upstream in the
 	 * parser.	It's because the parser hasn't got enough info --- consider
@@ -586,7 +586,7 @@ make_outerjoininfo(PlannerInfo *root,
 			(jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels)))
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("SELECT FOR UPDATE/SHARE cannot be applied to the nullable side of an outer join")));
+					 errmsg("SELECT FOR UPDATE/SHARE/KEY SHARE cannot be applied to the nullable side of an outer join")));
 	}
 
 	sjinfo->syn_lefthand = left_rels;
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 5c18b72..5c83d10 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1927,7 +1927,7 @@ preprocess_rowmarks(PlannerInfo *root)
 	if (parse->rowMarks)
 	{
 		/*
-		 * We've got trouble if FOR UPDATE/SHARE appears inside grouping,
+		 * We've got trouble if FOR UPDATE/SHARE/KEY SHARE appears inside grouping,
 		 * since grouping renders a reference to individual tuple CTIDs
 		 * invalid.  This is also checked at parse time, but that's
 		 * insufficient because of rule substitution, query pullup, etc.
@@ -1937,7 +1937,7 @@ preprocess_rowmarks(PlannerInfo *root)
 	else
 	{
 		/*
-		 * We only need rowmarks for UPDATE, DELETE, or FOR UPDATE/SHARE.
+		 * We only need rowmarks for UPDATE, DELETE, or FOR UPDATE/SHARE/KEY SHARE.
 		 */
 		if (parse->commandType != CMD_UPDATE &&
 			parse->commandType != CMD_DELETE)
@@ -1947,7 +1947,7 @@ preprocess_rowmarks(PlannerInfo *root)
 	/*
 	 * We need to have rowmarks for all base relations except the target. We
 	 * make a bitmapset of all base rels and then remove the items we don't
-	 * need or have FOR UPDATE/SHARE marks for.
+	 * need or have FOR UPDATE/SHARE/KEY SHARE marks for.
 	 */
 	rels = get_base_rel_indexes((Node *) parse->jointree);
 	if (parse->resultRelation)
@@ -1984,10 +1984,20 @@ preprocess_rowmarks(PlannerInfo *root)
 		newrc = makeNode(PlanRowMark);
 		newrc->rti = newrc->prti = rc->rti;
 		newrc->rowmarkId = ++(root->glob->lastRowMarkId);
-		if (rc->forUpdate)
-			newrc->markType = ROW_MARK_EXCLUSIVE;
-		else
-			newrc->markType = ROW_MARK_SHARE;
+		switch (rc->strength)
+		{
+			case LCS_FORUPDATE:
+				newrc->markType = ROW_MARK_EXCLUSIVE;
+				break;
+			case LCS_FORSHARE:
+				newrc->markType = ROW_MARK_SHARE;
+				break;
+			case LCS_FORKEYSHARE:
+				newrc->markType = ROW_MARK_KEYSHARE;
+				break;
+			default:
+				elog(ERROR, "unsupported rowmark type %d", rc->strength);
+		}
 		newrc->noWait = rc->noWait;
 		newrc->isParent = false;
 
diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c
index e4a4e3a..e2ff39f 100644
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@@ -2310,7 +2310,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
 	/* make a clause we can pass down to subqueries to select all rels */
 	allrels = makeNode(LockingClause);
 	allrels->lockedRels = NIL;	/* indicates all rels */
-	allrels->forUpdate = lc->forUpdate;
+	allrels->strength = lc->strength;
 	allrels->noWait = lc->noWait;
 
 	if (lockedRels == NIL)
@@ -2329,12 +2329,12 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
 					if (rte->relkind == RELKIND_FOREIGN_TABLE)
 						break;
 					applyLockingClause(qry, i,
-									   lc->forUpdate, lc->noWait, pushedDown);
+									   lc->strength, lc->noWait, pushedDown);
 					rte->requiredPerms |= ACL_SELECT_FOR_UPDATE;
 					break;
 				case RTE_SUBQUERY:
 					applyLockingClause(qry, i,
-									   lc->forUpdate, lc->noWait, pushedDown);
+									   lc->strength, lc->noWait, pushedDown);
 
 					/*
 					 * FOR UPDATE/SHARE of subquery is propagated to all of
@@ -2384,13 +2384,13 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
 											 rte->eref->aliasname),
 									  parser_errposition(pstate, thisrel->location)));
 							applyLockingClause(qry, i,
-											   lc->forUpdate, lc->noWait,
+											   lc->strength, lc->noWait,
 											   pushedDown);
 							rte->requiredPerms |= ACL_SELECT_FOR_UPDATE;
 							break;
 						case RTE_SUBQUERY:
 							applyLockingClause(qry, i,
-											   lc->forUpdate, lc->noWait,
+											   lc->strength, lc->noWait,
 											   pushedDown);
 							/* see comment above */
 							transformLockingClause(pstate, rte->subquery,
@@ -2443,7 +2443,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
  */
 void
 applyLockingClause(Query *qry, Index rtindex,
-				   bool forUpdate, bool noWait, bool pushedDown)
+				   LockClauseStrength strength, bool noWait, bool pushedDown)
 {
 	RowMarkClause *rc;
 
@@ -2455,10 +2455,10 @@ applyLockingClause(Query *qry, Index rtindex,
 	if ((rc = get_parse_rowmark(qry, rtindex)) != NULL)
 	{
 		/*
-		 * If the same RTE is specified both FOR UPDATE and FOR SHARE, treat
-		 * it as FOR UPDATE.  (Reasonable, since you can't take both a shared
-		 * and exclusive lock at the same time; it'll end up being exclusive
-		 * anyway.)
+		 * If the same RTE is specified for more than one locking strength,
+		 * treat is as the strongest.  (Reasonable, since you can't take both a
+		 * shared and exclusive lock at the same time; it'll end up being
+		 * exclusive anyway.)
 		 *
 		 * We also consider that NOWAIT wins if it's specified both ways. This
 		 * is a bit more debatable but raising an error doesn't seem helpful.
@@ -2467,7 +2467,7 @@ applyLockingClause(Query *qry, Index rtindex,
 		 *
 		 * And of course pushedDown becomes false if any clause is explicit.
 		 */
-		rc->forUpdate |= forUpdate;
+		rc->strength = Max(rc->strength, strength);
 		rc->noWait |= noWait;
 		rc->pushedDown &= pushedDown;
 		return;
@@ -2476,7 +2476,7 @@ applyLockingClause(Query *qry, Index rtindex,
 	/* Make a new RowMarkClause */
 	rc = makeNode(RowMarkClause);
 	rc->rti = rtindex;
-	rc->forUpdate = forUpdate;
+	rc->strength = strength;
 	rc->noWait = noWait;
 	rc->pushedDown = pushedDown;
 	qry->rowMarks = lappend(qry->rowMarks, rc);
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index c135465..1eb9962 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -8786,7 +8786,7 @@ for_locking_item:
 				{
 					LockingClause *n = makeNode(LockingClause);
 					n->lockedRels = $3;
-					n->forUpdate = TRUE;
+					n->strength = LCS_FORUPDATE;
 					n->noWait = $4;
 					$$ = (Node *) n;
 				}
@@ -8794,10 +8794,18 @@ for_locking_item:
 				{
 					LockingClause *n = makeNode(LockingClause);
 					n->lockedRels = $3;
-					n->forUpdate = FALSE;
+					n->strength = LCS_FORSHARE;
 					n->noWait = $4;
 					$$ = (Node *) n;
 				}
+			| FOR KEY SHARE locked_rels_list opt_nowait
+				{
+					LockingClause *n = makeNode(LockingClause);
+					n->lockedRels = $4;
+					n->strength = LCS_FORKEYSHARE;
+					n->noWait = $5;
+					$$ = (Node *) n;
+				}
 		;
 
 locked_rels_list:
diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c
index 3b31108..dc14a0d 100644
--- a/src/backend/rewrite/rewriteHandler.c
+++ b/src/backend/rewrite/rewriteHandler.c
@@ -55,7 +55,7 @@ static void rewriteValuesRTE(RangeTblEntry *rte, Relation target_relation,
 static void rewriteTargetListUD(Query *parsetree, RangeTblEntry *target_rte,
 					Relation target_relation);
 static void markQueryForLocking(Query *qry, Node *jtnode,
-					bool forUpdate, bool noWait, bool pushedDown);
+					LockClauseStrength strength, bool noWait, bool pushedDown);
 static List *matchLocks(CmdType event, RuleLock *rulelocks,
 		   int varno, Query *parsetree);
 static Query *fireRIRrules(Query *parsetree, List *activeRIRs,
@@ -1401,8 +1401,8 @@ ApplyRetrieveRule(Query *parsetree,
 	rte->modifiedCols = NULL;
 
 	/*
-	 * If FOR UPDATE/SHARE of view, mark all the contained tables as implicit
-	 * FOR UPDATE/SHARE, the same as the parser would have done if the view's
+	 * If FOR UPDATE/SHARE/KEY SHARE of view, mark all the contained tables as implicit
+	 * FOR UPDATE/SHARE/KEY SHARE, the same as the parser would have done if the view's
 	 * subquery had been written out explicitly.
 	 *
 	 * Note: we don't consider forUpdatePushedDown here; such marks will be
@@ -1410,13 +1410,13 @@ ApplyRetrieveRule(Query *parsetree,
 	 */
 	if (rc != NULL)
 		markQueryForLocking(rule_action, (Node *) rule_action->jointree,
-							rc->forUpdate, rc->noWait, true);
+							rc->strength, rc->noWait, true);
 
 	return parsetree;
 }
 
 /*
- * Recursively mark all relations used by a view as FOR UPDATE/SHARE.
+ * Recursively mark all relations used by a view as FOR UPDATE/SHARE/KEY SHARE.
  *
  * This may generate an invalid query, eg if some sub-query uses an
  * aggregate.  We leave it to the planner to detect that.
@@ -1428,7 +1428,7 @@ ApplyRetrieveRule(Query *parsetree,
  */
 static void
 markQueryForLocking(Query *qry, Node *jtnode,
-					bool forUpdate, bool noWait, bool pushedDown)
+					LockClauseStrength strength, bool noWait, bool pushedDown)
 {
 	if (jtnode == NULL)
 		return;
@@ -1442,16 +1442,16 @@ markQueryForLocking(Query *qry, Node *jtnode,
 			/* ignore foreign tables */
 			if (rte->relkind != RELKIND_FOREIGN_TABLE)
 			{
-				applyLockingClause(qry, rti, forUpdate, noWait, pushedDown);
+				applyLockingClause(qry, rti, strength, noWait, pushedDown);
 				rte->requiredPerms |= ACL_SELECT_FOR_UPDATE;
 			}
 		}
 		else if (rte->rtekind == RTE_SUBQUERY)
 		{
-			applyLockingClause(qry, rti, forUpdate, noWait, pushedDown);
-			/* FOR UPDATE/SHARE of subquery is propagated to subquery's rels */
+			applyLockingClause(qry, rti, strength, noWait, pushedDown);
+			/* FOR UPDATE/SHARE/KEY SHARE of subquery is propagated to subquery's rels */
 			markQueryForLocking(rte->subquery, (Node *) rte->subquery->jointree,
-								forUpdate, noWait, true);
+								strength, noWait, true);
 		}
 		/* other RTE types are unaffected by FOR UPDATE */
 	}
@@ -1461,14 +1461,14 @@ markQueryForLocking(Query *qry, Node *jtnode,
 		ListCell   *l;
 
 		foreach(l, f->fromlist)
-			markQueryForLocking(qry, lfirst(l), forUpdate, noWait, pushedDown);
+			markQueryForLocking(qry, lfirst(l), strength, noWait, pushedDown);
 	}
 	else if (IsA(jtnode, JoinExpr))
 	{
 		JoinExpr   *j = (JoinExpr *) jtnode;
 
-		markQueryForLocking(qry, j->larg, forUpdate, noWait, pushedDown);
-		markQueryForLocking(qry, j->rarg, forUpdate, noWait, pushedDown);
+		markQueryForLocking(qry, j->larg, strength, noWait, pushedDown);
+		markQueryForLocking(qry, j->rarg, strength, noWait, pushedDown);
 	}
 	else
 		elog(ERROR, "unrecognized node type: %d",
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 345f6f5..45b7c7b 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -3869,9 +3869,10 @@ CheckForSerializableConflictOut(bool visible, Relation relation,
 		case HEAPTUPLE_RECENTLY_DEAD:
 			if (!visible)
 				return;
-			xid = HeapTupleHeaderGetXmax(tuple->t_data);
+			xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
 			break;
 		case HEAPTUPLE_DELETE_IN_PROGRESS:
+			Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
 			xid = HeapTupleHeaderGetXmax(tuple->t_data);
 			break;
 		case HEAPTUPLE_INSERT_IN_PROGRESS:
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 5b06333..65f629b 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -130,7 +130,7 @@ CommandIsReadOnly(Node *parsetree)
 				if (stmt->intoClause != NULL)
 					return false;		/* SELECT INTO */
 				else if (stmt->rowMarks != NIL)
-					return false;		/* SELECT FOR UPDATE/SHARE */
+					return false;		/* SELECT FOR UPDATE/SHARE/KEY SHARE */
 				else if (stmt->hasModifyingCTE)
 					return false;		/* data-modifying CTE */
 				else
@@ -2147,10 +2147,21 @@ CreateCommandTag(Node *parsetree)
 						else if (stmt->rowMarks != NIL)
 						{
 							/* not 100% but probably close enough */
-							if (((PlanRowMark *) linitial(stmt->rowMarks))->markType == ROW_MARK_EXCLUSIVE)
-								tag = "SELECT FOR UPDATE";
-							else
-								tag = "SELECT FOR SHARE";
+							switch (((RowMarkClause *) linitial(stmt->rowMarks))->strength)
+							{
+								case LCS_FORUPDATE:
+									tag = "SELECT FOR UPDATE";
+									break;
+								case LCS_FORSHARE:
+									tag = "SELECT FOR SHARE";
+									break;
+								case LCS_FORKEYSHARE:
+									tag = "SELECT FOR KEY SHARE";
+									break;
+								default:
+									tag =  "???";
+									break;
+							}
 						}
 						else
 							tag = "SELECT";
@@ -2197,10 +2208,21 @@ CreateCommandTag(Node *parsetree)
 						else if (stmt->rowMarks != NIL)
 						{
 							/* not 100% but probably close enough */
-							if (((RowMarkClause *) linitial(stmt->rowMarks))->forUpdate)
-								tag = "SELECT FOR UPDATE";
-							else
-								tag = "SELECT FOR SHARE";
+							switch (((RowMarkClause *) linitial(stmt->rowMarks))->strength)
+							{
+								case LCS_FORUPDATE:
+									tag = "SELECT FOR UPDATE";
+									break;
+								case LCS_FORSHARE:
+									tag = "SELECT FOR SHARE";
+									break;
+								case LCS_FORKEYSHARE:
+									tag = "SELECT FOR KEY SHARE";
+									break;
+								default:
+									tag =  "???";
+									break;
+							}
 						}
 						else
 							tag = "SELECT";
diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c
index 522a540..f4a4456 100644
--- a/src/backend/utils/adt/ri_triggers.c
+++ b/src/backend/utils/adt/ri_triggers.c
@@ -308,7 +308,7 @@ RI_FKey_check(PG_FUNCTION_ARGS)
 	 * Get the relation descriptors of the FK and PK tables.
 	 *
 	 * pk_rel is opened in RowShareLock mode since that's what our eventual
-	 * SELECT FOR SHARE will get on it.
+	 * SELECT FOR KEY SHARE will get on it.
 	 */
 	fk_rel = trigdata->tg_relation;
 	pk_rel = heap_open(riinfo.pk_relid, RowShareLock);
@@ -338,12 +338,12 @@ RI_FKey_check(PG_FUNCTION_ARGS)
 
 			/* ---------
 			 * The query string built is
-			 *	SELECT 1 FROM ONLY <pktable>
+			 *	SELECT 1 FROM ONLY <pktable> x FOR KEY SHARE OF x
 			 * ----------
 			 */
 			quoteRelationName(pkrelname, pk_rel);
 			snprintf(querystr, sizeof(querystr),
-					 "SELECT 1 FROM ONLY %s x FOR SHARE OF x",
+					 "SELECT 1 FROM ONLY %s x FOR KEY SHARE OF x",
 					 pkrelname);
 
 			/* Prepare and save the plan */
@@ -463,7 +463,8 @@ RI_FKey_check(PG_FUNCTION_ARGS)
 
 		/* ----------
 		 * The query string built is
-		 *	SELECT 1 FROM ONLY <pktable> WHERE pkatt1 = $1 [AND ...] FOR SHARE
+		 *	SELECT 1 FROM ONLY <pktable> x WHERE pkatt1 = $1 [AND ...]
+		 *	       FOR KEY SHARE OF x
 		 * The type id's for the $ parameters are those of the
 		 * corresponding FK attributes.
 		 * ----------
@@ -487,7 +488,7 @@ RI_FKey_check(PG_FUNCTION_ARGS)
 			querysep = "AND";
 			queryoids[i] = fk_type;
 		}
-		appendStringInfo(&querybuf, " FOR SHARE OF x");
+		appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
 		/* Prepare and save the plan */
 		qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids,
@@ -625,7 +626,8 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel,
 
 		/* ----------
 		 * The query string built is
-		 *	SELECT 1 FROM ONLY <pktable> WHERE pkatt1 = $1 [AND ...] FOR SHARE
+		 *	SELECT 1 FROM ONLY <pktable> x WHERE pkatt1 = $1 [AND ...]
+		 *	       FOR KEY SHARE OF x
 		 * The type id's for the $ parameters are those of the
 		 * PK attributes themselves.
 		 * ----------
@@ -648,7 +650,7 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel,
 			querysep = "AND";
 			queryoids[i] = pk_type;
 		}
-		appendStringInfo(&querybuf, " FOR SHARE OF x");
+		appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
 		/* Prepare and save the plan */
 		qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids,
@@ -712,7 +714,7 @@ RI_FKey_noaction_del(PG_FUNCTION_ARGS)
 	 * Get the relation descriptors of the FK and PK tables and the old tuple.
 	 *
 	 * fk_rel is opened in RowShareLock mode since that's what our eventual
-	 * SELECT FOR SHARE will get on it.
+	 * SELECT FOR KEY SHARE will get on it.
 	 */
 	fk_rel = heap_open(riinfo.fk_relid, RowShareLock);
 	pk_rel = trigdata->tg_relation;
@@ -780,7 +782,8 @@ RI_FKey_noaction_del(PG_FUNCTION_ARGS)
 
 				/* ----------
 				 * The query string built is
-				 *	SELECT 1 FROM ONLY <fktable> WHERE $1 = fkatt1 [AND ...]
+				 *	SELECT 1 FROM ONLY <fktable> x WHERE $1 = fkatt1 [AND ...]
+				 *	       FOR KEY SHARE OF x
 				 * The type id's for the $ parameters are those of the
 				 * corresponding PK attributes.
 				 * ----------
@@ -805,7 +808,7 @@ RI_FKey_noaction_del(PG_FUNCTION_ARGS)
 					querysep = "AND";
 					queryoids[i] = pk_type;
 				}
-				appendStringInfo(&querybuf, " FOR SHARE OF x");
+				appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
 				/* Prepare and save the plan */
 				qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids,
@@ -890,7 +893,7 @@ RI_FKey_noaction_upd(PG_FUNCTION_ARGS)
 	 * old tuple.
 	 *
 	 * fk_rel is opened in RowShareLock mode since that's what our eventual
-	 * SELECT FOR SHARE will get on it.
+	 * SELECT FOR KEY SHARE will get on it.
 	 */
 	fk_rel = heap_open(riinfo.fk_relid, RowShareLock);
 	pk_rel = trigdata->tg_relation;
@@ -993,7 +996,7 @@ RI_FKey_noaction_upd(PG_FUNCTION_ARGS)
 					querysep = "AND";
 					queryoids[i] = pk_type;
 				}
-				appendStringInfo(&querybuf, " FOR SHARE OF x");
+				appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
 				/* Prepare and save the plan */
 				qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids,
@@ -1431,7 +1434,7 @@ RI_FKey_restrict_del(PG_FUNCTION_ARGS)
 	 * Get the relation descriptors of the FK and PK tables and the old tuple.
 	 *
 	 * fk_rel is opened in RowShareLock mode since that's what our eventual
-	 * SELECT FOR SHARE will get on it.
+	 * SELECT FOR KEY SHARE will get on it.
 	 */
 	fk_rel = heap_open(riinfo.fk_relid, RowShareLock);
 	pk_rel = trigdata->tg_relation;
@@ -1489,7 +1492,8 @@ RI_FKey_restrict_del(PG_FUNCTION_ARGS)
 
 				/* ----------
 				 * The query string built is
-				 *	SELECT 1 FROM ONLY <fktable> WHERE $1 = fkatt1 [AND ...]
+				 *	SELECT 1 FROM ONLY <fktable> x WHERE $1 = fkatt1 [AND ...]
+				 *	       FOR KEY SHARE OF x
 				 * The type id's for the $ parameters are those of the
 				 * corresponding PK attributes.
 				 * ----------
@@ -1514,7 +1518,7 @@ RI_FKey_restrict_del(PG_FUNCTION_ARGS)
 					querysep = "AND";
 					queryoids[i] = pk_type;
 				}
-				appendStringInfo(&querybuf, " FOR SHARE OF x");
+				appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
 				/* Prepare and save the plan */
 				qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids,
@@ -1604,7 +1608,7 @@ RI_FKey_restrict_upd(PG_FUNCTION_ARGS)
 	 * old tuple.
 	 *
 	 * fk_rel is opened in RowShareLock mode since that's what our eventual
-	 * SELECT FOR SHARE will get on it.
+	 * SELECT FOR KEY SHARE will get on it.
 	 */
 	fk_rel = heap_open(riinfo.fk_relid, RowShareLock);
 	pk_rel = trigdata->tg_relation;
@@ -1672,7 +1676,8 @@ RI_FKey_restrict_upd(PG_FUNCTION_ARGS)
 
 				/* ----------
 				 * The query string built is
-				 *	SELECT 1 FROM ONLY <fktable> WHERE $1 = fkatt1 [AND ...]
+				 *	SELECT 1 FROM ONLY <fktable> x WHERE $1 = fkatt1 [AND ...]
+				 *	       FOR KEY SHARE OF x
 				 * The type id's for the $ parameters are those of the
 				 * corresponding PK attributes.
 				 * ----------
@@ -1697,7 +1702,7 @@ RI_FKey_restrict_upd(PG_FUNCTION_ARGS)
 					querysep = "AND";
 					queryoids[i] = pk_type;
 				}
-				appendStringInfo(&querybuf, " FOR SHARE OF x");
+				appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
 				/* Prepare and save the plan */
 				qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids,
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index 75923a6..fa1e863 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -2863,7 +2863,7 @@ get_select_query_def(Query *query, deparse_context *context,
 			get_rule_expr(query->limitCount, context, false);
 	}
 
-	/* Add FOR UPDATE/SHARE clauses if present */
+	/* Add FOR UPDATE/SHARE/KEY SHARE clauses if present */
 	if (query->hasForUpdate)
 	{
 		foreach(l, query->rowMarks)
@@ -2875,12 +2875,24 @@ get_select_query_def(Query *query, deparse_context *context,
 			if (rc->pushedDown)
 				continue;
 
-			if (rc->forUpdate)
-				appendContextKeyword(context, " FOR UPDATE",
-									 -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
-			else
-				appendContextKeyword(context, " FOR SHARE",
-									 -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+			switch (rc->strength)
+			{
+				case LCS_FORKEYSHARE:
+					appendContextKeyword(context, " FOR KEY SHARE",
+										 -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+					break;
+				case LCS_FORSHARE:
+					appendContextKeyword(context, " FOR SHARE",
+										 -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+					break;
+				case LCS_FORUPDATE:
+					appendContextKeyword(context, " FOR UPDATE",
+										 -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+					break;
+				default:
+					elog(ERROR, "unrecognized row locking clause %d", rc->strength);
+			}
+
 			appendStringInfo(buf, " OF %s",
 							 quote_identifier(rte->eref->aliasname));
 			if (rc->noWait)
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 603e4c1..0e8ef6f 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -3624,6 +3624,9 @@ RelationGetIndexPredicate(Relation relation)
  * simple index keys, but attributes used in expressions and partial-index
  * predicates.)
  *
+ * If "keyAttrs" is true, only attributes that can be referenced by foreign
+ * keys are considered.
+ *
  * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that
  * we can include system attributes (e.g., OID) in the bitmap representation.
  *
@@ -3635,16 +3638,17 @@ RelationGetIndexPredicate(Relation relation)
  * be bms_free'd when not needed anymore.
  */
 Bitmapset *
-RelationGetIndexAttrBitmap(Relation relation)
+RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs)
 {
 	Bitmapset  *indexattrs;
+	Bitmapset  *uindexattrs;
 	List	   *indexoidlist;
 	ListCell   *l;
 	MemoryContext oldcxt;
 
 	/* Quick exit if we already computed the result. */
 	if (relation->rd_indexattr != NULL)
-		return bms_copy(relation->rd_indexattr);
+		return bms_copy(keyAttrs ? relation->rd_keyattr : relation->rd_indexattr);
 
 	/* Fast path if definitely no indexes */
 	if (!RelationGetForm(relation)->relhasindex)
@@ -3663,26 +3667,38 @@ RelationGetIndexAttrBitmap(Relation relation)
 	 * For each index, add referenced attributes to indexattrs.
 	 */
 	indexattrs = NULL;
+	uindexattrs = NULL;
 	foreach(l, indexoidlist)
 	{
 		Oid			indexOid = lfirst_oid(l);
 		Relation	indexDesc;
 		IndexInfo  *indexInfo;
 		int			i;
+		bool		isKey;
 
 		indexDesc = index_open(indexOid, AccessShareLock);
 
 		/* Extract index key information from the index's pg_index row */
 		indexInfo = BuildIndexInfo(indexDesc);
 
+		/* Can this index be referenced by a foreign key? */
+		isKey = indexInfo->ii_Unique &&
+				indexInfo->ii_Expressions == NIL &&
+				indexInfo->ii_Predicate == NIL;
+
 		/* Collect simple attribute references */
 		for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
 		{
 			int			attrnum = indexInfo->ii_KeyAttrNumbers[i];
 
 			if (attrnum != 0)
+			{
 				indexattrs = bms_add_member(indexattrs,
 							   attrnum - FirstLowInvalidHeapAttributeNumber);
+				if (isKey)
+					uindexattrs = bms_add_member(uindexattrs,
+												 attrnum - FirstLowInvalidHeapAttributeNumber);
+			}
 		}
 
 		/* Collect all attributes used in expressions, too */
@@ -3699,10 +3715,11 @@ RelationGetIndexAttrBitmap(Relation relation)
 	/* Now save a copy of the bitmap in the relcache entry. */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 	relation->rd_indexattr = bms_copy(indexattrs);
+	relation->rd_keyattr = bms_copy(uindexattrs);
 	MemoryContextSwitchTo(oldcxt);
 
 	/* We return our original working copy for caller to play with */
-	return indexattrs;
+	return keyAttrs ? uindexattrs : indexattrs;
 }
 
 /*
diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c
index d9b37b2..560a53d 100644
--- a/src/backend/utils/time/combocid.c
+++ b/src/backend/utils/time/combocid.c
@@ -118,9 +118,11 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup)
 {
 	CommandId	cid = HeapTupleHeaderGetRawCommandId(tup);
 
+	Assert(!(tup->t_infomask & HEAP_MOVED));
 	/* We do not store cmax when locking a tuple */
-	Assert(!(tup->t_infomask & (HEAP_MOVED | HEAP_IS_LOCKED)));
-	Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tup)));
+	Assert(!HeapTupleHeaderIsLocked(tup));
+	Assert((tup->t_infomask & HEAP_XMAX_IS_MULTI) ||
+		   TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tup)));
 
 	if (tup->t_infomask & HEAP_COMBOCID)
 		return GetRealCmax(cid);
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index 1c4b74d..1e2d3fa 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -213,10 +213,23 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
-			if (tuple->t_infomask & HEAP_IS_LOCKED)		/* not deleter */
+			if (HeapTupleHeaderIsLocked(tuple))		/* not deleter */
 				return true;
 
-			Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+			if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+			{
+				TransactionId	xmax;
+
+				xmax = HeapTupleGetUpdateXid(tuple);
+				if (!TransactionIdIsValid(xmax))
+					return true;
+
+				/* updating subtransaction must have aborted */
+				if (!TransactionIdIsCurrentTransactionId(xmax))
+					return true;
+				else
+					return false;
+			}
 
 			if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 			{
@@ -249,21 +262,34 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
 	if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
+		if (HeapTupleHeaderIsLocked(tuple))
 			return true;
 		return false;			/* updated by other */
 	}
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		/* MultiXacts are currently only allowed to lock tuples */
-		Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+		TransactionId	xmax;
+
+		if (HeapTupleHeaderIsLocked(tuple))
+			return true;
+
+		xmax = HeapTupleGetUpdateXid(tuple);
+		if (TransactionIdIsCurrentTransactionId(xmax))
+			return false;
+		if (TransactionIdIsInProgress(xmax))
+			return true;
+		if (TransactionIdDidCommit(xmax))
+		{
+			SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax);
+			return false;
+		}
 		return true;
 	}
 
 	if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
+		if (HeapTupleHeaderIsLocked(tuple))
 			return true;
 		return false;
 	}
@@ -281,7 +307,7 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
 	/* xmax transaction committed */
 
-	if (tuple->t_infomask & HEAP_IS_LOCKED)
+	if (HeapTupleHeaderIsLocked(tuple))
 	{
 		SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
 					InvalidTransactionId);
@@ -389,10 +415,23 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
-			if (tuple->t_infomask & HEAP_IS_LOCKED)		/* not deleter */
+			if (HeapTupleHeaderIsLocked(tuple))		/* not deleter */
 				return true;
 
-			Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+			if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+			{
+				TransactionId	xmax;
+
+				xmax = HeapTupleGetUpdateXid(tuple);
+				if (!TransactionIdIsValid(xmax))
+					return true;
+
+				/* updating subtransaction must have aborted */
+				if (!TransactionIdIsCurrentTransactionId(xmax))
+					return true;
+				else
+					return false;
+			}
 
 			if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 			{
@@ -428,21 +467,39 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
 	if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
+		if (HeapTupleHeaderIsLocked(tuple))
 			return true;
 		return false;
 	}
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		/* MultiXacts are currently only allowed to lock tuples */
-		Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+		TransactionId	xmax;
+
+		if (HeapTupleHeaderIsLocked(tuple))
+			return true;
+
+		xmax = HeapTupleGetUpdateXid(tuple);
+		if (TransactionIdIsCurrentTransactionId(xmax))
+		{
+			if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false))
+				return true;	/* deleted after scan started */
+			else
+				return false;	/* deleted before scan started */
+		}
+		if (TransactionIdIsInProgress(xmax))
+			return true;
+		if (TransactionIdDidCommit(xmax))
+		{
+			SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax);
+			return false;
+		}
 		return true;
 	}
 
 	if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
+		if (HeapTupleHeaderIsLocked(tuple))
 			return true;
 		if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false))
 			return true;		/* deleted after scan started */
@@ -463,7 +520,7 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
 	/* xmax transaction committed */
 
-	if (tuple->t_infomask & HEAP_IS_LOCKED)
+	if (HeapTupleHeaderIsLocked(tuple))
 	{
 		SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
 					InvalidTransactionId);
@@ -636,10 +693,24 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return HeapTupleMayBeUpdated;
 
-			if (tuple->t_infomask & HEAP_IS_LOCKED)		/* not deleter */
+			if (HeapTupleHeaderIsLocked(tuple))		 /* not deleter */
 				return HeapTupleMayBeUpdated;
 
-			Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+			if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+			{
+				TransactionId	xmax;
+
+				xmax = HeapTupleGetUpdateXid(tuple);
+				if (!TransactionIdIsValid(xmax))
+					return HeapTupleMayBeUpdated;
+
+				/* updating subtransaction must have aborted */
+				if (!TransactionIdIsCurrentTransactionId(xmax))
+					return HeapTupleMayBeUpdated;
+				else
+					return HeapTupleSelfUpdated;
+				/* FIXME -- what do we need to do with the Cmax here? */
+			}
 
 			if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 			{
@@ -675,27 +746,49 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 
 	if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
+		if (HeapTupleHeaderIsLocked(tuple))
 			return HeapTupleMayBeUpdated;
+		/* XXX might have XMAX_IS_MULTI ... */
 		return HeapTupleUpdated;	/* updated by other */
 	}
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		/* MultiXacts are currently only allowed to lock tuples */
-		Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+		TransactionId	xmax;
+
+		if (HeapTupleHeaderIsLocked(tuple))
+		{
+			if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
+				return HeapTupleBeingUpdated;
+			return HeapTupleMayBeUpdated;
+		}
+
+		xmax = HeapTupleGetUpdateXid(tuple);
+
+		if (TransactionIdIsCurrentTransactionId(xmax))
+		{
+			if (HeapTupleHeaderGetCmax(tuple) >= curcid)
+				return HeapTupleSelfUpdated;		/* updated after scan started */
+			else
+				return HeapTupleInvisible;	/* updated before scan started */
+		}
 
 		if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
 			return HeapTupleBeingUpdated;
-		SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
-					InvalidTransactionId);
+
+		if (TransactionIdDidCommit(xmax))
+		{
+			SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax);
+			return HeapTupleUpdated;
+		}
+		/* it must have aborted or crashed */
 		return HeapTupleMayBeUpdated;
 	}
 
 	if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
-			return HeapTupleMayBeUpdated;
+		if (HeapTupleHeaderIsLocked(tuple))
+			return HeapTupleMayBeUpdated;	/* FIXME might need rethinking */
 		if (HeapTupleHeaderGetCmax(tuple) >= curcid)
 			return HeapTupleSelfUpdated;		/* updated after scan started */
 		else
@@ -715,7 +808,7 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 
 	/* xmax transaction committed */
 
-	if (tuple->t_infomask & HEAP_IS_LOCKED)
+	if (HeapTupleHeaderIsLocked(tuple))
 	{
 		SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
 					InvalidTransactionId);
@@ -802,10 +895,23 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
-			if (tuple->t_infomask & HEAP_IS_LOCKED)		/* not deleter */
+			if (HeapTupleHeaderIsLocked(tuple))		 /* not deleter */
 				return true;
 
-			Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+			if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+			{
+				TransactionId	xmax;
+
+				xmax = HeapTupleGetUpdateXid(tuple);
+				if (!TransactionIdIsValid(xmax))
+					return true;
+
+				/* updating subtransaction must have aborted */
+				if (!TransactionIdIsCurrentTransactionId(xmax))
+					return true;
+				else
+					return false;
+			}
 
 			if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 			{
@@ -842,21 +948,37 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 
 	if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
+		if (HeapTupleHeaderIsLocked(tuple))
 			return true;
 		return false;			/* updated by other */
 	}
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		/* MultiXacts are currently only allowed to lock tuples */
-		Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+		TransactionId	xmax;
+
+		if (HeapTupleHeaderIsLocked(tuple))
+			return true;
+
+		xmax = HeapTupleGetUpdateXid(tuple);
+		if (TransactionIdIsCurrentTransactionId(xmax))
+			return false;
+		if (TransactionIdIsInProgress(xmax))
+		{
+			snapshot->xmax = xmax;
+			return true;
+		}
+		if (TransactionIdDidCommit(xmax))
+		{
+			SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax);
+			return false;
+		}
 		return true;
 	}
 
 	if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 	{
-		if (tuple->t_infomask & HEAP_IS_LOCKED)
+		if (HeapTupleHeaderIsLocked(tuple))
 			return true;
 		return false;
 	}
@@ -877,7 +999,7 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 
 	/* xmax transaction committed */
 
-	if (tuple->t_infomask & HEAP_IS_LOCKED)
+	if (HeapTupleHeaderIsLocked(tuple))
 	{
 		SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
 					InvalidTransactionId);
@@ -966,10 +1088,25 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
-			if (tuple->t_infomask & HEAP_IS_LOCKED)		/* not deleter */
+			if (HeapTupleHeaderIsLocked(tuple))		 /* not deleter */
 				return true;
 
-			Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+			if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+			{
+				TransactionId	xmax;
+
+				xmax = HeapTupleGetUpdateXid(tuple);
+				if (!TransactionIdIsValid(xmax))
+					return true;
+
+				/* updating subtransaction must have aborted */
+				if (!TransactionIdIsCurrentTransactionId(xmax))
+					return true;
+				else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
+					return true;	/* updated after scan started */
+				else
+					return false;	/* updated before scan started */
+			}
 
 			if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
 			{
@@ -1008,13 +1145,34 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 	if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid or aborted */
 		return true;
 
-	if (tuple->t_infomask & HEAP_IS_LOCKED)
+	if (HeapTupleHeaderIsLocked(tuple))
 		return true;
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		/* MultiXacts are currently only allowed to lock tuples */
-		Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+		TransactionId	xmax;
+
+		if (HeapTupleHeaderIsLocked(tuple))
+			return true;
+
+		xmax = HeapTupleGetUpdateXid(tuple);
+		if (TransactionIdIsCurrentTransactionId(xmax))
+		{
+			if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
+				return true;	/* deleted after scan started */
+			else
+				return false;	/* deleted before scan started */
+		}
+		if (TransactionIdIsInProgress(xmax))
+			return true;
+		if (TransactionIdDidCommit(xmax))
+		{
+			SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax);
+			/* updating transaction committed, but when? */
+			if (XidInMVCCSnapshot(xmax, snapshot))
+				return true;	/* treat as still in progress */
+			return false;
+		}
 		return true;
 	}
 
@@ -1121,8 +1279,9 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 		{
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return HEAPTUPLE_INSERT_IN_PROGRESS;
-			if (tuple->t_infomask & HEAP_IS_LOCKED)
+			if (HeapTupleHeaderIsLocked(tuple))
 				return HEAPTUPLE_INSERT_IN_PROGRESS;
+			/* FIXME -- probably need something here */
 			/* inserted and then deleted by same xact */
 			return HEAPTUPLE_DELETE_IN_PROGRESS;
 		}
@@ -1153,7 +1312,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 	if (tuple->t_infomask & HEAP_XMAX_INVALID)
 		return HEAPTUPLE_LIVE;
 
-	if (tuple->t_infomask & HEAP_IS_LOCKED)
+	if (HeapTupleHeaderIsLocked(tuple))
 	{
 		/*
 		 * "Deleting" xact really only locked it, so the tuple is live in any
@@ -1177,6 +1336,10 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 			}
 
 			/*
+			 * FIXME -- if the multixact is gone, we should replace it with
+			 * the plain updating Xid and remove the IS_MULTI bit.
+			 */
+			/*
 			 * We don't really care whether xmax did commit, abort or crash.
 			 * We know that xmax did lock the tuple, but it did not and will
 			 * never actually update it.
@@ -1184,14 +1347,44 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 			SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
 						InvalidTransactionId);
 		}
+
 		return HEAPTUPLE_LIVE;
 	}
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		/* MultiXacts are currently only allowed to lock tuples */
-		Assert(tuple->t_infomask & HEAP_IS_LOCKED);
-		return HEAPTUPLE_LIVE;
+		TransactionId xmax;
+
+		if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
+			return HEAPTUPLE_LIVE;
+
+		xmax = HeapTupleGetUpdateXid(tuple);
+		if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
+		{
+			Assert(!TransactionIdIsInProgress(xmax));
+			Assert(!TransactionIdIsCurrentTransactionId(xmax));
+			if (TransactionIdDidCommit(xmax))
+				SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax);
+			else
+			{
+				/*
+				 * Not in Progress, Not Committed, so either Aborted or crashed
+				 */
+				SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							InvalidTransactionId);
+				return HEAPTUPLE_LIVE;
+			}
+		}
+
+		/*
+		 * Deleter committed, but perhaps it was recent enough that some open
+		 * transactions could still see the tuple.
+		 */
+		if (!TransactionIdPrecedes(xmax, OldestXmin))
+			return HEAPTUPLE_RECENTLY_DEAD;
+
+		/* Otherwise, it's dead and removable */
+		return HEAPTUPLE_DEAD;
 	}
 
 	if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c
index 5b8ae88..4fdd6de 100644
--- a/src/bin/pg_resetxlog/pg_resetxlog.c
+++ b/src/bin/pg_resetxlog/pg_resetxlog.c
@@ -87,6 +87,7 @@ main(int argc, char *argv[])
 	Oid			set_oid = 0;
 	MultiXactId set_mxid = 0;
 	MultiXactOffset set_mxoff = (MultiXactOffset) -1;
+	TransactionId set_mxfreeze = FrozenTransactionId;
 	uint32		minXlogTli = 0,
 				minXlogId = 0,
 				minXlogSeg = 0;
@@ -116,7 +117,7 @@ main(int argc, char *argv[])
 	}
 
 
-	while ((c = getopt(argc, argv, "fl:m:no:O:x:e:")) != -1)
+	while ((c = getopt(argc, argv, "fl:m:no:O:x:e:z:")) != -1)
 	{
 		switch (c)
 		{
@@ -203,6 +204,23 @@ main(int argc, char *argv[])
 				}
 				break;
 
+			case 'z':
+				set_mxfreeze = strtoul(optarg, &endptr, 0);
+				if (endptr == optarg || *endptr != '\0')
+				{
+					fprintf(stderr, _("%s: invalid argument for option -z\n"), progname);
+					fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+					exit(1);
+				}
+				/* InvalidTransactionId is allowed here */
+				if (set_mxfreeze == FrozenTransactionId ||
+					set_mxfreeze == BootstrapTransactionId)
+				{
+					fprintf(stderr, _("%s: multitransaction freezeXid (-z) must not be 1 or 2\n"), progname);
+					exit(1);
+				}
+				break;
+
 			case 'l':
 				minXlogTli = strtoul(optarg, &endptr, 0);
 				if (endptr == optarg || *endptr != ',')
@@ -332,6 +350,11 @@ main(int argc, char *argv[])
 	if (set_mxoff != -1)
 		ControlFile.checkPointCopy.nextMultiOffset = set_mxoff;
 
+	/*
+	if (set_mxfreeze != -1)
+		ControlFile.checkPointCopy.mxactFreezeXid = set_mxfreeze;
+		*/
+
 	if (minXlogTli > ControlFile.checkPointCopy.ThisTimeLineID)
 		ControlFile.checkPointCopy.ThisTimeLineID = minXlogTli;
 
@@ -578,6 +601,10 @@ PrintControlValues(bool guessed)
 		   ControlFile.checkPointCopy.nextMulti);
 	printf(_("Latest checkpoint's NextMultiOffset:  %u\n"),
 		   ControlFile.checkPointCopy.nextMultiOffset);
+	/*
+	printf(_("Latest checkpoint's MultiXact freezeXid: %u\n"),
+		   ControlFile.checkPointCopy.mxactFreezeXid);
+		   */
 	printf(_("Latest checkpoint's oldestXID:        %u\n"),
 		   ControlFile.checkPointCopy.oldestXid);
 	printf(_("Latest checkpoint's oldestXID's DB:   %u\n"),
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 776ea5c..363c3bd 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -29,10 +29,22 @@
 
 typedef struct BulkInsertStateData *BulkInsertState;
 
+/*
+ * Possible lock modes for a tuple.
+ */
 typedef enum
 {
-	LockTupleShared,
-	LockTupleExclusive
+	/* SELECT FOR KEY SHARE */
+	LockTupleKeyShare,
+	/* SELECT FOR SHARE */
+	LockTupleShare,
+	/*
+	 * SELECT FOR UPDATE, and also plain UPDATE when the "key" columns are
+	 * not modified
+	 */
+	LockTupleUpdate,
+	/* other UPDATEs, and DELETE */
+	LockTupleKeyUpdate
 } LockTupleMode;
 
 
diff --git a/src/include/access/htup.h b/src/include/access/htup.h
index 966e2d0..c84cd7f 100644
--- a/src/include/access/htup.h
+++ b/src/include/access/htup.h
@@ -164,12 +164,15 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 #define HEAP_HASVARWIDTH		0x0002	/* has variable-width attribute(s) */
 #define HEAP_HASEXTERNAL		0x0004	/* has external stored attribute(s) */
 #define HEAP_HASOID				0x0008	/* has an object-id field */
-/* bit 0x0010 is available */
+#define HEAP_XMAX_KEYSHR_LOCK	0x0010	/* xmax is a key-shared locker */
 #define HEAP_COMBOCID			0x0020	/* t_cid is a combo cid */
 #define HEAP_XMAX_EXCL_LOCK		0x0040	/* xmax is exclusive locker */
-#define HEAP_XMAX_SHARED_LOCK	0x0080	/* xmax is shared locker */
-/* if either LOCK bit is set, xmax hasn't deleted the tuple, only locked it */
-#define HEAP_IS_LOCKED	(HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK)
+#define HEAP_XMAX_IS_NOT_UPDATE	0x0080	/* xmax, if valid, is only a locker.
+										 * Note this is not set unless
+										 * XMAX_IS_MULTI is also set. */
+
+#define HEAP_LOCK_BITS	(HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_IS_NOT_UPDATE | \
+						 HEAP_XMAX_KEYSHR_LOCK)
 #define HEAP_XMIN_COMMITTED		0x0100	/* t_xmin committed */
 #define HEAP_XMIN_INVALID		0x0200	/* t_xmin invalid/aborted */
 #define HEAP_XMAX_COMMITTED		0x0400	/* t_xmax committed */
@@ -187,14 +190,30 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 #define HEAP_XACT_MASK			0xFFE0	/* visibility-related bits */
 
 /*
+ * A tuple is only locked (i.e. not updated by its Xmax) if it the Xmax is not
+ * a multixact and it has either the EXCL_LOCK or KEYSHR_LOCK bits set, or if
+ * the xmax is a multi that doesn't contain an update.
+ *
+ * Beware of multiple evaluation of arguments.
+ */
+#define HeapTupleHeaderInfomaskIsLocked(infomask) \
+	((!((infomask) & HEAP_XMAX_IS_MULTI) && \
+	  (infomask) & (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)) || \
+	 (((infomask) & HEAP_XMAX_IS_MULTI) && ((infomask) & HEAP_XMAX_IS_NOT_UPDATE)))
+
+#define HeapTupleHeaderIsLocked(tup) \
+	HeapTupleHeaderInfomaskIsLocked((tup)->t_infomask)
+
+/*
  * information stored in t_infomask2:
  */
 #define HEAP_NATTS_MASK			0x07FF	/* 11 bits for number of attributes */
-/* bits 0x3800 are available */
+/* bits 0x1800 are available */
+#define HEAP_UPDATE_KEY_INTACT	0x2000	/* tuple updated, key cols untouched */
 #define HEAP_HOT_UPDATED		0x4000	/* tuple was HOT-updated */
 #define HEAP_ONLY_TUPLE			0x8000	/* this is heap-only tuple */
 
-#define HEAP2_XACT_MASK			0xC000	/* visibility-related bits */
+#define HEAP2_XACT_MASK			0xE000	/* visibility-related bits */
 
 /*
  * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins.  It is
@@ -221,6 +240,23 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 	(tup)->t_choice.t_heap.t_xmin = (xid) \
 )
 
+/*
+ * HeapTupleHeaderGetXmax gets you the raw Xmax field.  To find out the Xid
+ * that updated a tuple, you might need to resolve the MultiXactId if certain
+ * bits are set.  HeapTupleHeaderGetUpdateXid checks those bits and takes care
+ * to resolve the MultiXactId if necessary.  This might involve multixact I/O,
+ * so it should only be used if absolutely necessary.
+ */
+#define HeapTupleHeaderGetUpdateXid(tup) \
+( \
+	(!((tup)->t_infomask & HEAP_XMAX_INVALID) && \
+	 ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \
+	 !((tup)->t_infomask & HEAP_XMAX_IS_NOT_UPDATE)) ? \
+		HeapTupleGetUpdateXid(tup) \
+	: \
+		HeapTupleHeaderGetXmax(tup) \
+)
+
 #define HeapTupleHeaderGetXmax(tup) \
 ( \
 	(tup)->t_choice.t_heap.t_xmax \
@@ -721,16 +757,22 @@ typedef struct xl_heap_newpage
 
 #define SizeOfHeapNewpage	(offsetof(xl_heap_newpage, blkno) + sizeof(BlockNumber))
 
+/* flags for xl_heap_lock.infobits_set */
+#define XLHL_XMAX_IS_MULTI		0x01
+#define XLHL_XMAX_IS_NOT_UPDATE	0x02
+#define XLHL_XMAX_EXCL_LOCK		0x04
+#define XLHL_XMAX_KEYSHR_LOCK	0x08
+#define XLHL_UPDATE_KEY_INTACT	0x10
+
 /* This is what we need to know about lock */
 typedef struct xl_heap_lock
 {
 	xl_heaptid	target;			/* locked tuple id */
 	TransactionId locking_xid;	/* might be a MultiXactId not xid */
-	bool		xid_is_mxact;	/* is it? */
-	bool		shared_lock;	/* shared or exclusive row lock? */
+	int8		infobits_set;	/* infomask and infomask2 bits to set */
 } xl_heap_lock;
 
-#define SizeOfHeapLock	(offsetof(xl_heap_lock, shared_lock) + sizeof(bool))
+#define SizeOfHeapLock	(offsetof(xl_heap_lock, infobits_set) + sizeof(int8))
 
 /* This is what we need to know about in-place update */
 typedef struct xl_heap_inplace
@@ -768,8 +810,7 @@ extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
 extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
 extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
 extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup,
-						  CommandId *cmax,
-						  bool *iscombo);
+						  CommandId *cmax, bool *iscombo);
 
 /* ----------------
  *		fastgetattr
@@ -854,6 +895,9 @@ extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
 			heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \
 	)
 
+/* Prototype for HeapTupleHeader accessor in heapam.c */
+extern TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple);
+
 /* prototypes for functions in common/heaptuple.c */
 extern Size heap_compute_data_size(TupleDesc tupleDesc,
 					   Datum *values, bool *isnull);
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index c3ec763..ff255d7 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -13,8 +13,14 @@
 
 #include "access/xlog.h"
 
+
+/*
+ * The first two MultiXactId values are reserved to store the truncation Xid
+ * and epoch of the first segment, so we start assigning multixact values from
+ * 2.
+ */
 #define InvalidMultiXactId	((MultiXactId) 0)
-#define FirstMultiXactId	((MultiXactId) 1)
+#define FirstMultiXactId	((MultiXactId) 2)
 
 #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId)
 
@@ -22,6 +28,31 @@
 #define NUM_MXACTOFFSET_BUFFERS		8
 #define NUM_MXACTMEMBER_BUFFERS		16
 
+/*
+ * Possible multixact lock modes ("status").  The first three modes are for
+ * tuple locks (FOR KEY SHARE, FOR SHARE and FOR UPDATE, respectively); the
+ * fourth is used for an update that doesn't modify key columns.  The fifth one
+ * is used for other updates and deletes.  Note that we only use two bits to
+ * represent them on disk, which means we don't have space to represent the
+ * last one.  This is okay, because a multixact can never contain such an
+ * operation; this mode is only used to wait for other modes.
+ */
+typedef enum
+{
+	MultiXactStatusForKeyShare = 0x00,
+	MultiXactStatusForShare = 0x01,
+	MultiXactStatusForUpdate = 0x02,
+	MultiXactStatusUpdate = 0x03,
+	MultiXactStatusKeyUpdate = 0x04,
+} MultiXactStatus;
+
+typedef struct MultiXactMember
+{
+	TransactionId	xid;
+	MultiXactStatus	status;
+} MultiXactMember;
+
+
 /* ----------------
  *		multixact-related XLOG entries
  * ----------------
@@ -35,21 +66,27 @@ typedef struct xl_multixact_create
 {
 	MultiXactId mid;			/* new MultiXact's ID */
 	MultiXactOffset moff;		/* its starting offset in members file */
-	int32		nxids;			/* number of member XIDs */
-	TransactionId xids[1];		/* VARIABLE LENGTH ARRAY */
+	int32		nmembers;		/* number of member XIDs */
+	MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
 } xl_multixact_create;
 
-#define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, xids)
+#define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, members)
 
 
-extern MultiXactId MultiXactIdCreate(TransactionId xid1, TransactionId xid2);
-extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid);
+extern MultiXactId MultiXactIdCreateSingleton(TransactionId xid,
+						   MultiXactStatus status);
+extern MultiXactId MultiXactIdCreate(TransactionId xid1,
+				  MultiXactStatus status1, TransactionId xid2,
+				  MultiXactStatus status2);
+extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
+				  MultiXactStatus status);
 extern bool MultiXactIdIsRunning(MultiXactId multi);
-extern bool MultiXactIdIsCurrent(MultiXactId multi);
-extern void MultiXactIdWait(MultiXactId multi);
-extern bool ConditionalMultiXactIdWait(MultiXactId multi);
+extern void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+							int *remaining);
+extern bool ConditionalMultiXactIdWait(MultiXactId multi,
+						   MultiXactStatus status, int *remaining);
 extern void MultiXactIdSetOldestMember(void);
-extern int	GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids);
+extern int	GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **xids);
 
 extern void AtEOXact_MultiXact(void);
 extern void AtPrepare_MultiXact(void);
@@ -62,8 +99,12 @@ extern void StartupMultiXact(void);
 extern void ShutdownMultiXact(void);
 extern void MultiXactGetCheckptMulti(bool is_shutdown,
 						 MultiXactId *nextMulti,
-						 MultiXactOffset *nextMultiOffset);
+						 MultiXactOffset *nextMultiOffset,
+						 TransactionId *oldestTruncateXid,
+						 uint32 *oldestTruncateXidEpoch,
+						 MultiXactId *oldestMulti);
 extern void CheckPointMultiXact(void);
+extern void TruncateMultiXact(TransactionId oldestXid);
 extern void MultiXactSetNextMXact(MultiXactId nextMulti,
 					  MultiXactOffset nextMultiOffset);
 extern void MultiXactAdvanceNextMXact(MultiXactId minMulti,
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index cb43879..2e73233 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -71,7 +71,7 @@ typedef struct XLogContRecord
 /*
  * Each page of XLOG file has a header like this:
  */
-#define XLOG_PAGE_MAGIC 0xD068	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD069	/* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 6688c19..f82295a 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -38,6 +38,10 @@ typedef struct CheckPoint
 	Oid			nextOid;		/* next free OID */
 	MultiXactId nextMulti;		/* next free MultiXactId */
 	MultiXactOffset nextMultiOffset;	/* next free MultiXact offset */
+	TransactionId oldestSegTruncateXid;	/* truncate xid of oldest multixact
+										 * offset segment */
+	uint32		oldestSegTruncateXidEpoch;	/* epoch of above xid */
+	MultiXactId oldestMultiXactId;	/* oldest MultiXactId still on disk */
 	TransactionId oldestXid;	/* cluster-wide minimum datfrozenxid */
 	Oid			oldestXidDB;	/* database with minimum datfrozenxid */
 	pg_time_t	time;			/* time stamp of checkpoint */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 0a89f18..5167f09 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -404,9 +404,9 @@ typedef struct EState
 
 /*
  * ExecRowMark -
- *	   runtime representation of FOR UPDATE/SHARE clauses
+ *	   runtime representation of FOR UPDATE/SHARE/KEY SHARE clauses
  *
- * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we should have an
+ * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE/KEY SHARE, we should have an
  * ExecRowMark for each non-target relation in the query (except inheritance
  * parent RTEs, which can be ignored at runtime).  See PlanRowMark for details
  * about most of the fields.  In addition to fields directly derived from
@@ -427,7 +427,7 @@ typedef struct ExecRowMark
 
 /*
  * ExecAuxRowMark -
- *	   additional runtime representation of FOR UPDATE/SHARE clauses
+ *	   additional runtime representation of FOR UPDATE/SHARE/KEY SHARE clauses
  *
  * Each LockRows and ModifyTable node keeps a list of the rowmarks it needs to
  * deal with.  In addition to a pointer to the related entry in es_rowMarks,
@@ -1815,7 +1815,7 @@ typedef struct SetOpState
 /* ----------------
  *	 LockRowsState information
  *
- *		LockRows nodes are used to enforce FOR UPDATE/FOR SHARE locking.
+ *		LockRows nodes are used to enforce FOR UPDATE/SHARE/KEY SHARE locking.
  * ----------------
  */
 typedef struct LockRowsState
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index af6565e..1dc6202 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -74,7 +74,7 @@ typedef uint32 AclMode;			/* a bitmask of privilege bits */
 #define ACL_CONNECT		(1<<11) /* for databases */
 #define N_ACL_RIGHTS	12		/* 1 plus the last 1<<x */
 #define ACL_NO_RIGHTS	0
-/* Currently, SELECT ... FOR UPDATE/FOR SHARE requires UPDATE privileges */
+/* Currently, SELECT ... FOR UPDATE/SHARE/KEY SHARE requires UPDATE privileges */
 #define ACL_SELECT_FOR_UPDATE	ACL_UPDATE
 
 
@@ -119,7 +119,7 @@ typedef struct Query
 	bool		hasDistinctOn;	/* distinctClause is from DISTINCT ON */
 	bool		hasRecursive;	/* WITH RECURSIVE was specified */
 	bool		hasModifyingCTE;	/* has INSERT/UPDATE/DELETE in WITH */
-	bool		hasForUpdate;	/* FOR UPDATE or FOR SHARE was specified */
+	bool		hasForUpdate;	/* FOR UPDATE/SHARE/KEY SHARE was specified */
 
 	List	   *cteList;		/* WITH list (of CommonTableExpr's) */
 
@@ -570,18 +570,26 @@ typedef struct DefElem
 } DefElem;
 
 /*
- * LockingClause - raw representation of FOR UPDATE/SHARE options
+ * LockingClause - raw representation of FOR UPDATE/SHARE/KEY SHARE options
  *
  * Note: lockedRels == NIL means "all relations in query".	Otherwise it
  * is a list of RangeVar nodes.  (We use RangeVar mainly because it carries
  * a location field --- currently, parse analysis insists on unqualified
  * names in LockingClause.)
  */
+typedef enum LockClauseStrength
+{
+	/* order is important -- see applyLockingClause */
+	LCS_FORKEYSHARE,
+	LCS_FORSHARE,
+	LCS_FORUPDATE
+} LockClauseStrength;
+
 typedef struct LockingClause
 {
 	NodeTag		type;
-	List	   *lockedRels;		/* FOR UPDATE or FOR SHARE relations */
-	bool		forUpdate;		/* true = FOR UPDATE, false = FOR SHARE */
+	List	   *lockedRels;		/* FOR UPDATE, SHARE, KEY SHARE relations */
+	LockClauseStrength strength;
 	bool		noWait;			/* NOWAIT option */
 } LockingClause;
 
@@ -861,21 +869,21 @@ typedef struct WindowClause
 
 /*
  * RowMarkClause -
- *	   parser output representation of FOR UPDATE/SHARE clauses
+ *	   parser output representation of FOR UPDATE/SHARE/KEY SHARE clauses
  *
  * Query.rowMarks contains a separate RowMarkClause node for each relation
- * identified as a FOR UPDATE/SHARE target.  If FOR UPDATE/SHARE is applied
- * to a subquery, we generate RowMarkClauses for all normal and subquery rels
- * in the subquery, but they are marked pushedDown = true to distinguish them
- * from clauses that were explicitly written at this query level.  Also,
- * Query.hasForUpdate tells whether there were explicit FOR UPDATE/SHARE
- * clauses in the current query level.
+ * identified as a FOR UPDATE/SHARE/KEY SHARE target.  If one of these clauses
+ * is applied to a subquery, we generate RowMarkClauses for all normal and
+ * subquery rels in the subquery, but they are marked pushedDown = true to
+ * distinguish them from clauses that were explicitly written at this query
+ * level.  Also, Query.hasForUpdate tells whether there were explicit FOR
+ * UPDATE/SHARE/KEY SHARE clauses in the current query level.
  */
 typedef struct RowMarkClause
 {
 	NodeTag		type;
 	Index		rti;			/* range table index of target relation */
-	bool		forUpdate;		/* true = FOR UPDATE, false = FOR SHARE */
+	LockClauseStrength strength;
 	bool		noWait;			/* NOWAIT option */
 	bool		pushedDown;		/* pushed down from higher query level? */
 } RowMarkClause;
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 6685864..27acfd9 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -744,7 +744,7 @@ typedef struct Limit
  * RowMarkType -
  *	  enums for types of row-marking operations
  *
- * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we have to uniquely
+ * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE/KEY SHARE, we have to uniquely
  * identify all the source rows, not only those from the target relations, so
  * that we can perform EvalPlanQual rechecking at need.  For plain tables we
  * can just fetch the TID, the same as for a target relation.  Otherwise (for
@@ -756,19 +756,20 @@ typedef enum RowMarkType
 {
 	ROW_MARK_EXCLUSIVE,			/* obtain exclusive tuple lock */
 	ROW_MARK_SHARE,				/* obtain shared tuple lock */
+	ROW_MARK_KEYSHARE,			/* obtain keyshare tuple lock */
 	ROW_MARK_REFERENCE,			/* just fetch the TID */
 	ROW_MARK_COPY				/* physically copy the row value */
 } RowMarkType;
 
-#define RowMarkRequiresRowShareLock(marktype)  ((marktype) <= ROW_MARK_SHARE)
+#define RowMarkRequiresRowShareLock(marktype)  ((marktype) <= ROW_MARK_KEYSHARE)
 
 /*
  * PlanRowMark -
  *	   plan-time representation of FOR UPDATE/SHARE clauses
  *
- * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we create a separate
+ * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE/KEY LOCK, we create a separate
  * PlanRowMark node for each non-target relation in the query.	Relations that
- * are not specified as FOR UPDATE/SHARE are marked ROW_MARK_REFERENCE (if
+ * are not specified as FOR UPDATE/SHARE/KEY LOCK are marked ROW_MARK_REFERENCE (if
  * real tables) or ROW_MARK_COPY (if not).
  *
  * Initially all PlanRowMarks have rti == prti and isParent == false.
diff --git a/src/include/parser/analyze.h b/src/include/parser/analyze.h
index 88fc78b..dea10a0 100644
--- a/src/include/parser/analyze.h
+++ b/src/include/parser/analyze.h
@@ -31,6 +31,6 @@ extern bool analyze_requires_snapshot(Node *parseTree);
 
 extern void CheckSelectLocking(Query *qry);
 extern void applyLockingClause(Query *qry, Index rtindex,
-				   bool forUpdate, bool noWait, bool pushedDown);
+				   LockClauseStrength strength, bool noWait, bool pushedDown);
 
 #endif   /* ANALYZE_H */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 173dc16..e3aca6c 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -103,6 +103,7 @@ typedef struct RelationData
 	Oid			rd_id;			/* relation's object id */
 	List	   *rd_indexlist;	/* list of OIDs of indexes on relation */
 	Bitmapset  *rd_indexattr;	/* identifies columns used in indexes */
+	Bitmapset  *rd_keyattr;		/* cols that can be ref'd by foreign keys */
 	Oid			rd_oidindex;	/* OID of unique index on OID, if any */
 	LockInfoData rd_lockInfo;	/* lock mgr's info for locking relation */
 	RuleLock   *rd_rules;		/* rewrite rules */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 9aaf969..5c6b27a 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -41,7 +41,7 @@ extern List *RelationGetIndexList(Relation relation);
 extern Oid	RelationGetOidIndex(Relation relation);
 extern List *RelationGetIndexExpressions(Relation relation);
 extern List *RelationGetIndexPredicate(Relation relation);
-extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation);
+extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs);
 extern void RelationGetExclusionInfo(Relation indexRelation,
 						 Oid **operators,
 						 Oid **procs,
diff --git a/src/test/isolation/expected/fk-contention.out b/src/test/isolation/expected/fk-contention.out
index 24ed72d..0916f7f 100644
--- a/src/test/isolation/expected/fk-contention.out
+++ b/src/test/isolation/expected/fk-contention.out
@@ -7,9 +7,8 @@ step upd: UPDATE foo SET b = 'Hello World';
 
 starting permutation: ins upd com
 step ins: INSERT INTO bar VALUES (42);
-step upd: UPDATE foo SET b = 'Hello World'; <waiting ...>
+step upd: UPDATE foo SET b = 'Hello World';
 step com: COMMIT;
-step upd: <... completed>
 
 starting permutation: upd ins com
 step upd: UPDATE foo SET b = 'Hello World';
diff --git a/src/test/isolation/expected/fk-deadlock.out b/src/test/isolation/expected/fk-deadlock.out
index 36813f1..69a294a 100644
--- a/src/test/isolation/expected/fk-deadlock.out
+++ b/src/test/isolation/expected/fk-deadlock.out
@@ -11,57 +11,51 @@ step s2c: COMMIT;
 starting permutation: s1i s1u s2i s1c s2u s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s1u: UPDATE parent SET aux = 'bar';
-step s2i: INSERT INTO child VALUES (2, 1); <waiting ...>
+step s2i: INSERT INTO child VALUES (2, 1);
 step s1c: COMMIT;
-step s2i: <... completed>
 step s2u: UPDATE parent SET aux = 'baz';
 step s2c: COMMIT;
 
 starting permutation: s1i s2i s1u s2u s1c s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s2i: INSERT INTO child VALUES (2, 1);
-step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
-step s2u: UPDATE parent SET aux = 'baz';
-step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
 step s1c: COMMIT;
+step s2u: <... completed>
 step s2c: COMMIT;
 
 starting permutation: s1i s2i s2u s1u s2c s1c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s2i: INSERT INTO child VALUES (2, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
-step s1u: UPDATE parent SET aux = 'bar';
-step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
 step s2c: COMMIT;
+step s1u: <... completed>
 step s1c: COMMIT;
 
 starting permutation: s2i s1i s1u s2u s1c s2c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
-step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
-step s2u: UPDATE parent SET aux = 'baz';
-step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
 step s1c: COMMIT;
+step s2u: <... completed>
 step s2c: COMMIT;
 
 starting permutation: s2i s1i s2u s1u s2c s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
-step s1u: UPDATE parent SET aux = 'bar';
-step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
 step s2c: COMMIT;
+step s1u: <... completed>
 step s1c: COMMIT;
 
 starting permutation: s2i s2u s1i s2c s1u s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s2u: UPDATE parent SET aux = 'baz';
-step s1i: INSERT INTO child VALUES (1, 1); <waiting ...>
+step s1i: INSERT INTO child VALUES (1, 1);
 step s2c: COMMIT;
-step s1i: <... completed>
 step s1u: UPDATE parent SET aux = 'bar';
 step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-deadlock2.out b/src/test/isolation/expected/fk-deadlock2.out
index 2d8e5e5..41a818d 100644
--- a/src/test/isolation/expected/fk-deadlock2.out
+++ b/src/test/isolation/expected/fk-deadlock2.out
@@ -17,91 +17,79 @@ step s2u1: <... completed>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2c: COMMIT;
 
-starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c
+starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
+starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
 step s1c: COMMIT;
 
-starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c
+starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s1c: COMMIT;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2c: COMMIT;
-
-starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
-step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s2c: COMMIT;
 step s1c: COMMIT;
 
-starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c
+starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
+starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
 step s1c: COMMIT;
 
-starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c
+starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
+starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
 step s1c: COMMIT;
 
 starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; <waiting ...>
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2c: COMMIT;
-step s1u1: <... completed>
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-deadlock2_1.out b/src/test/isolation/expected/fk-deadlock2_1.out
index 30c4c99..3827348 100644
--- a/src/test/isolation/expected/fk-deadlock2_1.out
+++ b/src/test/isolation/expected/fk-deadlock2_1.out
@@ -19,92 +19,87 @@ step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 ERROR:  current transaction is aborted, commands ignored until end of transaction block
 step s2c: COMMIT;
 
-starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c
+starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
+starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
-starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c
+starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s1c: COMMIT;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2c: COMMIT;
-
-starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
-step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s2c: COMMIT;
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
 
-starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c
+starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
+starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
-starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c
+starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
+starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
 starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; <waiting ...>
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2c: COMMIT;
-step s1u1: <... completed>
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-deadlock2_2.out b/src/test/isolation/expected/fk-deadlock2_2.out
new file mode 100644
index 0000000..b6be4b9
--- /dev/null
+++ b/src/test/isolation/expected/fk-deadlock2_2.out
@@ -0,0 +1,105 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1u1 s1u2 s1c s2u1 s2u2 s2c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+
+starting permutation: s1u1 s1u2 s2u1 s1c s2u2 s2c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s1c: COMMIT;
+step s2u1: <... completed>
+error in steps s1c s2u1: ERROR:  could not serialize access due to concurrent update
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  current transaction is aborted, commands ignored until end of transaction block
+step s2c: COMMIT;
+
+starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-deadlock3.out b/src/test/isolation/expected/fk-deadlock3.out
new file mode 100644
index 0000000..e69de29
diff --git a/src/test/isolation/expected/fk-deadlock_1.out b/src/test/isolation/expected/fk-deadlock_1.out
index ca75322..d648e48 100644
--- a/src/test/isolation/expected/fk-deadlock_1.out
+++ b/src/test/isolation/expected/fk-deadlock_1.out
@@ -11,61 +11,57 @@ step s2c: COMMIT;
 starting permutation: s1i s1u s2i s1c s2u s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s1u: UPDATE parent SET aux = 'bar';
-step s2i: INSERT INTO child VALUES (2, 1); <waiting ...>
+step s2i: INSERT INTO child VALUES (2, 1);
 step s1c: COMMIT;
-step s2i: <... completed>
-error in steps s1c s2i: ERROR:  could not serialize access due to concurrent update
 step s2u: UPDATE parent SET aux = 'baz';
-ERROR:  current transaction is aborted, commands ignored until end of transaction block
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s2c: COMMIT;
 
 starting permutation: s1i s2i s1u s2u s1c s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s2i: INSERT INTO child VALUES (2, 1);
-step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
-step s2u: UPDATE parent SET aux = 'baz';
-step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
 step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
 step s2c: COMMIT;
 
 starting permutation: s1i s2i s2u s1u s2c s1c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s2i: INSERT INTO child VALUES (2, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
-step s1u: UPDATE parent SET aux = 'bar';
-step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
 step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
 starting permutation: s2i s1i s1u s2u s1c s2c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
-step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
-step s2u: UPDATE parent SET aux = 'baz';
-step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
 step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
 step s2c: COMMIT;
 
 starting permutation: s2i s1i s2u s1u s2c s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
-step s1u: UPDATE parent SET aux = 'bar';
-step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
 step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
 starting permutation: s2i s2u s1i s2c s1u s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s2u: UPDATE parent SET aux = 'baz';
-step s1i: INSERT INTO child VALUES (1, 1); <waiting ...>
+step s1i: INSERT INTO child VALUES (1, 1);
 step s2c: COMMIT;
-step s1i: <... completed>
-error in steps s2c s1i: ERROR:  could not serialize access due to concurrent update
 step s1u: UPDATE parent SET aux = 'bar';
-ERROR:  current transaction is aborted, commands ignored until end of transaction block
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-deadlock_2.out b/src/test/isolation/expected/fk-deadlock_2.out
new file mode 100644
index 0000000..2d3294e
--- /dev/null
+++ b/src/test/isolation/expected/fk-deadlock_2.out
@@ -0,0 +1,65 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1i s1u s1c s2i s2u s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s1c: COMMIT;
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s2c: COMMIT;
+
+starting permutation: s1i s1u s2i s1c s2u s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1c: COMMIT;
+step s2u: UPDATE parent SET aux = 'baz';
+step s2c: COMMIT;
+
+starting permutation: s1i s2i s1u s2u s1c s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
+step s2c: COMMIT;
+
+starting permutation: s1i s2i s2u s1u s2c s1c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2i s1i s1u s2u s1c s2c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
+step s2c: COMMIT;
+
+starting permutation: s2i s1i s2u s1u s2c s1c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2i s2u s1i s2c s1u s1c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2c: COMMIT;
+step s1u: UPDATE parent SET aux = 'bar';
+step s1c: COMMIT;
diff --git a/src/test/isolation/specs/fk-deadlock2.spec b/src/test/isolation/specs/fk-deadlock2.spec
index a8f1516..eefe187 100644
--- a/src/test/isolation/specs/fk-deadlock2.spec
+++ b/src/test/isolation/specs/fk-deadlock2.spec
@@ -42,18 +42,18 @@ permutation "s1u1" "s1u2" "s2u1" "s1c" "s2u2" "s2c"
 #permutation "s1u1" "s1u2" "s2u1" "s2u2" "s1c" "s2c"
 #permutation "s1u1" "s1u2" "s2u1" "s2u2" "s2c" "s1c"
 #permutation "s1u1" "s2u1" "s1u2" "s1c" "s2u2" "s2c"
-permutation "s1u1" "s2u1" "s1u2" "s2u2" "s1c" "s2c"
+#permutation "s1u1" "s2u1" "s1u2" "s2u2" "s1c" "s2c"
 permutation "s1u1" "s2u1" "s1u2" "s2u2" "s2c" "s1c"
-permutation "s1u1" "s2u1" "s2u2" "s1u2" "s1c" "s2c"
+#permutation "s1u1" "s2u1" "s2u2" "s1u2" "s1c" "s2c"
 permutation "s1u1" "s2u1" "s2u2" "s1u2" "s2c" "s1c"
-#permutation "s1u1" "s2u1" "s2u2" "s2c" "s1u2" "s1c"
+permutation "s1u1" "s2u1" "s2u2" "s2c" "s1u2" "s1c"
 #permutation "s2u1" "s1u1" "s1u2" "s1c" "s2u2" "s2c"
-permutation "s2u1" "s1u1" "s1u2" "s2u2" "s1c" "s2c"
+#permutation "s2u1" "s1u1" "s1u2" "s2u2" "s1c" "s2c"
 permutation "s2u1" "s1u1" "s1u2" "s2u2" "s2c" "s1c"
-permutation "s2u1" "s1u1" "s2u2" "s1u2" "s1c" "s2c"
+#permutation "s2u1" "s1u1" "s2u2" "s1u2" "s1c" "s2c"
 permutation "s2u1" "s1u1" "s2u2" "s1u2" "s2c" "s1c"
-#permutation "s2u1" "s1u1" "s2u2" "s2c" "s1u2" "s1c"
+permutation "s2u1" "s1u1" "s2u2" "s2c" "s1u2" "s1c"
 #permutation "s2u1" "s2u2" "s1u1" "s1u2" "s1c" "s2c"
-#permutation "s2u1" "s2u2" "s1u1" "s1u2" "s2c" "s1c"
+permutation "s2u1" "s2u2" "s1u1" "s1u2" "s2c" "s1c"
 permutation "s2u1" "s2u2" "s1u1" "s2c" "s1u2" "s1c"
-#permutation "s2u1" "s2u2" "s2c" "s1u1" "s1u2" "s1c"
+permutation "s2u1" "s2u2" "s2c" "s1u1" "s1u2" "s1c"