From 0dc50e4e72963da772269873ca5e2abab2a70a2c Mon Sep 17 00:00:00 2001
From: Sokolov Yura aka funny_falcon <funny.falcon@gmail.com>
Date: Fri, 9 Mar 2018 22:49:01 +0300
Subject: [PATCH] Make hash table for xip in XidInMVCCSnapshot

When lot of concurrent transactions attempts to update single
row, then linear scan through running list in XidInMVCCSnapshot
became noticebale bottleneck.

With this change, hash table is built on first search of xid in
snapshot->xip and snapshot->subxip arrays.

If size of array is smaller than 60, then linear scan is still
used, cause there is no much benefits from building hash then.
(at least on Intel Xeon).
---
 src/backend/storage/ipc/procarray.c | 68 +++++++++++++++++++++++------
 src/backend/utils/time/snapmgr.c    | 29 +++++++++----
 src/backend/utils/time/tqual.c      | 85 ++++++++++++++++++++++++++++---------
 src/include/utils/snapshot.h        | 11 +++++
 4 files changed, 150 insertions(+), 43 deletions(-)

diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index afe1c03aa3..b3bd2deb6a 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1469,6 +1469,55 @@ GetMaxSnapshotSubxidCount(void)
 	return TOTAL_MAX_CACHED_SUBXIDS;
 }
 
+/*
+ * ExtendXipSizeForHash - calculate xip array size with space for hash table.
+ *
+ * Hash table should be at least twice larger than array to not depend on
+ * cleverness of algorithm.
+ *
+ * But if xipcnt < SnapshotMinHash, then no need in hash-table at all.
+ */
+int
+ExtendXipSizeForHash(int xipcnt, uint32* pmask)
+{
+	int size;
+	uint32 mask = 0;
+
+	size = xipcnt;
+	if (xipcnt >= SnapshotMinHash)
+	{
+		mask = xipcnt * 2;
+		mask |= mask>>1;
+		mask |= mask>>2;
+		mask |= mask>>4;
+		mask |= mask>>8;
+		mask |= mask>>16;
+		size += mask + 1;
+	}
+	*pmask = mask;
+	return size;
+}
+
+/*
+ * AllocateXip - allocate xip array, extended for hash part if needed.
+ *
+ * Hash part will be used in tqual.c in XidInMVCCSnapshot (XidInXip).
+ */
+static TransactionId *
+AllocateXip(int max, uint32* pmask)
+{
+	int size;
+	TransactionId *xip;
+
+	size = ExtendXipSizeForHash(max, pmask);
+	xip = (TransactionId *) malloc(size * sizeof(TransactionId));
+	if (xip == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	return xip;
+}
+
 /*
  * GetSnapshotData -- returns information about running transactions.
  *
@@ -1538,19 +1587,10 @@ GetSnapshotData(Snapshot snapshot)
 		 * First call for this snapshot. Snapshot is same size whether or not
 		 * we are in recovery, see later comments.
 		 */
-		snapshot->xip = (TransactionId *)
-			malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
-		if (snapshot->xip == NULL)
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
-		Assert(snapshot->subxip == NULL);
-		snapshot->subxip = (TransactionId *)
-			malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId));
-		if (snapshot->subxip == NULL)
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		snapshot->xip = AllocateXip(GetMaxSnapshotXidCount(),
+				&snapshot->xhmask);
+		snapshot->subxip = AllocateXip(GetMaxSnapshotSubxidCount(),
+				&snapshot->subxhmask);
 	}
 
 	/*
@@ -1758,6 +1798,8 @@ GetSnapshotData(Snapshot snapshot)
 	snapshot->active_count = 0;
 	snapshot->regd_count = 0;
 	snapshot->copied = false;
+	snapshot->xhbuilt = false;
+	snapshot->subxhbuilt = false;
 
 	if (old_snapshot_threshold < 0)
 	{
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 4b45d3cccd..2b1c3fa2e0 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -662,14 +662,16 @@ CopySnapshot(Snapshot snapshot)
 	Snapshot	newsnap;
 	Size		subxipoff;
 	Size		size;
+	int			xcnt, subxcnt;
+	uint32		xhmask, subxhmask;
 
 	Assert(snapshot != InvalidSnapshot);
 
+	xcnt = ExtendXipSizeForHash(snapshot->xcnt, &xhmask);
+	subxcnt = ExtendXipSizeForHash(snapshot->subxcnt, &subxhmask);
 	/* We allocate any XID arrays needed in the same palloc block. */
-	size = subxipoff = sizeof(SnapshotData) +
-		snapshot->xcnt * sizeof(TransactionId);
-	if (snapshot->subxcnt > 0)
-		size += snapshot->subxcnt * sizeof(TransactionId);
+	size = subxipoff = sizeof(SnapshotData) + xcnt * sizeof(TransactionId);
+	size += subxcnt * sizeof(TransactionId);
 
 	newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
 	memcpy(newsnap, snapshot, sizeof(SnapshotData));
@@ -677,6 +679,10 @@ CopySnapshot(Snapshot snapshot)
 	newsnap->regd_count = 0;
 	newsnap->active_count = 0;
 	newsnap->copied = true;
+	newsnap->xhmask = xhmask;
+	newsnap->subxhmask = subxhmask;
+	newsnap->xhbuilt = false;
+	newsnap->subxhbuilt = false;
 
 	/* setup XID array */
 	if (snapshot->xcnt > 0)
@@ -2130,16 +2136,18 @@ RestoreSnapshot(char *start_address)
 	Size		size;
 	Snapshot	snapshot;
 	TransactionId *serialized_xids;
+	int			xcnt, subxcnt;
+	uint32		xhmask, subxhmask;
 
 	memcpy(&serialized_snapshot, start_address,
 		   sizeof(SerializedSnapshotData));
 	serialized_xids = (TransactionId *)
 		(start_address + sizeof(SerializedSnapshotData));
 
+	xcnt = ExtendXipSizeForHash(serialized_snapshot.xcnt, &xhmask);
+	subxcnt = ExtendXipSizeForHash(serialized_snapshot.subxcnt, &subxhmask);
 	/* We allocate any XID arrays needed in the same palloc block. */
-	size = sizeof(SnapshotData)
-		+ serialized_snapshot.xcnt * sizeof(TransactionId)
-		+ serialized_snapshot.subxcnt * sizeof(TransactionId);
+	size = sizeof(SnapshotData) + (xcnt + subxcnt) * sizeof(TransactionId);
 
 	/* Copy all required fields */
 	snapshot = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
@@ -2148,8 +2156,12 @@ RestoreSnapshot(char *start_address)
 	snapshot->xmax = serialized_snapshot.xmax;
 	snapshot->xip = NULL;
 	snapshot->xcnt = serialized_snapshot.xcnt;
+	snapshot->xhmask = xhmask;
+	snapshot->xhbuilt = false;
 	snapshot->subxip = NULL;
 	snapshot->subxcnt = serialized_snapshot.subxcnt;
+	snapshot->subxhmask = subxhmask;
+	snapshot->subxhbuilt = false;
 	snapshot->suboverflowed = serialized_snapshot.suboverflowed;
 	snapshot->takenDuringRecovery = serialized_snapshot.takenDuringRecovery;
 	snapshot->curcid = serialized_snapshot.curcid;
@@ -2167,8 +2179,7 @@ RestoreSnapshot(char *start_address)
 	/* Copy SubXIDs, if present. */
 	if (serialized_snapshot.subxcnt > 0)
 	{
-		snapshot->subxip = ((TransactionId *) (snapshot + 1)) +
-			serialized_snapshot.xcnt;
+		snapshot->subxip = ((TransactionId *) (snapshot + 1)) + xcnt;
 		memcpy(snapshot->subxip, serialized_xids + serialized_snapshot.xcnt,
 			   serialized_snapshot.subxcnt * sizeof(TransactionId));
 	}
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index f7c4c9188c..074c56e91e 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -1461,6 +1461,63 @@ HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin)
 	return TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin);
 }
 
+/*
+ * XidInXip searches xid in xip array.
+ *
+ * If xhmask is zero, then simple linear scan of xip array used. Looks like
+ * there is no benifit in hash table for small xip array.
+ *
+ * If xhmask is non-zero, then space for hash table is allocated after xip
+ * array. *xhbuilt should be set if hash table is already built.
+ *
+ * Hash table is built using simple linear probing. It allows keep both
+ * insertion and lookup loop small.
+ */
+static inline bool
+XidInXip(TransactionId xid, TransactionId *xip, uint32 xcnt, uint32 xhmask, bool *xhbuilt)
+{
+	TransactionId *xiphash;
+	uint32 i, pos;
+	int found = 0;
+
+	if (xhmask == 0)
+	{
+		/* full scan for small snapshots and if xiphash is not allocated */
+		for (i = 0; i < xcnt; i++)
+			if (TransactionIdEquals(xid, xip[i]))
+				return true;
+		return false;
+	}
+
+	xiphash = xip + xcnt;
+
+	if (*xhbuilt)
+	{
+		/* Hash table already built. Do lookup */
+		pos = xid & xhmask;
+		while (xiphash[pos] != InvalidTransactionId)
+		{
+			if (TransactionIdEquals(xiphash[pos], xid))
+				return true;
+			pos = (pos + 1) & xhmask;
+		}
+		return false;
+	}
+
+	/* Build hash table. */
+	*xhbuilt = true;
+	memset(xiphash, 0, (xhmask+1) * sizeof(TransactionId));
+	for (i = 0; i < xcnt; i++)
+	{
+		found |= TransactionIdEquals(xip[i], xid);
+		pos = xip[i] & xhmask;
+		while (xiphash[pos] != InvalidTransactionId)
+			pos = (pos + 1) & xhmask;
+		xiphash[pos] = xip[i];
+	}
+	return found != 0;
+}
+
 /*
  * XidInMVCCSnapshot
  *		Is the given XID still-in-progress according to the snapshot?
@@ -1474,8 +1531,6 @@ HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin)
 bool
 XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
-	uint32		i;
-
 	/*
 	 * Make a quick range check to eliminate most XIDs without looking at the
 	 * xip arrays.  Note that this is OK even if we convert a subxact XID to
@@ -1507,13 +1562,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		if (!snapshot->suboverflowed)
 		{
 			/* we have full data, so search subxip */
-			int32		j;
-
-			for (j = 0; j < snapshot->subxcnt; j++)
-			{
-				if (TransactionIdEquals(xid, snapshot->subxip[j]))
-					return true;
-			}
+			if (XidInXip(xid, snapshot->subxip, snapshot->subxcnt, snapshot->subxhmask, &snapshot->subxhbuilt))
+				return true;
 
 			/* not there, fall through to search xip[] */
 		}
@@ -1534,16 +1584,12 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 				return false;
 		}
 
-		for (i = 0; i < snapshot->xcnt; i++)
-		{
-			if (TransactionIdEquals(xid, snapshot->xip[i]))
-				return true;
-		}
+
+		if (XidInXip(xid, snapshot->xip, snapshot->xcnt, snapshot->xhmask, &snapshot->xhbuilt))
+			return true;
 	}
 	else
 	{
-		int32		j;
-
 		/*
 		 * In recovery we store all xids in the subxact array because it is by
 		 * far the bigger array, and we mostly don't know which xids are
@@ -1573,11 +1619,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		 * indeterminate xid. We don't know whether it's top level or subxact
 		 * but it doesn't matter. If it's present, the xid is visible.
 		 */
-		for (j = 0; j < snapshot->subxcnt; j++)
-		{
-			if (TransactionIdEquals(xid, snapshot->subxip[j]))
-				return true;
-		}
+		if (XidInXip(xid, snapshot->subxip, snapshot->subxcnt, snapshot->subxhmask, &snapshot->subxhbuilt))
+			return true;
 	}
 
 	return false;
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index a8a5a8f4c0..87d1445cb3 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -23,6 +23,13 @@
 typedef struct SnapshotData *Snapshot;
 
 #define InvalidSnapshot		((Snapshot) NULL)
+/*
+ * Big running list will be converted into hash table on demand.
+ * SnapshotMinHash is threshold between "linear scan" and "hashtable on demand".
+ */
+#define SnapshotMinHash			(60)
+/* Calculate size to hold both array and hash table (if needed) */
+int ExtendXipSizeForHash(int xipcnt, uint32* plog);
 
 /*
  * We use SnapshotData structures to represent both "regular" (MVCC)
@@ -78,6 +85,8 @@ typedef struct SnapshotData
 	 */
 	TransactionId *xip;
 	uint32		xcnt;			/* # of xact ids in xip[] */
+	uint32		xhmask;			/* mask of allocated xip hash part */
+	bool		xhbuilt;		/* was hash table built */
 
 	/*
 	 * For non-historic MVCC snapshots, this contains subxact IDs that are in
@@ -90,6 +99,8 @@ typedef struct SnapshotData
 	 */
 	TransactionId *subxip;
 	int32		subxcnt;		/* # of xact ids in subxip[] */
+	uint32		subxhmask;		/* mask of allocated subxip hash part */
+	bool		subxhbuilt;		/* was hash table built */
 	bool		suboverflowed;	/* has the subxip array overflowed? */
 
 	bool		takenDuringRecovery;	/* recovery-shaped snapshot? */
-- 
2.14.1

