From 0fc050033563f5bef074401197d3b38425478857 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm@gmail.com>
Date: Wed, 2 Sep 2020 11:51:12 +0200
Subject: [PATCH] Update _bt_binsrch* to account for prefix column equality

This improves the search speed for multi-column search
 indexes, as the prefixes of hikey and lokey do not need
 to be compared when high key and low key both have N
 columns common with the searchkey.

Signed-off-by: Matthias van de Meent <boekewurm@gmail.com>
---
 src/backend/access/nbtree/nbtinsert.c |   9 ++
 src/backend/access/nbtree/nbtsearch.c | 164 +++++++++++++++++++-------
 src/backend/access/nbtree/nbtutils.c  |   1 +
 src/include/access/nbtree.h           |  20 ++++
 4 files changed, 153 insertions(+), 41 deletions(-)

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index d36f7557c8..2bf6115040 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -208,6 +208,7 @@ search:
 			/* start over... */
 			if (stack)
 				_bt_freestack(stack);
+			itup_key->searchcolneq = 1;
 			goto search;
 		}
 
@@ -402,6 +403,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	bool		inposting = false;
 	bool		prevalldead = true;
 	int			curposti = 0;
+	int			searchcolneq = itup_key->searchcolneq;
 
 	/* Assume unique until we find a duplicate */
 	*is_unique = true;
@@ -559,6 +561,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 						if (nbuf != InvalidBuffer)
 							_bt_relbuf(rel, nbuf);
 						*is_unique = false;
+						/* Reset the search state to the state as we were called */
+						itup_key->searchcolneq = searchcolneq;
 						return InvalidTransactionId;
 					}
 
@@ -577,6 +581,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 						*speculativeToken = SnapshotDirty.speculativeToken;
 						/* Caller releases lock on buf immediately */
 						insertstate->bounds_valid = false;
+						/* Reset the search state to the state as we were called */
+						itup_key->searchcolneq = searchcolneq;
 						return xwait;
 					}
 
@@ -751,6 +757,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	if (nbuf != InvalidBuffer)
 		_bt_relbuf(rel, nbuf);
 
+	/* Reset the search state to the state as we were called */
+	itup_key->searchcolneq = searchcolneq;
+
 	return InvalidTransactionId;
 }
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 1e628a33d7..aefc8733fd 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -45,6 +45,7 @@ static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
+static inline BTCompareResult _bt_compare_inline(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum, int scanStartCol);
 
 
 /*
@@ -348,8 +349,10 @@ _bt_binsrch(Relation rel,
 	BTPageOpaque opaque;
 	OffsetNumber low,
 				high;
-	int32		result,
-				cmpval;
+	int32		cmpval;
+	int			higheqcol,
+				loweqcol;
+	BTCompareResult result;
 
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -388,20 +391,33 @@ _bt_binsrch(Relation rel,
 
 	cmpval = key->nextkey ? 0 : 1;	/* select comparison value */
 
+	higheqcol = key->searchcolneq;
+	loweqcol = key->searchcolneq;
+
 	while (high > low)
 	{
 		OffsetNumber mid = low + ((high - low) / 2);
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare_inline(rel, key, page, mid, Min(higheqcol, loweqcol));
 
-		if (result >= cmpval)
+		Assert(result.compareResult == _bt_compare(rel, key, page, mid));
+
+		if (result.compareResult >= cmpval)
+		{
 			low = mid + 1;
+			loweqcol = result.comparedColumn;
+		}
 		else
+		{
 			high = mid;
+			higheqcol = result.comparedColumn;
+		}
 	}
 
+	key->searchcolneq = Min(loweqcol, higheqcol);
+
 	/*
 	 * At this point we have high == low, but be careful: they could point
 	 * past the last slot on the page.
@@ -452,8 +468,10 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 	OffsetNumber low,
 				high,
 				stricthigh;
-	int32		result,
-				cmpval;
+	int32		cmpval;
+	int			higheqcol,
+				loweqcol;
+	BTCompareResult result;
 
 	page = BufferGetPage(insertstate->buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -500,6 +518,8 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 	stricthigh = high;			/* high initially strictly higher */
 
 	cmpval = 1;					/* !nextkey comparison value */
+	loweqcol = key->searchcolneq;
+	higheqcol = key->searchcolneq;
 
 	while (high > low)
 	{
@@ -507,14 +527,20 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, key, page, mid);
+		result = _bt_compare_inline(rel, key, page, mid, Min(loweqcol, higheqcol));
+		Assert(result.compareResult == _bt_compare(rel, key, page, mid));
 
-		if (result >= cmpval)
+		if (result.compareResult >= cmpval)
+		{
 			low = mid + 1;
+			loweqcol = result.comparedColumn;
+		}
 		else
 		{
 			high = mid;
-			if (result != 0)
+			higheqcol = result.comparedColumn;
+
+			if (result.compareResult != 0)
 				stricthigh = high;
 		}
 
@@ -525,10 +551,12 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
 		 * posting list when postingoff is set.  This should happen
 		 * infrequently.
 		 */
-		if (unlikely(result == 0 && key->scantid != NULL))
+		if (unlikely(result.compareResult == 0 && key->scantid != NULL))
 			insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
 	}
 
+	key->searchcolneq = Min(loweqcol, higheqcol);
+
 	/*
 	 * On a leaf page, a binary search always returns the first key >= scan
 	 * key (at least in !nextkey case), which could be the last slot + 1. This
@@ -623,6 +651,36 @@ _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
  *			 0 if scankey == tuple at offnum;
  *			>0 if scankey > tuple at offnum.
  *
+ * See _bt_compare_inline() for more specific information.
+ *----------
+ */
+int32
+_bt_compare(Relation rel,
+			BTScanInsert key,
+			Page page,
+			OffsetNumber offnum)
+{
+	BTCompareResult result = _bt_compare_inline(rel, key, page, offnum, 1);
+	return result.compareResult;
+}
+
+/*----------
+ *	_bt_compare_inline() -- Compare insertion-type scankey to tuple on a page
+ *
+ *	page/offnum: location of btree item to be compared to.
+ *	scanStartCol: first column of key to start comparing. 1-based
+ *
+ *		This routine returns:
+ *
+ *		BTCompareResult.compareResult:
+ *			<0 if scankey < tuple at offnum;
+ *			 0 if scankey == tuple at offnum;
+ *			>0 if scankey > tuple at offnum.
+ *		BTCompareResult.comparedColumn:
+ *			if all columns are equal, the number of indexed
+ *			columns in the scankey + 1, otherwise the 1-indexed column
+ *			number that was compared to get the non-0 result.
+ *
  * NULLs in the keys are treated as sortable values.  Therefore
  * "equality" does not necessarily mean that the item should be returned
  * to the caller as a matching key.  Similarly, an insertion scankey
@@ -640,11 +698,12 @@ _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
  * key.  See backend/access/nbtree/README for details.
  *----------
  */
-int32
-_bt_compare(Relation rel,
-			BTScanInsert key,
-			Page page,
-			OffsetNumber offnum)
+inline static BTCompareResult
+_bt_compare_inline(Relation rel,
+				   BTScanInsert key,
+				   Page page,
+				   OffsetNumber offnum,
+				   int scanStartCol)
 {
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -653,18 +712,23 @@ _bt_compare(Relation rel,
 	ScanKey		scankey;
 	int			ncmpkey;
 	int			ntupatts;
-	int32		result;
+	BTCompareResult result;
 
 	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
 	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
 	Assert(key->heapkeyspace || key->scantid == NULL);
+	Assert(scanStartCol >= 1);
 
 	/*
 	 * Force result ">" if target item is first data item on an internal page
 	 * --- see NOTE above.
 	 */
 	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
-		return 1;
+	{
+		result.compareResult = 1;
+		result.comparedColumn = scanStartCol;
+		return result;
+	}
 
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 	ntupatts = BTreeTupleGetNAtts(itup, rel);
@@ -684,8 +748,12 @@ _bt_compare(Relation rel,
 	ncmpkey = Min(ntupatts, key->keysz);
 	Assert(key->heapkeyspace || ncmpkey == key->keysz);
 	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
-	scankey = key->scankeys;
-	for (int i = 1; i <= ncmpkey; i++)
+	Assert(scanStartCol <= ncmpkey + 1);
+
+	scankey = key->scankeys + (scanStartCol - 1);
+	result.comparedColumn = scanStartCol;
+
+	for (; result.comparedColumn <= ncmpkey; result.comparedColumn++)
 	{
 		Datum		datum;
 		bool		isNull;
@@ -695,18 +763,18 @@ _bt_compare(Relation rel,
 		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
 		{
 			if (isNull)
-				result = 0;		/* NULL "=" NULL */
+				result.compareResult = 0;		/* NULL "=" NULL */
 			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
-				result = -1;	/* NULL "<" NOT_NULL */
+				result.compareResult = -1;	/* NULL "<" NOT_NULL */
 			else
-				result = 1;		/* NULL ">" NOT_NULL */
+				result.compareResult = 1;		/* NULL ">" NOT_NULL */
 		}
 		else if (isNull)		/* key is NOT_NULL and item is NULL */
 		{
 			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
-				result = 1;		/* NOT_NULL ">" NULL */
+				result.compareResult = 1;		/* NOT_NULL ">" NULL */
 			else
-				result = -1;	/* NOT_NULL "<" NULL */
+				result.compareResult = -1;	/* NOT_NULL "<" NULL */
 		}
 		else
 		{
@@ -718,17 +786,17 @@ _bt_compare(Relation rel,
 			 * to flip the sign of the comparison result.  (Unless it's a DESC
 			 * column, in which case we *don't* flip the sign.)
 			 */
-			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
-													 scankey->sk_collation,
-													 datum,
-													 scankey->sk_argument));
+			result.compareResult = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+																   scankey->sk_collation,
+																   datum,
+																   scankey->sk_argument));
 
 			if (!(scankey->sk_flags & SK_BT_DESC))
-				INVERT_COMPARE_RESULT(result);
+				INVERT_COMPARE_RESULT(result.compareResult);
 		}
 
 		/* if the keys are unequal, return the difference */
-		if (result != 0)
+		if (result.compareResult != 0)
 			return result;
 
 		scankey++;
@@ -744,7 +812,10 @@ _bt_compare(Relation rel,
 	 * necessary.
 	 */
 	if (key->keysz > ntupatts)
-		return 1;
+	{
+		result.compareResult = 1;
+		return result;
+	}
 
 	/*
 	 * Use the heap TID attribute and scantid to try to break the tie.  The
@@ -789,10 +860,14 @@ _bt_compare(Relation rel,
 		 */
 		if (key->heapkeyspace && !key->pivotsearch &&
 			key->keysz == ntupatts && heapTid == NULL)
-			return 1;
+		{
+			result.compareResult = 1;
+			return result;
+		}
 
 		/* All provided scankey arguments found to be equal */
-		return 0;
+		result.compareResult = 0;
+		return result;
 	}
 
 	/*
@@ -801,7 +876,10 @@ _bt_compare(Relation rel,
 	 */
 	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
 	if (heapTid == NULL)
-		return 1;
+	{
+		result.compareResult = 1;
+		return result;
+	}
 
 	/*
 	 * Scankey must be treated as equal to a posting list tuple if its scantid
@@ -810,18 +888,19 @@ _bt_compare(Relation rel,
 	 * with scantid.
 	 */
 	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
-	result = ItemPointerCompare(key->scantid, heapTid);
-	if (result <= 0 || !BTreeTupleIsPosting(itup))
+	result.compareResult = ItemPointerCompare(key->scantid, heapTid);
+	if (result.compareResult <= 0 || !BTreeTupleIsPosting(itup))
 		return result;
 	else
 	{
-		result = ItemPointerCompare(key->scantid,
-									BTreeTupleGetMaxHeapTID(itup));
-		if (result > 0)
-			return 1;
+		result.compareResult = ItemPointerCompare(key->scantid,
+												  BTreeTupleGetMaxHeapTID(itup));
+		if (result.compareResult > 0)
+			return result;
 	}
 
-	return 0;
+	result.compareResult = 0;
+	return result;
 }
 
 /*
@@ -1340,6 +1419,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	inskey.anynullkeys = false; /* unused */
 	inskey.nextkey = nextkey;
 	inskey.pivotsearch = false;
+	inskey.searchcolneq = 1;
 	inskey.scantid = NULL;
 	inskey.keysz = keysCount;
 
@@ -1375,6 +1455,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 
 	_bt_initialize_more_data(so, dir);
 
+	/* reset the column search position for the following statement */
+	inskey.searchcolneq = 1;
 	/* position to the precise item on the page */
 	offnum = _bt_binsrch(rel, &inskey, buf);
 
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 81589b9056..5c058ca9a8 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -122,6 +122,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 	key->anynullkeys = false;	/* initial assumption */
 	key->nextkey = false;
 	key->pivotsearch = false;
+	key->searchcolneq = 1;
 	key->keysz = Min(indnkeyatts, tupnatts);
 	key->scantid = key->heapkeyspace && itup ?
 		BTreeTupleGetHeapTID(itup) : NULL;
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 65d9698b89..9aebff05c0 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -639,6 +639,12 @@ typedef BTStackData *BTStack;
  * using a scankey built from a leaf page's high key.  Most callers set this
  * to false.
  *
+ * scancolneq contains an integer that is the first column number of the
+ * current index scan location that has not yet proven to equal the search
+ * key's columns. This helps by not repeatedly having to compare columns
+ * that we know are equal.
+ * This is incremented by _bt_search and _bt_search_insert
+ *
  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
  * is set to NULL when index scan doesn't need to find a position for a
  * specific physical tuple.  Must be set when inserting new tuples into
@@ -662,6 +668,7 @@ typedef struct BTScanInsertData
 	bool		nextkey;
 	bool		pivotsearch;
 	ItemPointer scantid;		/* tiebreaker for scankeys */
+	int			searchcolneq;	/* column that is not yet equal */
 	int			keysz;			/* Size of scankeys array */
 	ScanKeyData scankeys[INDEX_MAX_KEYS];	/* Must appear last */
 } BTScanInsertData;
@@ -945,6 +952,19 @@ typedef struct BTScanOpaqueData
 
 typedef BTScanOpaqueData *BTScanOpaque;
 
+/*
+ * When two tuples are compared, we get a comparison result for one of the
+ * columns. While the result itself is the most interesting, the column
+ * can be used to further optimize searching in the index.
+ *
+ * See also: _bt_compare() and _bt_compare_inline()
+ */
+typedef struct BTCompareResult
+{
+	int32		compareResult; /* the value of the comparison between the first non-equal columns */
+	int			comparedColumn; /* the column number of the first non-equal column */
+} BTCompareResult;
+
 /*
  * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
  * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
-- 
2.20.1

