Index range search optimization

Started by Konstantin Knizhnik over 2 years ago · 22 messages
#1Konstantin Knizhnik
knizhnik@garret.ru
1 attachment(s)

Hi hackers.

_bt_readpage performs key check for each item on the page trying to
locate upper boundary.
While comparison of simple integer keys is very fast, comparison of
long strings can be quite expensive.
We can first make check for the largest key on the page and if it is not
larger than upper boundary, then skip checks for all elements.

At this quite artificial example such optimization gives 3x time speed-up:

create table t(t text primary key);
insert into t values ('primary key-'||generate_series(1,10000000)::text);
select count(*) from t where t between 'primary key-1000000' and 'primary key-2000000';

At my notebook with large enough shared buffers and disabled concurrency
the difference is 83 vs. 247 msec
For integer keys the difference is much smaller:  69 vs. 82 msec

Certainly I realized that this example is quite exotic: most of DBAs
prefer integer keys and such large ranges are quite rare.
But still such large range queries are used.
And I have checked that the proposed patch doesn't cause slowdown of
exact search.

Attachments:

range-search.patch (text/x-patch; charset=UTF-8; name=range-search.patch) — Download
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 263f75fce9..0f6a767409 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -888,7 +888,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * Examine the scan keys and eliminate any redundant keys; also mark the
 	 * keys that must be matched to continue the scan.
 	 */
-	_bt_preprocess_keys(scan);
+	_bt_preprocess_keys(scan, dir);
 
 	/*
 	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
@@ -1531,6 +1531,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	IndexTuple	itup;
+	bool		all_fit = false;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1584,6 +1586,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/* Do not perform this optimization for the first index page, to avoid slowdown of point queries.
+	 * so->has_matches is set at the end of _bt_readpage, so we do not try to perform this optimization
+	 * at the first invocation of _bt_readpage. It also enforces that we found items matching the low boundary, so only the upper boundary has to be checked.
+	 */
+	if (so->is_range_search	&& so->has_matches) {
+		bool		temp;
+		/* Try first to compare the minimal/maximal key on the page to avoid checks of all other items on the page */
+		itup = (IndexTuple) PageGetItem(page,  PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff));
+		all_fit = _bt_checkkeys(scan, itup, indnatts, dir, &temp);
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1594,7 +1607,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		while (offnum <= maxoff)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
-			IndexTuple	itup;
 
 			/*
 			 * If the scan specifies not to return killed tuples, then we
@@ -1608,7 +1620,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			if (all_fit || _bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1661,9 +1673,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		if (continuescan && !P_RIGHTMOST(opaque))
 		{
 			ItemId		iid = PageGetItemId(page, P_HIKEY);
-			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
 			int			truncatt;
 
+			itup = (IndexTuple) PageGetItem(page, iid);
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
 			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
 		}
@@ -1686,7 +1698,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		while (offnum >= minoff)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
-			IndexTuple	itup;
 			bool		tuple_alive;
 			bool		passes_quals;
 
@@ -1716,8 +1727,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+			passes_quals = all_fit || _bt_checkkeys(scan, itup, indnatts, dir,
+													&continuescan);
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
@@ -1771,8 +1782,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
 		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
 	}
-
-	return (so->currPos.firstItem <= so->currPos.lastItem);
+	return so->has_matches = so->currPos.firstItem <= so->currPos.lastItem;
 }
 
 /* Save an index item into so->currPos.items[itemIndex] */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 05abf36032..50701dc9d9 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -654,7 +654,7 @@ _bt_restore_array_keys(IndexScanDesc scan)
 	 */
 	if (changed)
 	{
-		_bt_preprocess_keys(scan);
+		_bt_preprocess_keys(scan, NoMovementScanDirection);
 		/* The mark should have been set on a consistent set of keys... */
 		Assert(so->qual_ok);
 	}
@@ -746,7 +746,7 @@ _bt_restore_array_keys(IndexScanDesc scan)
  * new elements of array keys.  Therefore we can't overwrite the source data.
  */
 void
-_bt_preprocess_keys(IndexScanDesc scan)
+_bt_preprocess_keys(IndexScanDesc scan, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	int			numberOfKeys = scan->numberOfKeys;
@@ -761,9 +761,12 @@ _bt_preprocess_keys(IndexScanDesc scan)
 	int			i,
 				j;
 	AttrNumber	attno;
+	bool		low_boundary, high_boundary;
 
 	/* initialize result variables */
 	so->qual_ok = true;
+	so->is_range_search = true;
+	so->has_matches = false;
 	so->numberOfKeys = 0;
 
 	if (numberOfKeys < 1)
@@ -794,6 +797,11 @@ _bt_preprocess_keys(IndexScanDesc scan)
 		/* We can mark the qual as required if it's for first index col */
 		if (cur->sk_attno == 1)
 			_bt_mark_scankey_required(outkeys);
+
+		/* Check if we can perform range search optimization */
+		so->is_range_search =
+			(ScanDirectionIsForward(dir) & ((outkeys->sk_flags & (SK_ISNULL|SK_ROW_HEADER|SK_BT_REQFWD)) == SK_BT_REQFWD)) |
+			(ScanDirectionIsBackward(dir) & ((outkeys->sk_flags & (SK_ISNULL|SK_ROW_HEADER|SK_BT_REQBKWD)) == SK_BT_REQBKWD));
 		return;
 	}
 
@@ -1009,6 +1017,21 @@ _bt_preprocess_keys(IndexScanDesc scan)
 		}
 	}
 
+	/* Check if we can perform range search optimization */
+	if (new_numberOfKeys == 2)
+	{
+		low_boundary = false;
+		high_boundary = false;
+		for (i = 0; i < 2; i++)
+		{
+			ScanKey		key = &outkeys[i];
+			if ((key->sk_flags & (SK_ISNULL|SK_ROW_HEADER|SK_BT_REQFWD)) == SK_BT_REQFWD)
+				high_boundary = true;
+			else if ((key->sk_flags & (SK_ISNULL|SK_ROW_HEADER|SK_BT_REQBKWD)) == SK_BT_REQBKWD)
+				low_boundary = true;
+		}
+		so->is_range_search = low_boundary & high_boundary;
+	}
 	so->numberOfKeys = new_numberOfKeys;
 }
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d684786095..160f45f857 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1035,6 +1035,8 @@ typedef struct BTScanOpaqueData
 {
 	/* these fields are set by _bt_preprocess_keys(): */
 	bool		qual_ok;		/* false if qual can never be satisfied */
+	bool		is_range_search;/* true if the search is limited by an upper bound */
+	bool		has_matches;    /* true if index scan already found some matches */
 	int			numberOfKeys;	/* number of preprocessed scan keys */
 	ScanKey		keyData;		/* array of preprocessed scan keys */
 
@@ -1250,7 +1252,7 @@ extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
 extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
-extern void _bt_preprocess_keys(IndexScanDesc scan);
+extern void _bt_preprocess_keys(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
 						  int tupnatts, ScanDirection dir, bool *continuescan);
 extern void _bt_killitems(IndexScanDesc scan);
#2Alexander Korotkov
aekorotkov@gmail.com
In reply to: Konstantin Knizhnik (#1)
1 attachment(s)
Re: Index range search optimization

Hi!

On Fri, Jun 23, 2023 at 10:36 AM Konstantin Knizhnik <knizhnik@garret.ru>
wrote:

_bt_readpage performs key check for each item on the page trying to locate
upper boundary.
While comparison of simple integer keys are very fast, comparison of long
strings can be quite expensive.
We can first make check for the largest key on the page and if it is not
larger than upper boundary, then skip checks for all elements.

At this quite artificial example such optimization gives 3x time speed-up:

create table t(t text primary key);
insert into t values ('primary key-'||generate_series(1,10000000)::text);
select count(*) from t where t between 'primary key-1000000' and 'primary key-2000000';

At my notebook with large enough shared buffers and disabled concurrency
the difference is 83 vs. 247 msec
For integer keys the difference is much smaller: 69 vs. 82 msec

Certainly I realized that this example is quite exotic: most of DBAs
prefer integer keys and such large ranges are quite rare.
But still such large range queries are used.
And I have checked that the proposed patch doesn't cause slowdown of exact
search.

Neat optimization! But I wonder if we could do even better. The attached
patch allows Postgres to skip scan keys required for directional scans
(even when other keys are present in the scan). I'll soon post the testing
results and a more polished version of this patch.

------
Regards,
Alexander Korotkov

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v1.patch (application/octet-stream; name=0001-Skip-checking-of-scan-keys-required-for-direction-v1.patch) — Download
From 158a75ab801587a618e06551daefdd40a320eee9 Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu, 14 Sep 2023 13:18:09 +0300
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Reported-by:
Bug:
Discussion:
Author:
Reviewed-by:
Tested-by:
Backpatch-through:
---
 src/backend/access/nbtree/nbtsearch.c | 30 ++++++++++++++++++++++++---
 src/backend/access/nbtree/nbtutils.c  | 25 ++++++++++++----------
 src/include/access/nbtree.h           |  6 +++++-
 3 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..cd72818f28f 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredDirMatched;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,26 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Preprocess scan keys required for direction scan with last item on the
+	 * page.  If those keys are matched, we can skip matching them to every
+	 * item.  Skip this for the first page in the scan to evade slowdown of
+	 * point queries.
+	 */
+	if (!so->firstPage)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+		(void) _bt_checkkeys(scan, itup, indnatts, dir, &requiredDirMatched, false);
+	}
+	else
+	{
+		so->firstPage = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1616,7 +1638,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan,
+							  requiredDirMatched))
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1696,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan,
+						  requiredDirMatched);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1749,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredDirMatched);
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..696641fe5aa 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredDirMatched: indicates that scan keys required for direction scan
+ *					   are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredDirMatched)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,14 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredDir = false;
+
+		if (key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD) &&
+			!(key->sk_flags & SK_ROW_MEMBER))
+			requiredDir = true;
+
+		if (requiredDir && requiredDirMatched)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1440,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredDir)
 				*continuescan = false;
 
 			/*
@@ -1498,11 +1505,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964ca0..66e6a7775bf 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1253,7 +1256,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatched);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

#3Peter Geoghegan
pg@bowt.ie
In reply to: Alexander Korotkov (#2)
Re: Index range search optimization

On Thu, Sep 14, 2023 at 3:23 AM Alexander Korotkov <aekorotkov@gmail.com> wrote:

The attached patch allows Postgres to skip scan keys required for directional scans (even when other keys are present in the scan). I'll soon post the testing results and a more polished version of this patch.

This is very interesting to me, partly because it seems related to my
ongoing work on SAOP execution within nbtree.

My patch gives _bt_readpage and particularly _bt_checkkeys more
high-level context, which they use to intelligently control the scan.
That enables us to dynamically decide whether we should now perform
another descent of the index via another call to _bt_first, or if we
should prefer to continue on the leaf level for now. Maybe we will
match many distinct sets of array keys on the same leaf page, in the
same call to _bt_readpage. We don't want to miss out on such
opportunities, but we also want to quickly notice when we're on a page
where matching any more array keys is just hopeless.

There is a need to keep these two things in balance. We need to notice
the hopeless cases before wasting too many cycles on it. That creates
a practical need to do an early precheck of the high key (roughly the
same check that we do already). If the high key indicates that
continuing on this page is truly hopeless, then we should give up and
do another primitive index scan -- _bt_first will reposition us onto
the leaf page that we need to go to next, which is (hopefully) far
away from the leaf page we started on.

Your patch therefore has the potential to help my own patch. But, it
also has some potential to conflict with it, because my patch makes
the meaning of SK_BT_REQFWD and SK_BT_REQBKWD more complicated (though
only in cases where we have SK_SEARCHARRAY scan keys). I'm sure that
this can be managed sensibly, though.

Some feedback on your patch:

* I notice that you're not using the high key for this, even in a
forward scan -- you're using the last non-pivot tuple on the page. Why
is that? (I have some idea why, actually, but I'd like to hear your
thoughts first.)

* Separately, I don't think that it makes sense to use the same
requiredDirMatched value (which came from the last non-pivot tuple on
the page) when the special _bt_checkkeys call for the high key takes
place. I don't think that this will lead to wrong answers, but it's
weird, and is likely to defeat the existing optimization in some
important cases.

Due to the influence of suffix truncation, it's relatively likely that
the most significant column in the high key will be different to the
corresponding value from the last few non-pivot tuples on the page --
the high key tends to be "aligned with natural boundaries in the key
space", and so "gives us a good preview of the right sibling page". We
don't want to treat it the same as non-pivot tuples here, because it's
quite different, in ways that are subtle but still important.

* I would avoid using the terminology "preprocess scan keys" for this.
That exact terminology is already used to describe what
_bt_preprocess_keys() does.

That function is actually involved in Konstantin's patch, so that
could be very confusing. When we "preprocess" a scan key, it outputs a
processed scan key with markings such as the required markings that
you're using in the patch -- it's something that acts on/changes the
scan keys themselves. Whereas your patch is exploiting information
from already-processed scan keys, by applying it to the key space of a
page.

I suggest calling it "prechecking the page", or something like that. I
don't feel very strongly about what you call it, provided it isn't
confusing or ambiguous.

--
Peter Geoghegan

#4Alexander Korotkov
aekorotkov@gmail.com
In reply to: Peter Geoghegan (#3)
1 attachment(s)
Re: Index range search optimization

Hi, Peter!

Thank you for your interest in this patch.

On Tue, Sep 19, 2023 at 1:48 AM Peter Geoghegan <pg@bowt.ie> wrote:

On Thu, Sep 14, 2023 at 3:23 AM Alexander Korotkov <aekorotkov@gmail.com> wrote:

The attached patch allows Postgres to skip scan keys required for directional scans (even when other keys are present in the scan). I'll soon post the testing results and a more polished version of this patch.

This is very interesting to me, partly because it seems related to my
ongoing work on SAOP execution within nbtree.

My patch gives _bt_readpage and particularly _bt_checkkeys more
high-level context, which they use to intelligently control the scan.
That enables us to dynamically decide whether we should now perform
another descent of the index via another call to _bt_first, or if we
should prefer to continue on the leaf level for now. Maybe we will
match many distinct sets of array keys on the same leaf page, in the
same call to _bt_readpage. We don't want to miss out on such
opportunities, but we also want to quickly notice when we're on a page
where matching any more array keys is just hopeless.

There is a need to keep these two things in balance. We need to notice
the hopeless cases before wasting too many cycles on it. That creates
a practical need to do an early precheck of the high key (roughly the
same check that we do already). If the high key indicates that
continuing on this page is truly hopeless, then we should give up and
do another primitive index scan -- _bt_first will reposition us onto
the leaf page that we need to go to next, which is (hopefully) far
away from the leaf page we started on.

This is a pretty neat optimization indeed!

Your patch therefore has the potential to help my own patch. But, it
also has some potential to conflict with it, because my patch makes
the meaning of SK_BT_REQFWD and SK_BT_REQBKWD more complicated (though
only in cases where we have SK_SEARCHARRAY scan keys). I'm sure that
this can be managed sensibly, though.

OK! Let me know if you feel that I need to change something in this
patch to lower the potential conflict.

Some feedback on your patch:

* I notice that you're not using the high key for this, even in a
forward scan -- you're using the last non-pivot tuple on the page. Why
is that? (I have some idea why, actually, but I'd like to hear your
thoughts first.)

I'm using the last non-pivot tuple on the page instead of hikey
because it's lower than hikey. As you stated below, the most
significant column in the hikey is likely different from that of the
last non-pivot tuple. So, it's more likely to use the optimization
with the last non-pivot tuple.

* Separately, I don't think that it makes sense to use the same
requiredDirMatched value (which came from the last non-pivot tuple on
the page) when the special _bt_checkkeys call for the high key takes
place. I don't think that this will lead to wrong answers, but it's
weird, and is likely to defeat the existing optimization in some
important cases.

Due to the influence of suffix truncation, it's relatively likely that
the most significant column in the high key will be different to the
corresponding value from the last few non-pivot tuples on the page --
the high key tends to be "aligned with natural boundaries in the key
space", and so "gives us a good preview of the right sibling page". We
don't want to treat it the same as non-pivot tuples here, because it's
quite different, in ways that are subtle but still important.

This definitely makes sense. I've removed the usage of
requiredDirMatched from this _bt_checkkeys() call.

* I would avoid using the terminology "preprocess scan keys" for this.
That exact terminology is already used to describe what
_bt_preprocess_keys() does.

That function is actually involved in Konstantin's patch, so that
could be very confusing. When we "preprocess" a scan key, it outputs a
processed scan key with markings such as the required markings that
you're using in the patch -- it's something that acts on/changes the
scan keys themselves. Whereas your patch is exploiting information
from already-processed scan keys, by applying it to the key space of a
page.

I suggest calling it "prechecking the page", or something like that. I
don't feel very strongly about what you call it, provided it isn't
confusing or ambiguous.

This also makes sense. I've rephrased the comment.

------
Regards,
Alexander Korotkov

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v2.patch (application/octet-stream; name=0001-Skip-checking-of-scan-keys-required-for-direction-v2.patch) — Download
From 44609643f1f0c490bf71019ed4187adac079f4c9 Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu, 14 Sep 2023 13:18:09 +0300
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Reported-by:
Bug:
Discussion:
Author:
Reviewed-by:
Tested-by:
Backpatch-through:
---
 src/backend/access/nbtree/nbtsearch.c | 29 ++++++++++++++++++++++++---
 src/backend/access/nbtree/nbtutils.c  | 25 +++++++++++++----------
 src/include/access/nbtree.h           |  6 +++++-
 3 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..1162765ab16 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredDirMatched;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,26 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Prechecking the page with scan keys required for direction scan.  If
+	 * those keys are matched with the last item on the page, we can skip
+	 * matching them to every item.  Skip this for the first page in the scan
+	 * to evade slowdown of point queries.
+	 */
+	if (!so->firstPage)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+		(void) _bt_checkkeys(scan, itup, indnatts, dir, &requiredDirMatched, false);
+	}
+	else
+	{
+		so->firstPage = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1616,7 +1638,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan,
+							  requiredDirMatched))
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1696,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1748,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredDirMatched);
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..696641fe5aa 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredDirMatched: indicates that scan keys required for direction scan
+ *					   are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredDirMatched)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,14 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredDir = false;
+
+		if (key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD) &&
+			!(key->sk_flags & SK_ROW_MEMBER))
+			requiredDir = true;
+
+		if (requiredDir && requiredDirMatched)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1440,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredDir)
 				*continuescan = false;
 
 			/*
@@ -1498,11 +1505,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964ca0..66e6a7775bf 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1253,7 +1256,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatched);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

#5Alexander Korotkov
aekorotkov@gmail.com
In reply to: Alexander Korotkov (#4)
1 attachment(s)
Re: Index range search optimization

On Wed, Sep 20, 2023 at 5:07 PM Alexander Korotkov <aekorotkov@gmail.com> wrote:

On Tue, Sep 19, 2023 at 1:48 AM Peter Geoghegan <pg@bowt.ie> wrote:
This also makes sense. I've rephrased the comment.

The revised patch is attached. It contains better comments and the
commit message. Peter, could you please check if you're OK with this?

------
Regards,
Alexander Korotkov

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v3.patchapplication/octet-stream; name=0001-Skip-checking-of-scan-keys-required-for-direction-v3.patchDownload
From e98287b05969e776236dad3ed5f16738842822b5 Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu, 14 Sep 2023 13:18:09 +0300
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Currently, B-tree code matches every scan key to every item on the page.
Imagine the ordered B-tree scan for the query like this.

SELECT * FROM tbl WHERE col > 'a' AND col < 'b' ORDER BY col;

The (col > 'a') scan key will be always matched once we find the location to
start the scan.  The (col < 'b') scan key will match every item on the page
as long as it matches the last item on the page.

This patch implements prechecking of the scan keys required for directional
scan on beginning of page scan.  If precheck is successful we can skip this
scan keys check for the items on the page.  That could lead to significant
acceleration especially if the comparison operator is expensive.

Idea from patch by Konstantin Knizhnik.

Discussion: https://postgr.es/m/079c3f8e-3371-abe2-e93c-fc8a0ae3f571%40garret.ru
Reviewed-by: Peter Geoghegan
---
 src/backend/access/nbtree/nbtsearch.c | 40 +++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtutils.c  | 31 +++++++++++++--------
 src/include/access/nbtree.h           |  6 +++-
 3 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..2a1d1505df3 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredDirMatched;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,37 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Prechecking the page with scan keys required for directional scan.  We
+	 * check these keys with the last item on the page (according to our scan
+	 * direction).  If these keys are matched, we can skip checking them with
+	 * every item on the page.  Scan keys for our scan direction would
+	 * necessarily match the previous items.  Scan keys required for opposite
+	 * direction scan are already matched by the _bt_first() call.
+	 *
+	 * With the forward scan, we do this check for the last item on the page
+	 * instead of the high key.  It's relatively likely that the most
+	 * significant column in the high key will be different from the
+	 * corresponding value from the last item on the page.  So checking with
+	 * the last item on the page would give a more precise answer.
+	 *
+	 * We skip this for the first page in the scan to avoid the possible
+	 * slowdown of the point queries.
+	 */
+	if (!so->firstPage)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+		(void) _bt_checkkeys(scan, itup, indnatts, dir, &requiredDirMatched, false);
+	}
+	else
+	{
+		so->firstPage = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1616,7 +1649,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan,
+							  requiredDirMatched))
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1707,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1759,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredDirMatched);
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..96c1674c82e 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredDirMatched: indicates that scan keys required for direction scan
+ *					   are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredDirMatched)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,20 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredDir = false;
+
+		/*
+		 * Is the key required for scanning for either forward or backward
+		 * direction?  If so and called told us that these types of keys are
+		 * known to be matched, skip the check.  Except for the row keys,
+		 * where NULLs could be found in the middle of matching values.
+		 */
+		if (key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD) &&
+			!(key->sk_flags & SK_ROW_HEADER))
+			requiredDir = true;
+
+		if (requiredDir && requiredDirMatched)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1446,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredDir)
 				*continuescan = false;
 
 			/*
@@ -1498,11 +1511,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964ca0..66e6a7775bf 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1253,7 +1256,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatched);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

#6Pavel Borisov
pashkin.elfe@gmail.com
In reply to: Alexander Korotkov (#5)
Re: Index range search optimization

On Thu, 21 Sept 2023 at 15:17, Alexander Korotkov <aekorotkov@gmail.com> wrote:

On Wed, Sep 20, 2023 at 5:07 PM Alexander Korotkov <aekorotkov@gmail.com> wrote:

On Tue, Sep 19, 2023 at 1:48 AM Peter Geoghegan <pg@bowt.ie> wrote:
This also makes sense. I've rephrased the comment.

The revised patch is attached. It contains better comments and the
commit message. Peter, could you please check if you're OK with this?

Hi, Alexander!

I looked at the patch code and I agree with this optimization.
Implementation also looks good to me except change :
+ if (key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD) &&
+ !(key->sk_flags & SK_ROW_HEADER))
+ requiredDir = true;
...
- if ((key->sk_flags & SK_BT_REQFWD) &&
- ScanDirectionIsForward(dir))
- *continuescan = false;
- else if ((key->sk_flags & SK_BT_REQBKWD) &&
- ScanDirectionIsBackward(dir))
+ if (requiredDir)
  *continuescan = false;

looks like changing behavior in the case when key->sk_flags &
SK_BT_REQFWD && (! ScanDirectionIsForward(dir)) &&
(!requiredDirMatched)
Originally it doesn't set *continuescan = false; and with the patch it will set.

This may be relevant for the first page when requiredDirMatched is
intentionally skipped to be set and for call
_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);

Maybe I missed something and this can not appear for some reason?

Also naming of requiredDirMatched and requiredDir seems semantically
hard to understand the meaning without looking at the patch commit
message. But I don't have better proposals yet, so maybe it's
acceptable.

Kind regards,
Pavel Borisov
Supabase.

In reply to: Pavel Borisov (#6)
Re: Index range search optimization

On Thu, Sep 21, 2023 at 5:11 AM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

I looked at the patch code and I agree with this optimization.
Implementation also looks good to me except change :
+ if (key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD) &&
+ !(key->sk_flags & SK_ROW_HEADER))
+ requiredDir = true;
...
- if ((key->sk_flags & SK_BT_REQFWD) &&
- ScanDirectionIsForward(dir))
- *continuescan = false;
- else if ((key->sk_flags & SK_BT_REQBKWD) &&
- ScanDirectionIsBackward(dir))
+ if (requiredDir)
*continuescan = false;

looks like changing behavior in the case when key->sk_flags &
SK_BT_REQFWD && (! ScanDirectionIsForward(dir)) &&
(!requiredDirMatched)
Originally it doesn't set *continuescan = false; and with the patch it will set.

I agree that this is a problem. Inequality strategy scan keys are used
when the initial positioning strategy used by _bt_first (for its
_bt_search call) is based on an operator other than the "=" operator
for the opclass. These scan keys are required in one direction only
(Konstantin's original patch just focussed on these cases, actually).
Obviously, that difference matters. I don't think that this patch
should do anything that even looks like it might be revising the
formal definition of "required in the current scan direction".

Why is SK_ROW_HEADER treated as a special case by the patch? Could it
be related to the issues with required-ness and scan direction? Note
that we never use BTEqualStrategyNumber for SK_ROW_HEADER scan key row
comparisons, so they're only ever required for one scan direction.
(Equality-type row constructor syntax can of course be used without
preventing the system from using an index scan, but the nbtree code
will not see that case as a row comparison in the first place. This is
due to preprocessing by the planner -- nbtree just sees conventional
scan keys with multiple simple equality scan keys with = row
comparisons.)

Also, what about NULLs? While "key IS NULL" is classified as an
equality check (see _bt_preprocess_keys comments), the same isn't true
with "key IS NOT NULL". The latter case usually has scan key flags
"SK_ISNULL | SK_SEARCHNOTNULL | SK_BT_REQFWD" -- there is no
SK_BT_REQBKWD here.

This may be relevant for the first page when requiredDirMatched is
intentionally skipped to be set and for call
_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);

Also, requiredDirMatched isn't initialized by _bt_readpage() when
"so->firstPage". Shouldn't it be initialized to false?

Also, don't we need to take more care with a fully empty page? The "if
(!so->firstPage) ... " block should be gated using a condition such as
"if (!so->firstPage && minoff < maxoff)". (Adding a "minoff <= maxoff"
test would also work, but then the optimization will get applied on
pages with only one non-pivot tuple. That would be harmless, but a
waste of cycles.)

Also naming of requiredDirMatched and requiredDir seems semantically
hard to understand the meaning without looking at the patch commit
message. But I don't have better proposals yet, so maybe it's
acceptable.

I agree. How about "requiredMatchedByPrecheck" instead of
"requiredDirMatched", and "required" instead of "requiredDir"?

It would be nice if this patch worked in a way that could be verified
by an assertion. Under this scheme, the optimization would only really
be used in release builds (builds without assertions enabled, really).
We'd only verify that the optimized case agreed with the slow path in
assert-enabled builds. It might also make sense to always "apply the
optimization" on assert-enabled builds, even for the first page seen
by _bt_readpage by any _bt_first-wise scan. Maybe this sort of
approach is impractical here for some reason, but I don't see why it
should be.

Obviously, the optimization should lower the amount of work in some
calls to _bt_checkkeys, without ever changing the answer _bt_checkkeys
gives. Ideally, it should be done in a way that makes that very
obvious. There are some very subtle interactions between _bt_checkkeys
and other, distant code -- which makes me feel paranoid. Notably, with
required equality strategy scan keys, we're crucially dependent on
_bt_first using an equality strategy for its initial positioning call
to _bt_search. This is described by comments in both _bt_checkkeys and
in _bt_first.

Note, in particular, that it is essential that the initial offnum
passed to _bt_readpage doesn't allow a call to _bt_checkkeys to take
place that could cause it to become confused by a required equality
strategy scan key, leading to _bt_checkkeys terminating the whole scan
"early" -- causing wrong answers. For a query "WHERE foo = 5" (and a
forward scan), we had better not pass _bt_readpage an offset number
for a tuple with "foo" value 4. If that is ever allowed then
_bt_checkkeys will terminate the scan immediately, leading to wrong
answers. All because _bt_checkkeys can't tell if 4 comes before 5 or
comes after 5 -- it only has an "=" operator to work with, so it can't
actually make this distinction, so it likes to assume that anything !=
5 must come after 5 (or before 5 during a backwards scan).

I added a very similar _bt_compare()-based assertion in
_bt_check_unique(), which went on to catch a very subtle bug in the
Postgres 12 nbtree work -- the bug fixed by commit 74eb2176bf. So I
have put this particular idea about asserting agreement between a fast
path and a slow comparison path into practice already.

--
Peter Geoghegan

#8Pavel Borisov
pashkin.elfe@gmail.com
In reply to: Peter Geoghegan (#7)
Re: Index range search optimization

On Fri, 22 Sept 2023 at 00:48, Peter Geoghegan <pg@bowt.ie> wrote:

On Thu, Sep 21, 2023 at 5:11 AM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

I looked at the patch code and I agree with this optimization.
Implementation also looks good to me except change :
+ if (key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD) &&
+ !(key->sk_flags & SK_ROW_HEADER))
+ requiredDir = true;
...
- if ((key->sk_flags & SK_BT_REQFWD) &&
- ScanDirectionIsForward(dir))
- *continuescan = false;
- else if ((key->sk_flags & SK_BT_REQBKWD) &&
- ScanDirectionIsBackward(dir))
+ if (requiredDir)
*continuescan = false;

looks like changing behavior in the case when key->sk_flags &
SK_BT_REQFWD && (! ScanDirectionIsForward(dir)) &&
(!requiredDirMatched)
Originally it doesn't set *continuescan = false; and with the patch it will set.

I agree that this is a problem. Inequality strategy scan keys are used
when the initial positioning strategy used by _bt_first (for its
_bt_search call) is based on an operator other than the "=" operator
for the opclass. These scan keys are required in one direction only
(Konstantin's original patch just focussed on these cases, actually).
Obviously, that difference matters. I don't think that this patch
should do anything that even looks like it might be revising the
formal definition of "required in the current scan direction".

I think it's the simplification that changed code behavior - just an
overlook and this could be fixed easily.

Also, requiredDirMatched isn't initialized by _bt_readpage() when
"so->firstPage". Shouldn't it be initialized to false?

True.

Also naming of requiredDirMatched and requiredDir seems semantically
hard to understand the meaning without looking at the patch commit
message. But I don't have better proposals yet, so maybe it's
acceptable.

I agree. How about "requiredMatchedByPrecheck" instead of
"requiredDirMatched", and "required" instead of "requiredDir"?

For me, the main semantic meaning is omitted and even more unclear,
i.e. what exactly required and matched. I'd suppose scanDirRequired,
scanDirMatched, but feel it's not ideal either. Or maybe trySkipRange,
canSkipRange etc.

Regards,
Pavel Borisov,
Supabase.

#9Alexander Korotkov
aekorotkov@gmail.com
In reply to: Peter Geoghegan (#7)
1 attachment(s)
Re: Index range search optimization

Hi Peter,
Hi Pavel,

The v4 of the patch is attached.

On Thu, Sep 21, 2023 at 11:48 PM Peter Geoghegan <pg@bowt.ie> wrote:

On Thu, Sep 21, 2023 at 5:11 AM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

I looked at the patch code and I agree with this optimization.
Implementation also looks good to me except change :
+ if (key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD) &&
+ !(key->sk_flags & SK_ROW_HEADER))
+ requiredDir = true;
...
- if ((key->sk_flags & SK_BT_REQFWD) &&
- ScanDirectionIsForward(dir))
- *continuescan = false;
- else if ((key->sk_flags & SK_BT_REQBKWD) &&
- ScanDirectionIsBackward(dir))
+ if (requiredDir)
*continuescan = false;

looks like changing behavior in the case when key->sk_flags &
SK_BT_REQFWD && (! ScanDirectionIsForward(dir)) &&
(!requiredDirMatched)
Originally it doesn't set *continuescan = false; and with the patch it will set.

I agree that this is a problem. Inequality strategy scan keys are used
when the initial positioning strategy used by _bt_first (for its
_bt_search call) is based on an operator other than the "=" operator
for the opclass. These scan keys are required in one direction only
(Konstantin's original patch just focussed on these cases, actually).
Obviously, that difference matters. I don't think that this patch
should do anything that even looks like it might be revising the
formal definition of "required in the current scan direction".

Sorry, that was messed up from various attempts to write the patch.
Actually, I end up with two boolean variables indicating whether the
current key is required for the same direction or opposite direction
scan. I believe that the key required for the opposite direction scan
should be already satisfied by _bt_first() except for NULLs case.
I've implemented a skip of calling the key function for this case
(with assert that result is the same).

Why is SK_ROW_HEADER treated as a special case by the patch? Could it
be related to the issues with required-ness and scan direction? Note
that we never use BTEqualStrategyNumber for SK_ROW_HEADER scan key row
comparisons, so they're only ever required for one scan direction.
(Equality-type row constructor syntax can of course be used without
preventing the system from using an index scan, but the nbtree code
will not see that case as a row comparison in the first place. This is
due to preprocessing by the planner -- nbtree just sees conventional
scan keys with multiple simple equality scan keys with = row
comparisons.)

The thing is that NULLs could appear in the middle of matching values.

# WITH t (a, b) AS (VALUES ('a', 'b'), ('a', NULL), ('b', 'a'))
SELECT a, b, (a, b) > ('a', 'a') FROM t ORDER BY (a, b);
a | b | ?column?
---+------+----------
a | b | t
a | NULL | NULL
b | a | t
(3 rows)

So we can't just skip the row comparison operator, because we can meet
NULL at any place.

This may be relevant for the first page when requiredDirMatched is
intentionally skipped to be set and for call
_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);

Also, requiredDirMatched isn't initialized by _bt_readpage() when
"so->firstPage". Shouldn't it be initialized to false?

Also, don't we need to take more care with a fully empty page? The "if
(!so->firstPage) ... " block should be gated using a condition such as
"if (!so->firstPage && minoff < maxoff)". (Adding a "minoff <= maxoff"
test would also work, but then the optimization will get applied on
pages with only one non-pivot tuple. That would be harmless, but a
waste of cycles.)

This makes sense. I've added (minoff < maxoff) to the condition.

Also naming of requiredDirMatched and requiredDir seems semantically
hard to understand the meaning without looking at the patch commit
message. But I don't have better proposals yet, so maybe it's
acceptable.

I agree. How about "requiredMatchedByPrecheck" instead of
"requiredDirMatched", and "required" instead of "requiredDir"?

It would be nice if this patch worked in a way that could be verified
by an assertion. Under this scheme, the optimization would only really
be used in release builds (builds without assertions enabled, really).
We'd only verify that the optimized case agreed with the slow path in
assert-enabled builds. It might also make sense to always "apply the
optimization" on assert-enabled builds, even for the first page seen
by _bt_readpage by any _bt_first-wise scan. Maybe this sort of
approach is impractical here for some reason, but I don't see why it
should be.

Yes, this makes sense. I've added an assert check that results are
the same as with requiredMatchedByPrecheck == false.

Obviously, the optimization should lower the amount of work in some
calls to _bt_checkkeys, without ever changing the answer _bt_checkkeys
gives. Ideally, it should be done in a way that makes that very
obvious. There are some very subtle interactions between _bt_checkkeys
and other, distant code -- which makes me feel paranoid. Notably, with
required equality strategy scan keys, we're crucially dependent on
_bt_first using an equality strategy for its initial positioning call
to _bt_search. This is described by comments in both _bt_checkkeys and
in _bt_first.

Note, in particular, that it is essential that the initial offnum
passed to _bt_readpage doesn't allow a call to _bt_checkkeys to take
place that could cause it to become confused by a required equality
strategy scan key, leading to _bt_checkkeys terminating the whole scan
"early" -- causing wrong answers. For a query "WHERE foo = 5" (and a
forward scan), we had better not pass _bt_readpage an offset number
for a tuple with "foo" value 4. If that is ever allowed then
_bt_checkkeys will terminate the scan immediately, leading to wrong
answers. All because _bt_checkkeys can't tell if 4 comes before 5 or
comes after 5 -- it only has an "=" operator to work with, so it can't
actually make this distinction, so it likes to assume that anything !=
5 must come after 5 (or before 5 during a backwards scan).

I added a very similar _bt_compare()-based assertion in
_bt_check_unique(), which went on to catch a very subtle bug in the
Postgres 12 nbtree work -- the bug fixed by commit 74eb2176bf. So I
have put this particular idea about asserting agreement between a fast
path and a slow comparison path into practice already.

Good, thank you for the detailed clarification.

------
Regards,
Alexander Korotkov

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v4.patchapplication/x-patch; name=0001-Skip-checking-of-scan-keys-required-for-direction-v4.patchDownload
From 5afb4c1181c8d017ad854e54a4a4c75348e03334 Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu, 14 Sep 2023 13:18:09 +0300
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Currently, B-tree code matches every scan key to every item on the page.
Imagine the ordered B-tree scan for the query like this.

SELECT * FROM tbl WHERE col > 'a' AND col < 'b' ORDER BY col;

The (col > 'a') scan key will be always matched once we find the location to
start the scan.  The (col < 'b') scan key will match every item on the page
as long as it matches the last item on the page.

This patch implements prechecking of the scan keys required for directional
scan on beginning of page scan.  If precheck is successful we can skip this
scan keys check for the items on the page.  That could lead to significant
acceleration especially if the comparison operator is expensive.

Idea from patch by Konstantin Knizhnik.

Discussion: https://postgr.es/m/079c3f8e-3371-abe2-e93c-fc8a0ae3f571%40garret.ru
Reviewed-by: Peter Geoghegan
---
 src/backend/access/nbtree/nbtsearch.c | 61 +++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtutils.c  | 59 ++++++++++++++++++++------
 src/include/access/nbtree.h           |  6 ++-
 3 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..b8435729d9f 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredMatchedByPrecheck;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,38 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Prechecking the page with scan keys required for directional scan.  We
+	 * check these keys with the last item on the page (according to our scan
+	 * direction).  If these keys are matched, we can skip checking them with
+	 * every item on the page.  Scan keys for our scan direction would
+	 * necessarily match the previous items.  Scan keys required for opposite
+	 * direction scan are already matched by the _bt_first() call.
+	 *
+	 * With the forward scan, we do this check for the last item on the page
+	 * instead of the high key.  It's relatively likely that the most
+	 * significant column in the high key will be different from the
+	 * corresponding value from the last item on the page.  So checking with
+	 * the last item on the page would give a more precise answer.
+	 *
+	 * We skip this for the first page in the scan to avoid the possible
+	 * slowdown of the point queries.
+	 */
+	if (!so->firstPage && minoff < maxoff)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+		(void) _bt_checkkeys(scan, itup, indnatts, dir,
+							 &requiredMatchedByPrecheck, false);
+	}
+	else
+	{
+		so->firstPage = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1603,6 +1637,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	itup;
+			bool		passes_quals;
 
 			/*
 			 * If the scan specifies not to return killed tuples, then we
@@ -1616,7 +1651,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
+			if (passes_quals)
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1719,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1771,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..a2eb578b0d2 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredMatchedByPrecheck: indicates that scan keys required for
+ *							  directional scan are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredMatchedByPrecheck)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,29 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredSameDir = false,
+					requiredOppositeDir = false;
+
+		/*
+		 * Check if the key is required for ordered scan in the same or
+		 * opposite direction.  Save as flag variables for future usage.
+		 */
+		if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+			((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+			requiredSameDir = true;
+		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
+				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
+			requiredOppositeDir = true;
+
+		/*
+		 * Is the key required for scanning for either forward or backward
+		 * direction?  If so and the caller told us that these types of keys are
+		 * known to be matched, skip the check.  Except for the row keys,
+		 * where NULLs could be found in the middle of matching values.
+		 */
+		if ((requiredSameDir || requiredOppositeDir) &&
+			!(key->sk_flags & SK_ROW_HEADER) && requiredMatchedByPrecheck)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1455,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
@@ -1483,8 +1505,23 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
-								 datum, key->sk_argument);
+		/*
+		 * Apply the key checking function.  When the key is required for
+		 * opposite direction scan, it must be already satisfied by
+		 * _bt_first() except for the NULLs checking, which has already been
+		 * done above.
+		 */
+		if (!requiredOppositeDir)
+		{
+			test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+									 datum, key->sk_argument);
+		}
+		else
+		{
+			test = true;
+			Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
+											 datum, key->sk_argument));
+		}
 
 		if (!DatumGetBool(test))
 		{
@@ -1498,11 +1535,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964ca0..66e6a7775bf 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1253,7 +1256,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatched);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

#10Pavel Borisov
pashkin.elfe@gmail.com
In reply to: Alexander Korotkov (#9)
1 attachment(s)
Re: Index range search optimization

Hi, Alexander!

I found and fixed a couple of naming issues that came to v4 from
earlier patches.
Also, I added initialization of requiredMatchedByPrecheck in case of first page.

Please see patch v5.

One more doubt about naming. Calling function
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
ScanDirection dir, bool *continuescan, bool requiredMatchedByPrecheck)
as
(void) _bt_checkkeys(scan, itup, indnatts, dir,
&requiredMatchedByPrecheck, false);
looks a little bit misleading because of the coincidence of the names of the
5th and 6th arguments.

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v4.patchapplication/octet-stream; name=0001-Skip-checking-of-scan-keys-required-for-direction-v4.patchDownload
From 5afb4c1181c8d017ad854e54a4a4c75348e03334 Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu, 14 Sep 2023 13:18:09 +0300
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Currently, B-tree code matches every scan key to every item on the page.
Imagine the ordered B-tree scan for the query like this.

SELECT * FROM tbl WHERE col > 'a' AND col < 'b' ORDER BY col;

The (col > 'a') scan key will be always matched once we find the location to
start the scan.  The (col < 'b') scan key will match every item on the page
as long as it matches the last item on the page.

This patch implements prechecking of the scan keys required for directional
scan at the beginning of the page scan.  If the precheck is successful, we
can skip these scan key checks for the items on the page.  That could lead
to significant acceleration, especially if the comparison operator is
expensive.

Idea from patch by Konstantin Knizhnik.

Discussion: https://postgr.es/m/079c3f8e-3371-abe2-e93c-fc8a0ae3f571%40garret.ru
Reviewed-by: Peter Geoghegan
---
 src/backend/access/nbtree/nbtsearch.c | 61 +++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtutils.c  | 59 ++++++++++++++++++++------
 src/include/access/nbtree.h           |  6 ++-
 3 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..b8435729d9f 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredMatchedByPrecheck;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,38 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Precheck the page with the scan keys required for the scan direction.  We
+	 * check these keys with the last item on the page (according to our scan
+	 * direction).  If these keys are matched, we can skip checking them with
+	 * every item on the page.  Scan keys for our scan direction would
+	 * necessarily match the previous items.  Scan keys required for opposite
+	 * direction scan are already matched by the _bt_first() call.
+	 *
+	 * With the forward scan, we do this check for the last item on the page
+	 * instead of the high key.  It's relatively likely that the most
+	 * significant column in the high key will be different from the
+	 * corresponding value from the last item on the page.  So checking with
+	 * the last item on the page would give a more precise answer.
+	 *
+	 * We skip this for the first page in the scan to avoid a possible
+	 * slowdown of point queries.
+	 */
+	if (!so->firstPage && minoff < maxoff)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+		(void) _bt_checkkeys(scan, itup, indnatts, dir,
+							 &requiredMatchedByPrecheck, false);
+	}
+	else
+	{
+		so->firstPage = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1603,6 +1637,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	itup;
+			bool		passes_quals;
 
 			/*
 			 * If the scan specifies not to return killed tuples, then we
@@ -1616,7 +1651,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
+			if (passes_quals)
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1719,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1771,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..a2eb578b0d2 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredMatchedByPrecheck: indicates that scan keys required for the
+ *							  scan direction are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredMatchedByPrecheck)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,29 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredSameDir = false,
+					requiredOppositeDir = false;
+
+		/*
+		 * Check if the key is required for ordered scan in the same or
+		 * opposite direction.  Save as flag variables for future usage.
+		 */
+		if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+			((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+			requiredSameDir = true;
+		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
+				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
+			requiredOppositeDir = true;
+
+		/*
+		 * Is the key required for scanning for either forward or backward
+		 * direction?  If so and the caller told us that these types of keys are
+		 * known to be matched, skip the check.  Except for the row keys,
+		 * where NULLs could be found in the middle of matching values.
+		 */
+		if ((requiredSameDir || requiredOppositeDir) &&
+			!(key->sk_flags & SK_ROW_HEADER) && requiredMatchedByPrecheck)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1455,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
@@ -1483,8 +1505,23 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
-								 datum, key->sk_argument);
+		/*
+		 * Apply the key checking function.  When the key is required for
+		 * opposite direction scan, it must be already satisfied by
+		 * _bt_first() except for the NULLs checking, which has already been
+		 * done above.
+		 */
+		if (!requiredOppositeDir)
+		{
+			test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+									 datum, key->sk_argument);
+		}
+		else
+		{
+			test = true;
+			Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
+											 datum, key->sk_argument));
+		}
 
 		if (!DatumGetBool(test))
 		{
@@ -1498,11 +1535,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964ca0..66e6a7775bf 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1253,7 +1256,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatched);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

#11Pavel Borisov
pashkin.elfe@gmail.com
In reply to: Pavel Borisov (#10)
1 attachment(s)
Re: Index range search optimization

Sorry, I've mistaken with attached version previously. Correct v5 attached.

Show quoted text

On Mon, 25 Sept 2023 at 13:58, Pavel Borisov <pashkin.elfe@gmail.com> wrote:

Hi, Alexander!

I found and fixed a couple of naming issues that came to v4 from
earlier patches.
Also, I added initialization of requiredMatchedByPrecheck in case of first page.

Please see patch v5.

One more doubt about naming. Calling function
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
ScanDirection dir, bool *continuescan, bool requiredMatchedByPrecheck)
as
(void) _bt_checkkeys(scan, itup, indnatts, dir,
&requiredMatchedByPrecheck, false);
looks a little bit misleading because of the coincidence of the names of the
5th and 6th arguments.

Attachments:

v5-0001-PATCH-Skip-checking-of-scan-keys-required-for-dir.patchapplication/octet-stream; name=v5-0001-PATCH-Skip-checking-of-scan-keys-required-for-dir.patchDownload
From 59087260d132716f7d6882a5d28d68132437d52f Mon Sep 17 00:00:00 2001
From: Pavel Borisov <pashkin.elfe@gmail.com>
Date: Mon, 25 Sep 2023 13:45:22 +0400
Subject: [PATCH v5] [PATCH] Skip checking of scan keys required for
 directional scan in  B-tree

Currently, B-tree code matches every scan key to every item on the page.
Imagine the ordered B-tree scan for the query like this.

SELECT * FROM tbl WHERE col > 'a' AND col < 'b' ORDER BY col;

The (col > 'a') scan key will be always matched once we find the location to
start the scan.  The (col < 'b') scan key will match every item on the page
as long as it matches the last item on the page.

This patch implements prechecking of the scan keys required for directional
scan at the beginning of the page scan.  If the precheck is successful, we
can skip these scan key checks for the items on the page.  That could lead
to significant acceleration, especially if the comparison operator is
expensive.

Idea from patch by Konstantin Knizhnik.

Discussion: https://postgr.es/m/079c3f8e-3371-abe2-e93c-fc8a0ae3f571%40garret.ru
Reviewed-by: Peter Geoghegan, Pavel Borisov
---
 src/backend/access/nbtree/nbtsearch.c | 62 +++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtutils.c  | 59 +++++++++++++++++++------
 src/include/access/nbtree.h           |  6 ++-
 3 files changed, 110 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 3230b3b8940..128b58f9967 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1441,6 +1441,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1551,6 +1552,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredMatchedByPrecheck;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1604,6 +1606,39 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Precheck the page with the scan keys required for the scan direction.  We
+	 * check these keys with the last item on the page (according to our scan
+	 * direction).  If these keys are matched, we can skip checking them with
+	 * every item on the page.  Scan keys for our scan direction would
+	 * necessarily match the previous items.  Scan keys required for opposite
+	 * direction scan are already matched by the _bt_first() call.
+	 *
+	 * With the forward scan, we do this check for the last item on the page
+	 * instead of the high key.  It's relatively likely that the most
+	 * significant column in the high key will be different from the
+	 * corresponding value from the last item on the page.  So checking with
+	 * the last item on the page would give a more precise answer.
+	 *
+	 * We skip this for the first page in the scan to avoid a possible
+	 * slowdown of point queries.
+	 */
+	if (!so->firstPage && minoff < maxoff)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+		(void) _bt_checkkeys(scan, itup, indnatts, dir,
+							 &requiredMatchedByPrecheck, false);
+	}
+	else
+	{
+		so->firstPage = false;
+		requiredMatchedByPrecheck = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1615,6 +1650,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	itup;
+			bool		passes_quals;
 
 			/*
 			 * If the scan specifies not to return killed tuples, then we
@@ -1628,7 +1664,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
+			if (passes_quals)
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1685,7 +1732,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1737,7 +1784,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..2a9c25b03ed 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredMatchedByPrecheck: indicates that scan keys required for
+ * 							  direction scan are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredMatchedByPrecheck)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,29 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredSameDir = false,
+					requiredOppositeDir = false;
+
+		/*
+		 * Check if the key is required for ordered scan in the same or
+		 * opposite direction.  Save as flag variables for future usage.
+		 */
+		if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+			((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+			requiredSameDir = true;
+		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
+				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
+			requiredOppositeDir = true;
+
+		/*
+		 * Is the key required for scanning for either forward or backward
+		 * direction?  If so and the caller told us that these types of keys are
+		 * known to be matched, skip the check.  Except for the row keys,
+		 * where NULLs could be found in the middle of matching values.
+		 */
+		if ((requiredSameDir || requiredOppositeDir) &&
+			!(key->sk_flags & SK_ROW_HEADER) && requiredMatchedByPrecheck)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1455,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
@@ -1483,8 +1505,23 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
-								 datum, key->sk_argument);
+		/*
+		 * Apply the key checking function.  When the key is required for
+		 * opposite direction scan, it must be already satisfied by
+		 * _bt_first() except for the NULLs checking, which has already been
+		 * done above.
+		 */
+		if (!requiredOppositeDir)
+		{
+			test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+									 datum, key->sk_argument);
+		}
+		else
+		{
+			test = true;
+			Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
+											 datum, key->sk_argument));
+		}
 
 		if (!DatumGetBool(test))
 		{
@@ -1498,11 +1535,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 8891fa79734..1b575ec40bc 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1254,7 +1257,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatchedByPrecheck);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.39.2 (Apple Git-143)

#12Alexander Korotkov
aekorotkov@gmail.com
In reply to: Pavel Borisov (#10)
1 attachment(s)
Re: Index range search optimization

On Mon, Sep 25, 2023 at 12:58 PM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

One more doubt about naming. Calling function
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
ScanDirection dir, bool *continuescan, bool requiredMatchedByPrecheck)
as
(void) _bt_checkkeys(scan, itup, indnatts, dir,
&requiredMatchedByPrecheck, false);
looks little bit misleading because of coincidence of names of 5 and 6
arguments.

I've added the comment clarifying this argument usage.

------
Regards,
Alexander Korotkov

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v6.patchapplication/octet-stream; name=0001-Skip-checking-of-scan-keys-required-for-direction-v6.patchDownload
From 3455afb0a7c60e59379a8cb9104fdeef52a510f0 Mon Sep 17 00:00:00 2001
From: Pavel Borisov <pashkin.elfe@gmail.com>
Date: Mon, 25 Sep 2023 13:45:22 +0400
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Currently, B-tree code matches every scan key to every item on the page.
Imagine the ordered B-tree scan for the query like this.

SELECT * FROM tbl WHERE col > 'a' AND col < 'b' ORDER BY col;

The (col > 'a') scan key will be always matched once we find the location to
start the scan.  The (col < 'b') scan key will match every item on the page
as long as it matches the last item on the page.

This patch implements prechecking of the scan keys required for directional
scan at the beginning of the page scan.  If the precheck is successful, we
can skip these scan key checks for the items on the page.  That could lead
to significant acceleration, especially if the comparison operator is
expensive.

Idea from patch by Konstantin Knizhnik.

Discussion: https://postgr.es/m/079c3f8e-3371-abe2-e93c-fc8a0ae3f571%40garret.ru
Reviewed-by: Peter Geoghegan, Pavel Borisov
---
 src/backend/access/nbtree/nbtsearch.c | 69 +++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtutils.c  | 59 ++++++++++++++++++-----
 src/include/access/nbtree.h           |  6 ++-
 3 files changed, 117 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..283190c7cf9 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredMatchedByPrecheck;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,46 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Precheck the page with the scan keys required for the scan direction.  We
+	 * check these keys with the last item on the page (according to our scan
+	 * direction).  If these keys are matched, we can skip checking them with
+	 * every item on the page.  Scan keys for our scan direction would
+	 * necessarily match the previous items.  Scan keys required for opposite
+	 * direction scan are already matched by the _bt_first() call.
+	 *
+	 * With the forward scan, we do this check for the last item on the page
+	 * instead of the high key.  It's relatively likely that the most
+	 * significant column in the high key will be different from the
+	 * corresponding value from the last item on the page.  So checking with
+	 * the last item on the page would give a more precise answer.
+	 *
+	 * We skip this for the first page in the scan to avoid a possible
+	 * slowdown of point queries.
+	 */
+	if (!so->firstPage && minoff < maxoff)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+
+		/*
+		 * Do the precheck.  Note that we pass the pointer to
+		 * 'requiredMatchedByPrecheck' to the 'continuescan' argument.  That
+		 * will set the flag to true if all required keys are satisfied and false
+		 * otherwise.
+		 */
+		(void) _bt_checkkeys(scan, itup, indnatts, dir,
+							 &requiredMatchedByPrecheck, false);
+	}
+	else
+	{
+		so->firstPage = false;
+		requiredMatchedByPrecheck = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1603,6 +1645,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	itup;
+			bool		passes_quals;
 
 			/*
 			 * If the scan specifies not to return killed tuples, then we
@@ -1616,7 +1659,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
+			if (passes_quals)
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1727,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1779,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..2a9c25b03ed 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredMatchedByPrecheck: indicates that scan keys required for
+ * 							  direction scan are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredMatchedByPrecheck)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,29 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredSameDir = false,
+					requiredOppositeDir = false;
+
+		/*
+		 * Check if the key is required for ordered scan in the same or
+		 * opposite direction.  Save as flag variables for future usage.
+		 */
+		if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+			((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+			requiredSameDir = true;
+		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
+				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
+			requiredOppositeDir = true;
+
+		/*
+		 * Is the key required for scanning for either forward or backward
+		 * direction?  If so and the caller told us that these types of keys are
+		 * known to be matched, skip the check.  Except for the row keys,
+		 * where NULLs could be found in the middle of matching values.
+		 */
+		if ((requiredSameDir || requiredOppositeDir) &&
+			!(key->sk_flags & SK_ROW_HEADER) && requiredMatchedByPrecheck)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1455,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
@@ -1483,8 +1505,23 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
-								 datum, key->sk_argument);
+		/*
+		 * Apply the key checking function.  When the key is required for
+		 * opposite direction scan, it must be already satisfied by
+		 * _bt_first() except for the NULLs checking, which has already been done
+		 * above.
+		 */
+		if (!requiredOppositeDir)
+		{
+			test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+									 datum, key->sk_argument);
+		}
+		else
+		{
+			test = true;
+			Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
+											 datum, key->sk_argument));
+		}
 
 		if (!DatumGetBool(test))
 		{
@@ -1498,11 +1535,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964ca0..6ac0157f3fa 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag inficating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1253,7 +1256,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatchedByPrecheck);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

#13Alexander Korotkov
aekorotkov@gmail.com
In reply to: Alexander Korotkov (#12)
1 attachment(s)
Re: Index range search optimization

On Mon, Sep 25, 2023 at 1:18 PM Alexander Korotkov <aekorotkov@gmail.com>
wrote:

On Mon, Sep 25, 2023 at 12:58 PM Pavel Borisov <pashkin.elfe@gmail.com>
wrote:

One more doubt about naming. Calling function
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
ScanDirection dir, bool *continuescan, bool requiredMatchedByPrecheck)
as
(void) _bt_checkkeys(scan, itup, indnatts, dir,
&requiredMatchedByPrecheck, false);
looks little bit misleading because of coincidence of names of 5 and 6
arguments.

I've added the comment clarifying this argument usage.

Fixed typo inficating => indicating as pointed by Pavel.
Peter, what do you think about the current shape of the patch?

------
Regards,
Alexander Korotkov

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v7.patchapplication/octet-stream; name=0001-Skip-checking-of-scan-keys-required-for-direction-v7.patchDownload
From b95d7881a5932b88d199511afad329b2f5c8147d Mon Sep 17 00:00:00 2001
From: Pavel Borisov <pashkin.elfe@gmail.com>
Date: Mon, 25 Sep 2023 13:45:22 +0400
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Currently, B-tree code matches every scan key to every item on the page.
Imagine the ordered B-tree scan for the query like this.

SELECT * FROM tbl WHERE col > 'a' AND col < 'b' ORDER BY col;

The (col > 'a') scan key will be always matched once we find the location to
start the scan.  The (col < 'b') scan key will match every item on the page
as long as it matches the last item on the page.

This patch implements prechecking of the scan keys required for directional
scan on beginning of page scan.  If precheck is successful we can skip this
scan keys check for the items on the page.  That could lead to significant
acceleration especially if the comparison operator is expensive.

Idea from patch by Konstantin Knizhnik.

Discussion: https://postgr.es/m/079c3f8e-3371-abe2-e93c-fc8a0ae3f571%40garret.ru
Reviewed-by: Peter Geoghegan, Pavel Borisov
---
 src/backend/access/nbtree/nbtsearch.c | 69 +++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtutils.c  | 59 ++++++++++++++++++-----
 src/include/access/nbtree.h           |  6 ++-
 3 files changed, 117 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..283190c7cf9 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredMatchedByPrecheck;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,46 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Prechecking the page with scan keys required for direction scan.  We
+	 * check these keys with the last item on the page (according to our scan
+	 * direction).  If these keys are matched, we can skip checking them with
+	 * every item on the page.  Scan keys for our scan direction would
+	 * necessarily match the previous items.  Scan keys required for opposite
+	 * direction scan are already matched by the _bt_first() call.
+	 *
+	 * With the forward scan, we do this check for the last item on the page
+	 * instead of the high key.  It's relatively likely that the most
+	 * significant column in the high key will be different from the
+	 * corresponding value from the last item on the page.  So checking with
+	 * the last item on the page would give a more precise answer.
+	 *
+	 * We skip this for the first page in the scan to evade the possible
+	 * slowdown of the point queries.
+	 */
+	if (!so->firstPage && minoff < maxoff)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+
+		/*
+		 * Do the precheck.  Note that we pass the pointer to
+		 * 'requiredMatchedByPrecheck' to the 'continuescan' argument.  That
+		 * will set the flag to true if all required keys are satisfied and false
+		 * otherwise.
+		 */
+		(void) _bt_checkkeys(scan, itup, indnatts, dir,
+							 &requiredMatchedByPrecheck, false);
+	}
+	else
+	{
+		so->firstPage = false;
+		requiredMatchedByPrecheck = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1603,6 +1645,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	itup;
+			bool		passes_quals;
 
 			/*
 			 * If the scan specifies not to return killed tuples, then we
@@ -1616,7 +1659,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
+			if (passes_quals)
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1727,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1779,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7da499c4dd5..2a9c25b03ed 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1357,10 +1357,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredMatchedByPrecheck: indicates that scan keys required for
+ * 							  direction scan are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredMatchedByPrecheck)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1381,6 +1384,29 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredSameDir = false,
+					requiredOppositeDir = false;
+
+		/*
+		 * Check if the key is required for ordered scan in the same or
+		 * opposite direction.  Save as flag variables for future usage.
+		 */
+		if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+			((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+			requiredSameDir = true;
+		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
+				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
+			requiredOppositeDir = true;
+
+		/*
+		 * Is the key required for scanning for either forward or backward
+		 * direction?  If so and the caller told us that these types of keys are
+		 * known to be matched, skip the check.  Except for the row keys,
+		 * where NULLs could be found in the middle of matching values.
+		 */
+		if ((requiredSameDir || requiredOppositeDir) &&
+			!(key->sk_flags & SK_ROW_HEADER) && requiredMatchedByPrecheck)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1429,11 +1455,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
@@ -1483,8 +1505,23 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
-								 datum, key->sk_argument);
+		/*
+		 * Apply the key checking function.  When the key is required for
+		 * opposite direction scan, it must be already satisfied by
+		 * _bt_first() except for the NULLs checking, which has already been done
+		 * above.
+		 */
+		if (!requiredOppositeDir)
+		{
+			test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+									 datum, key->sk_argument);
+		}
+		else
+		{
+			test = true;
+			Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
+											 datum, key->sk_argument));
+		}
 
 		if (!DatumGetBool(test))
 		{
@@ -1498,11 +1535,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index f5c66964ca0..14893653588 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1054,6 +1054,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1253,7 +1256,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatchedByPrecheck);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

In reply to: Alexander Korotkov (#13)
Re: Index range search optimization

On Wed, Sep 27, 2023 at 9:41 AM Alexander Korotkov <aekorotkov@gmail.com> wrote:

Fixed typo inficating => indicating as pointed by Pavel.
Peter, what do you think about the current shape of the patch?

I'll try to get to this tomorrow. I'm rather busy with moving home at
the moment, unfortunately.

--
Peter Geoghegan

#15Alexander Korotkov
aekorotkov@gmail.com
In reply to: Peter Geoghegan (#14)
Re: Index range search optimization

On Thu, Sep 28, 2023 at 5:21 AM Peter Geoghegan <pg@bowt.ie> wrote:

On Wed, Sep 27, 2023 at 9:41 AM Alexander Korotkov <aekorotkov@gmail.com> wrote:

Fixed typo inficating => indicating as pointed by Pavel.
Peter, what do you think about the current shape of the patch?

I'll try to get to this tomorrow. I'm rather busy with moving home at
the moment, unfortunately.

No problem, thank you!

------
Regards,
Alexander Korotkov

In reply to: Alexander Korotkov (#9)
Re: Index range search optimization

On Fri, Sep 22, 2023 at 7:24 AM Alexander Korotkov <aekorotkov@gmail.com> wrote:

The thing is that NULLs could appear in the middle of matching values.

# WITH t (a, b) AS (VALUES ('a', 'b'), ('a', NULL), ('b', 'a'))
SELECT a, b, (a, b) > ('a', 'a') FROM t ORDER BY (a, b);
a | b | ?column?
---+------+----------
a | b | t
a | NULL | NULL
b | a | t
(3 rows)

So we can't just skip the row comparison operator, because we can meet
NULL at any place.

But why would SK_ROW_HEADER be any different? Is it related to this
existing case inside _bt_check_rowcompare()?:

if (subkey->sk_flags & SK_ISNULL)
{
/*
* Unlike the simple-scankey case, this isn't a disallowed case.
* But it can never match. If all the earlier row comparison
* columns are required for the scan direction, we can stop the
* scan, because there can't be another tuple that will succeed.
*/
if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
subkey--;
if ((subkey->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
return false;
}

I noticed that you're not initializing so->firstPage correctly for the
_bt_endpoint() path, which is used when the initial position of the
scan is either the leftmost or rightmost page. That is, it's possible
to reach _bt_readpage() without having reached the point in
_bt_first() where you initialize so->firstPage to "true".

It would probably make sense if the flag was initialized to "false" in
the same way as most other scan state is already, somewhere in
nbtree.c. Probably in btrescan().

--
Peter Geoghegan

#17Alexander Korotkov
aekorotkov@gmail.com
In reply to: Peter Geoghegan (#16)
1 attachment(s)
Re: Index range search optimization

Hi, Peter.

On Fri, Sep 29, 2023 at 4:57 AM Peter Geoghegan <pg@bowt.ie> wrote:

On Fri, Sep 22, 2023 at 7:24 AM Alexander Korotkov <aekorotkov@gmail.com> wrote:

The thing is that NULLs could appear in the middle of matching values.

# WITH t (a, b) AS (VALUES ('a', 'b'), ('a', NULL), ('b', 'a'))
SELECT a, b, (a, b) > ('a', 'a') FROM t ORDER BY (a, b);
a | b | ?column?
---+------+----------
a | b | t
a | NULL | NULL
b | a | t
(3 rows)

So we can't just skip the row comparison operator, because we can meet
NULL at any place.

But why would SK_ROW_HEADER be any different? Is it related to this
existing case inside _bt_check_rowcompare()?:

if (subkey->sk_flags & SK_ISNULL)
{
/*
* Unlike the simple-scankey case, this isn't a disallowed case.
* But it can never match. If all the earlier row comparison
* columns are required for the scan direction, we can stop the
* scan, because there can't be another tuple that will succeed.
*/
if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
subkey--;
if ((subkey->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
return false;
}

Yes, exactly. Our row comparison operators don't match if there is any
null inside the row. But you can find these rows within the matching
range.

I noticed that you're not initializing so->firstPage correctly for the
_bt_endpoint() path, which is used when the initial position of the
scan is either the leftmost or rightmost page. That is, it's possible
to reach _bt_readpage() without having reached the point in
_bt_first() where you initialize so->firstPage to "true".

Good catch, thank you!

It would probably make sense if the flag was initialized to "false" in
the same way as most other scan state is already, somewhere in
nbtree.c. Probably in btrescan().

Makes sense, initialisation is added.

------
Regards,
Alexander Korotkov

Attachments:

0001-Skip-checking-of-scan-keys-required-for-direction-v8.patchapplication/octet-stream; name=0001-Skip-checking-of-scan-keys-required-for-direction-v8.patchDownload
From 95d912a172993bc59a486030d4c6c1577a6a66fd Mon Sep 17 00:00:00 2001
From: Pavel Borisov <pashkin.elfe@gmail.com>
Date: Mon, 25 Sep 2023 13:45:22 +0400
Subject: [PATCH] Skip checking of scan keys required for directional scan in
 B-tree

Currently, B-tree code matches every scan key to every item on the page.
Imagine the ordered B-tree scan for the query like this.

SELECT * FROM tbl WHERE col > 'a' AND col < 'b' ORDER BY col;

The (col > 'a') scan key will be always matched once we find the location to
start the scan.  The (col < 'b') scan key will match every item on the page
as long as it matches the last item on the page.

This patch implements prechecking of the scan keys required for directional
scan on beginning of page scan.  If precheck is successful we can skip this
scan keys check for the items on the page.  That could lead to significant
acceleration especially if the comparison operator is expensive.

Idea from patch by Konstantin Knizhnik.

Discussion: https://postgr.es/m/079c3f8e-3371-abe2-e93c-fc8a0ae3f571%40garret.ru
Reviewed-by: Peter Geoghegan, Pavel Borisov
---
 src/backend/access/nbtree/nbtree.c    |  1 +
 src/backend/access/nbtree/nbtsearch.c | 70 +++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtutils.c  | 59 +++++++++++++++++-----
 src/include/access/nbtree.h           |  6 ++-
 4 files changed, 119 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 6c5b5c69ce5..92950b37767 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -407,6 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 
 	so->markItemIndex = -1;
 	so->arrayKeyCount = 0;
+	so->firstPage = false;
 	BTScanPosUnpinIfPinned(so->markPos);
 	BTScanPosInvalidate(so->markPos);
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 17ad89749d5..c47eaed0e98 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1429,6 +1429,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* remember which buffer we have pinned, if any */
 	Assert(!BTScanPosIsValid(so->currPos));
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	/*
 	 * Now load data from the first page of the scan.
@@ -1539,6 +1540,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		requiredMatchedByPrecheck;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1592,6 +1594,46 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Prechecking the page with scan keys required for direction scan.  We
+	 * check these keys with the last item on the page (according to our scan
+	 * direction).  If these keys are matched, we can skip checking them with
+	 * every item on the page.  Scan keys for our scan direction would
+	 * necessarily match the previous items.  Scan keys required for opposite
+	 * direction scan are already matched by the _bt_first() call.
+	 *
+	 * With the forward scan, we do this check for the last item on the page
+	 * instead of the high key.  It's relatively likely that the most
+	 * significant column in the high key will be different from the
+	 * corresponding value from the last item on the page.  So checking with
+	 * the last item on the page would give a more precise answer.
+	 *
+	 * We skip this for the first page in the scan to evade the possible
+	 * slowdown of the point queries.
+	 */
+	if (!so->firstPage && minoff < maxoff)
+	{
+		ItemId		iid;
+		IndexTuple	itup;
+
+		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
+		itup = (IndexTuple) PageGetItem(page, iid);
+
+		/*
+		 * Do the precheck.  Note that we pass the pointer to
+		 * 'requiredMatchedByPrecheck' to the 'continuescan' argument.  That
+		 * will set the flag to true if all required keys are satisfied and false
+		 * otherwise.
+		 */
+		(void) _bt_checkkeys(scan, itup, indnatts, dir,
+							 &requiredMatchedByPrecheck, false);
+	}
+	else
+	{
+		so->firstPage = false;
+		requiredMatchedByPrecheck = false;
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1603,6 +1645,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	itup;
+			bool		passes_quals;
 
 			/*
 			 * If the scan specifies not to return killed tuples, then we
@@ -1616,7 +1659,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 			itup = (IndexTuple) PageGetItem(page, iid);
 
-			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
+			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
+			if (passes_quals)
 			{
 				/* tuple passes all scan key conditions */
 				if (!BTreeTupleIsPosting(itup))
@@ -1673,7 +1727,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			int			truncatt;
 
 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
+			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false);
 		}
 
 		if (!continuescan)
@@ -1725,7 +1779,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			itup = (IndexTuple) PageGetItem(page, iid);
 
 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-										 &continuescan);
+										 &continuescan, requiredMatchedByPrecheck);
+
+			/*
+			 * If the result of prechecking required keys was true, then in
+			 * assert-enabled builds we also recheck that _bt_checkkeys()
+			 * result is the same.
+			 */
+			Assert(!requiredMatchedByPrecheck ||
+				   passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
+												 &continuescan, false));
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions */
@@ -2443,6 +2506,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 
 	/* remember which buffer we have pinned */
 	so->currPos.buf = buf;
+	so->firstPage = true;
 
 	_bt_initialize_more_data(so, dir);
 
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index e4528db4779..8e9c892f78a 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1372,10 +1372,13 @@ _bt_mark_scankey_required(ScanKey skey)
  * tupnatts: number of attributes in tupnatts (high key may be truncated)
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
+ * requiredMatchedByPrecheck: indicates that scan keys required for
+ * 							  direction scan are already matched
  */
 bool
 _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-			  ScanDirection dir, bool *continuescan)
+			  ScanDirection dir, bool *continuescan,
+			  bool requiredMatchedByPrecheck)
 {
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
@@ -1396,6 +1399,29 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 		Datum		datum;
 		bool		isNull;
 		Datum		test;
+		bool		requiredSameDir = false,
+					requiredOppositeDir = false;
+
+		/*
+		 * Check if the key is required for ordered scan in the same or
+		 * opposite direction.  Save as flag variables for future usage.
+		 */
+		if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+			((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+			requiredSameDir = true;
+		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
+				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
+			requiredOppositeDir = true;
+
+		/*
+		 * Is the key required for scanning for either forward or backward
+		 * direction?  If so and the caller told us that these types of keys are
+		 * known to be matched, skip the check.  Except for the row keys,
+		 * where NULLs could be found in the middle of matching values.
+		 */
+		if ((requiredSameDir || requiredOppositeDir) &&
+			!(key->sk_flags & SK_ROW_HEADER) && requiredMatchedByPrecheck)
+			continue;
 
 		if (key->sk_attno > tupnatts)
 		{
@@ -1444,11 +1470,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * scan direction, then we can conclude no further tuples will
 			 * pass, either.
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
@@ -1498,8 +1520,23 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
-								 datum, key->sk_argument);
+		/*
+		 * Apply the key checking function.  When the key is required for
+		 * opposite direction scan, it must be already satisfied by
+		 * _bt_first() except for the NULL checks, which have already been
+		 * done above.
+		 */
+		if (!requiredOppositeDir)
+		{
+			test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
+									 datum, key->sk_argument);
+		}
+		else
+		{
+			test = true;
+			Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
+											 datum, key->sk_argument));
+		}
 
 		if (!DatumGetBool(test))
 		{
@@ -1513,11 +1550,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
 			 * initial positioning in _bt_first() when they are available. See
 			 * comments in _bt_first().
 			 */
-			if ((key->sk_flags & SK_BT_REQFWD) &&
-				ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((key->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
+			if (requiredSameDir)
 				*continuescan = false;
 
 			/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 6345e16d78d..7bfbf3086c8 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1056,6 +1056,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* flag indicating the first page in the scan */
+	bool		firstPage;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1255,7 +1258,8 @@ extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan);
+						  int tupnatts, ScanDirection dir, bool *continuescan,
+						  bool requiredMatchedByPrecheck);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
-- 
2.37.1 (Apple Git-137.1)

#18Pavel Borisov
pashkin.elfe@gmail.com
In reply to: Alexander Korotkov (#17)
Re: Index range search optimization

Hi!

On Fri, 29 Sept 2023 at 10:35, Alexander Korotkov <aekorotkov@gmail.com> wrote:

Hi, Peter.

On Fri, Sep 29, 2023 at 4:57 AM Peter Geoghegan <pg@bowt.ie> wrote:

On Fri, Sep 22, 2023 at 7:24 AM Alexander Korotkov <aekorotkov@gmail.com> wrote:

The thing is that NULLs could appear in the middle of matching values.

# WITH t (a, b) AS (VALUES ('a', 'b'), ('a', NULL), ('b', 'a'))
SELECT a, b, (a, b) > ('a', 'a') FROM t ORDER BY (a, b);
a | b | ?column?
---+------+----------
a | b | t
a | NULL | NULL
b | a | t
(3 rows)

So we can't just skip the row comparison operator, because we can meet
NULL at any place.

But why would SK_ROW_HEADER be any different? Is it related to this
existing case inside _bt_check_rowcompare()?:

if (subkey->sk_flags & SK_ISNULL)
{
/*
* Unlike the simple-scankey case, this isn't a disallowed case.
* But it can never match. If all the earlier row comparison
* columns are required for the scan direction, we can stop the
* scan, because there can't be another tuple that will succeed.
*/
if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
subkey--;
if ((subkey->sk_flags & SK_BT_REQFWD) &&
ScanDirectionIsForward(dir))
*continuescan = false;
else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
return false;
}

Yes, exactly. Our row comparison operators don't match if there is any
null inside the row. But you can find these rows within the matching
range.

I noticed that you're not initializing so->firstPage correctly for the
_bt_endpoint() path, which is used when the initial position of the
scan is either the leftmost or rightmost page. That is, it's possible
to reach _bt_readpage() without having reached the point in
_bt_first() where you initialize so->firstPage to "true".

Good catch, thank you!

It would probably make sense if the flag was initialized to "false" in
the same way as most other scan state is already, somewhere in
nbtree.c. Probably in btrescan().

Makes sense, initialisation is added.

I've looked through the patch v8. I think it's good enough to be
pushed if Peter has no objections.

Regards,
Pavel.

#19Alexander Korotkov
aekorotkov@gmail.com
In reply to: Pavel Borisov (#18)
Re: Index range search optimization

On Wed, Oct 4, 2023 at 12:59 AM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

I've looked through the patch v8. I think it's good enough to be
pushed if Peter has no objections.

Thank you, Pavel.
I'll push this if there are no objections.

------
Regards,
Alexander Korotkov

#20Konstantin Knizhnik
knizhnik@garret.ru
In reply to: Alexander Korotkov (#19)
Re: Index range search optimization

On 04/10/2023 3:00 am, Alexander Korotkov wrote:

On Wed, Oct 4, 2023 at 12:59 AM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

I've looked through the patch v8. I think it's good enough to be
pushed if Peter has no objections.

Thank you, Pavel.
I'll push this if there are no objections.

------
Regards,
Alexander Korotkov

Sorry, can you please also mention that the original idea of this
optimization belongs to Ilya Anfimov (it was discussed in the @pgsql
Telegram chat).

#21Pavel Borisov
pashkin.elfe@gmail.com
In reply to: Konstantin Knizhnik (#20)
Re: Index range search optimization

Hi, Konstantin!

On Fri, 6 Oct 2023 at 22:44, Konstantin Knizhnik <knizhnik@garret.ru> wrote:

On 04/10/2023 3:00 am, Alexander Korotkov wrote:

On Wed, Oct 4, 2023 at 12:59 AM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

I've looked through the patch v8. I think it's good enough to be
pushed if Peter has no objections.

Thank you, Pavel.
I'll push this if there are no objections.

------
Regards,
Alexander Korotkov

Sorry, can you please also mention that original idea of this
optimization belongs to Ilya Anfimov (it was discussed in @pgsql
Telegram chat).

While it's no doubt correct to mention all authors of the patch, I
looked through the thread and saw no mentions of Ilya's
contributions/ideas before the patch was pushed. I'm not up to date with
the current policy for handling such requests, but I suppose it's
complicated to retroactively amend commits on the main branch, which is
already ahead of patch e0b1ee17dc3a38.

Regards,
Pavel

#22Alexander Korotkov
aekorotkov@gmail.com
In reply to: Pavel Borisov (#21)
Re: Index range search optimization

On Fri, Oct 6, 2023 at 9:59 PM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

On Fri, 6 Oct 2023 at 22:44, Konstantin Knizhnik <knizhnik@garret.ru> wrote:

On 04/10/2023 3:00 am, Alexander Korotkov wrote:

On Wed, Oct 4, 2023 at 12:59 AM Pavel Borisov <pashkin.elfe@gmail.com> wrote:

I've looked through the patch v8. I think it's good enough to be
pushed if Peter has no objections.

Thank you, Pavel.
I'll push this if there are no objections.

------
Regards,
Alexander Korotkov

Sorry, can you please also mention that original idea of this
optimization belongs to Ilya Anfimov (it was discussed in @pgsql
Telegram chat).

While it's no doubt correct to mention all authors of the patch, I
looked through the thread and saw no mentions of Ilya's
contributions/ideas before the patch became pushed. I'm not up to the
current policy for processing these requests, but I suppose it's
complicated to introduce back changes into the main branch that is
already ahead of patch e0b1ee17dc3a38.

Yep, that happened before. We don't do a force push to override
commit messages and credit missing contributors. I waited more than
48 hours before pushing the final version of the patch, and that was
the time to propose changes like this. Now, I think all we can do is
credit Ilya on mailing lists. I believe we already did :)

------
Regards,
Alexander Korotkov