From 74fc87b55ef23eeac8f6857e87746f96e4601983 Mon Sep 17 00:00:00 2001
From: Pavan Deolasee <pavan.deolasee@gmail.com>
Date: Wed, 11 Apr 2018 00:30:22 +0530
Subject: [PATCH] Post-commit cleanup for B-Tree optimization work.

This addresses review comments by Peter Geoghegan, especially the fact that we
must not be calling RelationSetTargetBlock() within a critical section and
adding a small note to the README explaining the optimization.

This also includes a fix for an oversight spotted by Heikki Linnakangas
regarding the optimization not getting used for UNLOGGED relations.
---
 src/backend/access/nbtree/README      | 19 +++++++++++
 src/backend/access/nbtree/nbtinsert.c | 60 +++++++++++++++++++++++++++++------
 2 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index aef455c122..cc728b900a 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -375,6 +375,25 @@ positives, so long as it never gives a false negative.  This makes it
 possible to implement the test with a small counter value stored on each
 index page.
 
+Fastpath For Index Insertion
+----------------------------
+
+We optimize for a common case of insertion of increasing index key
+values by caching the last page to which this backend inserted the last
+value, if this page was the rightmost leaf page. For the next insert, we
+can then quickly check if the cached page is still the rightmost leaf
+page and also the correct place to hold the current value. We can avoid
+the cost of walking down the tree in such common cases.
+
+The optimization works on the assumption that there can only be one
+non-ignorable leaf rightmost page, and so even a RecentGlobalXmin style
+interlock isn't required.  We cannot fail to detect that our hint was
+invalidated, because there can only be one such page in the B-Tree at
+any time. It's possible that the page will be deleted and recycled
+without a backend's cached page also being detected as invalidated, but
+only when we happen to recycle a page that once again becomes the
+rightmost leaf page.
+
 On-the-Fly Deletion Of Index Tuples
 -----------------------------------
 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 0b93acd024..28b653d4fc 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -176,13 +176,17 @@ top:
 			 * the first key on the page.
 			 */
 			if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
-				!P_INCOMPLETE_SPLIT(lpageop) &&
 				!P_IGNORE(lpageop) &&
 				(PageGetFreeSpace(page) > itemsz) &&
 				PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
 				_bt_compare(rel, indnkeyatts, itup_scankey, page,
 							P_FIRSTDATAKEY(lpageop)) > 0)
 			{
+				/*
+				 * The right-most block should never have incomplete split. But
+				 * be paranoid and check for it anyways.
+				 */
+				Assert(!P_INCOMPLETE_SPLIT(lpageop));
 				fastpath = true;
 			}
 			else
@@ -868,6 +872,24 @@ _bt_insertonpg(Relation rel,
 		bool		newitemonleft;
 		Buffer		rbuf;
 
+		/*
+		 * If we're here then a pagesplit is needed. We should never reach here
+		 * if we're using the fastpath since we should have checked for all the
+		 * required conditions, including the fact that this page has enough
+		 * freespace. Note that this routine can in-theory deal with the
+		 * situation where a NULL stack pointer is passed (that's what would
+		 * happen if the fastpath is taken), like it does during crash
+		 * recovery. But that path is much slower, defeating the very purpose
+		 * of the optimisation.  The following assertion should protect us from
+		 * any future code changes that invalidate those assumptions.
+		 *
+		 * Note the fact that whenever we fail to take the fastpath, we clear
+		 * the cached block. So checking for a valid cached block at this point
+		 * is enough to decide whether we're in a fastpath or not.
+		 */
+		Assert(!(P_ISLEAF(lpageop) &&
+				BlockNumberIsValid(RelationGetTargetBlock(rel))));
+
 		/* Choose the split point */
 		firstright = _bt_findsplitloc(rel, page,
 									  newitemoff, itemsz,
@@ -905,6 +927,7 @@ _bt_insertonpg(Relation rel,
 		BTMetaPageData *metad = NULL;
 		OffsetNumber itup_off;
 		BlockNumber itup_blkno;
+		BlockNumber	cachedBlock = InvalidBlockNumber;
 
 		itup_off = newitemoff;
 		itup_blkno = BufferGetBlockNumber(buf);
@@ -962,6 +985,15 @@ _bt_insertonpg(Relation rel,
 			MarkBufferDirty(cbuf);
 		}
 
+		/*
+		 * Cache the block information if we just inserted into the rightmost
+		 * leaf page of the index and it's not the root page.  For very small
+		 * index where root is also the leaf, there is no point trying for any
+		 * optimisation.
+		 */
+		if (P_RIGHTMOST(lpageop) && !P_ISROOT(lpageop))
+			cachedBlock = BufferGetBlockNumber(buf);
+
 		/* XLOG stuff */
 		if (RelationNeedsWAL(rel))
 		{
@@ -977,16 +1009,7 @@ _bt_insertonpg(Relation rel,
 			XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
 
 			if (P_ISLEAF(lpageop))
-			{
 				xlinfo = XLOG_BTREE_INSERT_LEAF;
-
-				/*
-				 * Cache the block information if we just inserted into the
-				 * rightmost leaf page of the index.
-				 */
-				if (P_RIGHTMOST(lpageop))
-					RelationSetTargetBlock(rel, BufferGetBlockNumber(buf));
-			}
 			else
 			{
 				/*
@@ -1048,6 +1071,23 @@ _bt_insertonpg(Relation rel,
 		if (BufferIsValid(cbuf))
 			_bt_relbuf(rel, cbuf);
 		_bt_relbuf(rel, buf);
+
+		/*
+		 * If we decided to cache the insertion target block, then set it now.
+		 * But before that, check for the height of the tree and don't go for
+		 * the optimisation for small indexes. We defer that check to this
+		 * point to ensure that we don't call _bt_getrootheight while holding
+		 * lock on any other block.
+		 *
+		 * We do this after dropping locks on all buffers. So the information
+		 * about whether the insertion block is still the rightmost block or
+		 * not may have changed in between. But we will deal with that during
+		 * next insert operation. No special care is required while setting it.
+		 */
+#define BTREE_FASTPATH_MIN_LEVEL	2
+		if (BlockNumberIsValid(cachedBlock) &&
+			_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
+			RelationSetTargetBlock(rel, cachedBlock);
 	}
 }
 
-- 
2.14.3 (Apple Git-98)