From 74fc87b55ef23eeac8f6857e87746f96e4601983 Mon Sep 17 00:00:00 2001 From: Pavan Deolasee Date: Wed, 11 Apr 2018 00:30:22 +0530 Subject: [PATCH] Post-commit cleanup for B-Tree optimization work. This addresses review comments by Peter Geoghegan, especially the fact that we must not be calling RelationSetTargetBlock() within a critical section and adding a small note to the README explaining the optimization. This also includes a fix for an oversight spotted by Heikki Linnakangas regarding the optimization not getting used for UNLOGGED relations. --- src/backend/access/nbtree/README | 19 +++++++++++ src/backend/access/nbtree/nbtinsert.c | 60 +++++++++++++++++++++++++++++------ 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index aef455c122..cc728b900a 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -375,6 +375,25 @@ positives, so long as it never gives a false negative. This makes it possible to implement the test with a small counter value stored on each index page. +Fastpath For Index Insertion +---------------------------- + +We optimize for a common case of insertion of increasing index key +values by caching the last page to which this backend inserted the last +value, if this page was the rightmost leaf page. For the next insert, we +can then quickly check if the cached page is still the rightmost leaf +page and also the correct place to hold the current value. We can avoid +the cost of walking down the tree in such common cases. + +The optimization works on the assumption that there can only be one +non-ignorable leaf rightmost page, and so even a RecentGlobalXmin style +interlock isn't required. We cannot fail to detect that our hint was +invalidated, because there can only be one such page in the B-Tree at +any time. It's possible that the page will be deleted and recycled +without a backend's cached page also being detected as invalidated, but +only when we happen to recycle a page that once again becomes the +rightmost leaf page. + On-the-Fly Deletion Of Index Tuples ----------------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 0b93acd024..28b653d4fc 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -176,13 +176,17 @@ top: * the first key on the page. */ if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) && - !P_INCOMPLETE_SPLIT(lpageop) && !P_IGNORE(lpageop) && (PageGetFreeSpace(page) > itemsz) && PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) && _bt_compare(rel, indnkeyatts, itup_scankey, page, P_FIRSTDATAKEY(lpageop)) > 0) { + /* + * The right-most block should never have incomplete split. But + * be paranoid and check for it anyways. + */ + Assert(!P_INCOMPLETE_SPLIT(lpageop)); fastpath = true; } else @@ -868,6 +872,24 @@ _bt_insertonpg(Relation rel, bool newitemonleft; Buffer rbuf; + /* + * If we're here then a pagesplit is needed. We should never reach here + * if we're using the fastpath since we should have checked for all the + * required conditions, including the fact that this page has enough + * freespace. Note that this routine can in-theory deal with the + * situation where a NULL stack pointer is passed (that's what would + * happen if the fastpath is taken), like it does during crash + * recovery. But that path is much slower, defeating the very purpose + * of the optimisation. The following assertion should protect us from + * any future code changes that invalidate those assumptions. + * + * Note the fact that whenever we fail to take the fastpath, we clear + * the cached block. So checking for a valid cached block at this point + * is enough to decide whether we're in a fastpath or not. + */ + Assert(!(P_ISLEAF(lpageop) && + BlockNumberIsValid(RelationGetTargetBlock(rel)))); + /* Choose the split point */ firstright = _bt_findsplitloc(rel, page, newitemoff, itemsz, @@ -905,6 +927,7 @@ _bt_insertonpg(Relation rel, BTMetaPageData *metad = NULL; OffsetNumber itup_off; BlockNumber itup_blkno; + BlockNumber cachedBlock = InvalidBlockNumber; itup_off = newitemoff; itup_blkno = BufferGetBlockNumber(buf); @@ -962,6 +985,15 @@ _bt_insertonpg(Relation rel, MarkBufferDirty(cbuf); } + /* + * Cache the block information if we just inserted into the rightmost + * leaf page of the index and it's not the root page. For very small + * index where root is also the leaf, there is no point trying for any + * optimisation. + */ + if (P_RIGHTMOST(lpageop) && !P_ISROOT(lpageop)) + cachedBlock = BufferGetBlockNumber(buf); + /* XLOG stuff */ if (RelationNeedsWAL(rel)) { @@ -977,16 +1009,7 @@ _bt_insertonpg(Relation rel, XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); if (P_ISLEAF(lpageop)) - { xlinfo = XLOG_BTREE_INSERT_LEAF; - - /* - * Cache the block information if we just inserted into the - * rightmost leaf page of the index. - */ - if (P_RIGHTMOST(lpageop)) - RelationSetTargetBlock(rel, BufferGetBlockNumber(buf)); - } else { /* @@ -1048,6 +1071,23 @@ _bt_insertonpg(Relation rel, if (BufferIsValid(cbuf)) _bt_relbuf(rel, cbuf); _bt_relbuf(rel, buf); + + /* + * If we decided to cache the insertion target block, then set it now. + * But before that, check for the height of the tree and don't go for + * the optimisation for small indexes. We defer that check to this + * point to ensure that we don't call _bt_getrootheight while holding + * lock on any other block. + * + * We do this after dropping locks on all buffers. So the information + * about whether the insertion block is still the rightmost block or + * not may have changed in between. But we will deal with that during + * next insert operation. No special care is required while setting it. + */ +#define BTREE_FASTPATH_MIN_LEVEL 2 + if (BlockNumberIsValid(cachedBlock) && + _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) + RelationSetTargetBlock(rel, cachedBlock); } } -- 2.14.3 (Apple Git-98)