From f2859bb02f1c238d3a15e3d9b2de6a01c3543953 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 4 May 2019 19:22:38 -0700 Subject: [PATCH] Remove downlink variable from nbtree stack. --- src/backend/access/nbtree/nbtinsert.c | 40 +++++++++++++++++++-------- src/backend/access/nbtree/nbtpage.c | 3 +- src/backend/access/nbtree/nbtsearch.c | 17 ++++-------- src/include/access/nbtree.h | 15 ++++------ 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 602f8849d4..059df74103 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1795,7 +1795,6 @@ _bt_insert_parent(Relation rel, stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); stack->bts_offset = InvalidOffsetNumber; - stack->bts_btentry = InvalidBlockNumber; stack->bts_parent = NULL; _bt_relbuf(rel, pbuf); } @@ -1817,8 +1816,7 @@ _bt_insert_parent(Relation rel, * new downlink will be inserted at the correct offset. Even buf's * parent may have changed. */ - stack->bts_btentry = bknum; - pbuf = _bt_getstackbuf(rel, stack); + pbuf = _bt_getstackbuf(rel, stack, bknum); /* * Now we can unlock the right child. The left child will be unlocked @@ -1897,13 +1895,31 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) } /* - * _bt_getstackbuf() -- Walk back up the tree one step, and find the item - * we last looked at in the parent. + * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot + * tuple whose downlink points to child page. * - * This is possible because we save the downlink from the parent item, - * which is enough to uniquely identify it. Insertions into the parent - * level could cause the item to move right; deletions could cause it - * to move left, but not left of the page we previously found it in. + * Caller passes child's block number, which is enough to uniquely + * identify it using a linear search that matches it to a downlink in + * parent. Details of the parent are taken from stack for parent + * level, which was stashed during initial descent. + * + * The details of the parent page/downlink from stack are inherently + * approximate, though child's downlink can typically be relocated very + * quickly. Insertions into the parent level could cause the pivot + * tuple to move right; deletions could cause it to move left, but not + * left of the page we previously found it on. + * + * Note also that it's even possible that located pivot tuple is not + * even _bt_compare()-equal to the pivot tuple whose downlink was + * originally followed. For example, caller may have had to move right + * or step right earlier, and will now require a downlink to a child + * page that is actually a sibling of the page encountered when + * downlink was followed during descent. + * + * Caller should have a lock on child's buffer, too, since caller's + * stack might otherwise go stale immediately. (Page deletion caller + * can get away with a lock on leaf level page when locating topparent + * downlink, though.) * * Adjusts bts_blkno & bts_offset if changed. * @@ -1911,7 +1927,7 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) * (should not happen). */ Buffer -_bt_getstackbuf(Relation rel, BTStack stack) +_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child) { BlockNumber blkno; OffsetNumber start; @@ -1973,7 +1989,7 @@ _bt_getstackbuf(Relation rel, BTStack stack) itemid = PageGetItemId(page, offnum); item = (IndexTuple) PageGetItem(page, itemid); - if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry) + if (BTreeInnerTupleGetDownLink(item) == child) { /* Return accurate pointer to where link is now */ stack->bts_blkno = blkno; @@ -1989,7 +2005,7 @@ _bt_getstackbuf(Relation rel, BTStack stack) itemid = PageGetItemId(page, offnum); item = (IndexTuple) PageGetItem(page, itemid); - if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry) + if (BTreeInnerTupleGetDownLink(item) == child) { /* Return accurate pointer to where link is now */ stack->bts_blkno = blkno; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 50455db9af..2a65db3613 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1228,8 +1228,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, * non-unique high keys in leaf level pages. Even heapkeyspace indexes * can have a stale stack due to insertions into the parent. */ - stack->bts_btentry = child; - pbuf = _bt_getstackbuf(rel, stack); + pbuf = _bt_getstackbuf(rel, stack, child); if (pbuf == InvalidBuffer) elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u", RelationGetRelationName(rel), child); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index c655dadb96..cc6ccf2b4c 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -148,21 +148,16 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, /* * We need to save the location of the index entry we chose in the * parent page on a stack. In case we split the tree, we'll use the - * stack to work back up to the parent page. We also save the actual - * downlink (block) to uniquely identify the index entry, in case it - * moves right while we're working lower in the tree. See the paper - * by Lehman and Yao for how this is detected and handled. (We use the - * child link during the second half of a page split -- if caller ends - * up splitting the child it usually ends up inserting a new pivot - * tuple for child's new right sibling immediately after the original - * bts_offset offset recorded here. The downlink block will be needed - * to check if bts_offset remains the position of this same pivot - * tuple.) + * stack to work back up to the parent page. We use the child block + * number (or possibly the block number of a page to its right) + * during the second half of a page split -- if caller ends up + * splitting the child it usually ends up inserting a new pivot tuple + * for child's new right sibling immediately after the original + * bts_offset offset recorded here. */ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; new_stack->bts_offset = offnum; - new_stack->bts_btentry = blkno; new_stack->bts_parent = stack_in; /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index a3583f225b..ad7963cba8 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -403,20 +403,15 @@ typedef struct BTMetaPageData #define BT_WRITE BUFFER_LOCK_EXCLUSIVE /* - * BTStackData -- As we descend a tree, we push the (location, downlink) - * pairs from internal pages onto a private stack. If we split a - * leaf, we use this stack to walk back up the tree and insert data - * into parent pages (and possibly to split them, too). Lehman and - * Yao's update algorithm guarantees that under no circumstances can - * our private stack give us an irredeemably bad picture up the tree. - * Again, see the paper for details. + * BTStackData -- As we descend a tree, we push the location of pivot + * tuples whose downlinks are followed from internal pages onto a private + * stack. If we split a leaf, we use this stack to walk back up the tree + * and insert data into parent pages (and possibly to split them, too). */ - typedef struct BTStackData { BlockNumber bts_blkno; OffsetNumber bts_offset; - BlockNumber bts_btentry; struct BTStackData *bts_parent; } BTStackData; @@ -731,7 +726,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); */ extern bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel); -extern Buffer _bt_getstackbuf(Relation rel, BTStack stack); +extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child); extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); /* -- 2.17.1