*** a/src/backend/access/nbtree/README
--- b/src/backend/access/nbtree/README
***************
*** 168,173 **** parent item does still exist and can't have been deleted.  Also, because
--- 168,176 ----
  we are matching downlink page numbers and not data keys, we don't have any
  problem with possibly misidentifying the parent item.
  
+ Page Deletion
+ -------------
+ 
  We consider deleting an entire page from the btree only when it's become
  completely empty of items.  (Merging partly-full pages would allow better
  space reuse, but it seems impractical to move existing data items left or
***************
*** 176,193 **** might miss the items if so.)  Also, we *never* delete the rightmost page
  on a tree level (this restriction simplifies the traversal algorithms, as
  explained below).
  
! To delete an empty page, we acquire write lock on its left sibling (if
! any), the target page itself, the right sibling (there must be one), and
! the parent page, in that order.  The parent page must be found using the
! same type of search as used to find the parent during an insertion split.
! Then we update the side-links in the siblings, mark the target page
! deleted, and remove the downlink from the parent, as well as the parent's
! upper bounding key for the target (the one separating it from its right
! sibling).  This causes the target page's key space to effectively belong
! to its right sibling.  (Neither the left nor right sibling pages need to
! change their "high key" if any; so there is no problem with possibly not
! having enough space to replace a high key.)  The side-links in the target
! page are not changed.
  
  (Note: Lanin and Shasha prefer to make the key space move left, but their
  argument for doing so hinges on not having left-links, which we have
--- 179,198 ----
  on a tree level (this restriction simplifies the traversal algorithms, as
  explained below).
  
! Page deletion always begins at the leaf level. An internal page can only be
! deleted as part of a branch leading to a leaf page, where each internal page
! has only one child and that child is also to be deleted.
! 
! Page deletion is a two-stage process. First, the parent page must be found
! using the same type of search as used to find the parent during an insertion
! split. We lock the target and the parent pages, change the target's downlink
! to point to the right sibling, and remove its old downlink. This causes the
! target page's key space to effectively belong to its right sibling.  (Neither
! the left nor right sibling pages need to change their "high key" if any; so
! there is no problem with possibly not having enough space to replace a high
! key.)  At the same time, we mark the target page as half-dead, which causes
! any subsequent searches to ignore it and move right (or left, in a backwards
! scan).
  
  (Note: Lanin and Shasha prefer to make the key space move left, but their
  argument for doing so hinges on not having left-links, which we have
***************
*** 199,229 **** the same parent --- otherwise, the parent's key space assignment changes
  too, meaning we'd have to make bounding-key updates in its parent, and
  perhaps all the way up the tree.  Since we can't possibly do that
  atomically, we forbid this case.  That means that the rightmost child of a
! parent node can't be deleted unless it's the only remaining child.
! 
! When we delete the last remaining child of a parent page, we mark the
! parent page "half-dead" as part of the atomic update that deletes the
! child page.  This implicitly transfers the parent's key space to its right
! sibling (which it must have, since we never delete the overall-rightmost
! page of a level).  Searches ignore the half-dead page and immediately move
! right.  We need not worry about insertions into a half-dead page --- insertions
! into upper tree levels happen only as a result of splits of child pages, and
! the half-dead page no longer has any children that could split.  Therefore
! the page stays empty even when we don't have lock on it, and we can complete
! its deletion in a second atomic action.
! 
! The notion of a half-dead page means that the key space relationship between
! the half-dead page's level and its parent's level may be a little out of
! whack: key space that appears to belong to the half-dead page's parent on the
! parent level may really belong to its right sibling.  To prevent any possible
! problems, we hold lock on the deleted child page until we have finished
! deleting any now-half-dead parent page(s).  This prevents any insertions into
! the transferred keyspace until the operation is complete.  The reason for
! doing this is that a sufficiently large number of insertions into the
! transferred keyspace, resulting in multiple page splits, could propagate keys
! from that keyspace into the parent level, resulting in transiently
! out-of-order keys in that level.  It is thought that that wouldn't cause any
! serious problem, but it seems too risky to allow.
  
  A deleted page cannot be reclaimed immediately, since there may be other
  processes waiting to reference it (ie, search processes that just left the
--- 204,246 ----
  too, meaning we'd have to make bounding-key updates in its parent, and
  perhaps all the way up the tree.  Since we can't possibly do that
  atomically, we forbid this case.  That means that the rightmost child of a
! parent node can't be deleted unless it's the only remaining child, in which
! case we will delete the parent too.
! 
! When we're about to delete the last remaining child of a parent page, we
! leave the parent page alone and remove the downlink to the parent page
! instead, in the grandparent. If it's the last child of the grandparent too,
! we recurse up while keeping the leaf page locked, until we find a parent with
! more than one child. We don't need to hold locks on the intermediate pages
! between the leaf and the final parent -- insertions into upper tree levels
! happen only as a result of splits of child pages, and we never split a
! half-dead leaf page. Removing the downlink to the top of the to-be-deleted
! branch effectively transfers the key space to the right sibling for all the
! intermediate levels too, in one atomic operation. A concurrent search might
! still visit the intermediate pages, but it will move right when it reaches
! the half-dead page at the leaf level.
! 
! When we mark the leaf page as half-dead, we also store a link to the top
! of the to-be-deleted branch, ie. the block number of the page whose downlink
! we removed. This is needed in the second phase, to unlink the pages from
! their siblings.
! 
! The second phase is to unlink the half-dead leaf page, and any internal pages
! in the to-be-deleted branch. If the half-dead page still has a parent pointing
! to it, ie. if we're removing a whole branch, we unlink the topmost remaining
! parent first. We use the link we stashed in the half-dead leaf page to get its
! block number. That is the target of the unlink operation. We first lock the
! leaf page (unless we're removing the leaf), then the left sibling (if any) of
! the target, the target page itself, and its right sibling (there must be one)
! in that order. Then we update the side-links in the siblings, and mark the
! target page deleted. If the target was the leaf page, we're done. Otherwise we
! update the top-link in leaf page with the next page in the branch we're
! deleting, ie. the only child of the target page. The side-links in the target
! page are not changed.
! 
! This is repeated, chewing pages in the branch top-down, until all the internal
! pages above the half-dead leaf page have been removed. Finally, the half-dead
! leaf page itself is removed.
  
  A deleted page cannot be reclaimed immediately, since there may be other
  processes waiting to reference it (ie, search processes that just left the
***************
*** 396,407 **** metapage update (of the "fast root" link) is performed and logged as part
  of the insertion into the parent level.  When splitting the root page, the
  metapage update is handled as part of the "new root" action.
  
! A page deletion is logged as a single WAL entry covering all four
! required page updates (target page, left and right siblings, and parent)
! as an atomic event.  (Any required fast-root link update is also part
! of the WAL entry.)  If the parent page becomes half-dead but is not
! immediately deleted due to a subsequent crash, there is no loss of
! consistency, and the empty page will be picked up by the next VACUUM.
  
  Scans during Recovery
  ---------------------
--- 413,423 ----
  of the insertion into the parent level.  When splitting the root page, the
  metapage update is handled as part of the "new root" action.
  
! Each step in page deletion are logged as separate WAL entries: marking the
! leaf as half-dead and removing the downlink is one record, and unlinking a
! page is a second record. If vacuum is interrupted for some reason, or the
! system crashes, the tree is consistent for searches and insertions. The next
! VACUUM will find the half-dead leaf page and continue the deletion.
  
  Scans during Recovery
  ---------------------
*** a/src/backend/access/nbtree/nbtpage.c
--- b/src/backend/access/nbtree/nbtpage.c
***************
*** 31,36 ****
--- 31,44 ----
  #include "utils/inval.h"
  #include "utils/snapmgr.h"
  
+ static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
+ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
+ 						 bool *rightsib_empty);
+ static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
+ 					   BTStack stack, Buffer *topparent, OffsetNumber *topoff,
+ 					   BlockNumber *target, BlockNumber *rightsib);
+ static void  _bt_log_reuse_page(Relation rel, BlockNumber blkno,
+ 				   TransactionId latestRemovedXid);
  
  /*
   *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
***************
*** 954,977 **** _bt_delitems_delete(Relation rel, Buffer buf,
  }
  
  /*
!  * Subroutine to pre-check whether a page deletion is safe, that is, its
!  * parent page would be left in a valid or deletable state.
   *
!  * "target" is the page we wish to delete, and "stack" is a search stack
   * leading to it (approximately).  Note that we will update the stack
   * entry(s) to reflect current downlink positions --- this is harmless and
!  * indeed saves later search effort in _bt_pagedel.
   *
!  * Note: it's OK to release page locks after checking, because a safe
!  * deletion can't become unsafe due to concurrent activity.  A non-rightmost
!  * page cannot become rightmost unless there's a concurrent page deletion,
!  * but only VACUUM does page deletion and we only allow one VACUUM on an index
!  * at a time.  An only child could acquire a sibling (of the same parent) only
!  * by being split ... but that would make it a non-rightmost child so the
!  * deletion is still safe.
   */
  static bool
! _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
  {
  	BlockNumber parent;
  	OffsetNumber poffset,
--- 962,997 ----
  }
  
  /*
!  * Subroutine to find the parent of the branch we're deleting.  This climbs
!  * up the tree until it finds a page with more than one child, ie. a page
!  * that will not be totally emptied by the deletion.  The chain of pages below
!  * it, with one downlink each, will be part of the branch that we need to
!  * delete.
   *
!  * If we cannot remove the downlink from the parent, because it's the
!  * rightmost entry, returns false.  On success, *topparent and *topoff are set
!  * to the buffer holding the parent, and the offsetnumber of the downlink in
!  * it.  The parent buffer is write-locked, the caller is responsible for
!  * releasing it when done.  *target is set to the topmost page in the branch
!  * to-be-deleted, ie. the page whose downlink *topparent / *topoff point to,
!  * and *rightsib to its right sibling.
!  *
!  * "child" is the leaf page we wish to delete, and "stack" is a search stack
   * leading to it (approximately).  Note that we will update the stack
   * entry(s) to reflect current downlink positions --- this is harmless and
!  * indeed saves later search effort in _bt_pagedel.  The caller should
!  * initialize *target and *rightsib to the leaf page and its right sibling.
   *
!  * Note: it's OK to release page locks on any internal pages between the leaf
!  * and *topparent, because a safe deletion can't become unsafe due to
!  * concurrent activity.  An internal page can only acquire an entry if the
!  * child is split, but that cannot happen as long as we hold a lock on the
!  * leaf.
   */
  static bool
! _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
! 					   Buffer *topparent, OffsetNumber *topoff,
! 					   BlockNumber *target, BlockNumber *rightsib)
  {
  	BlockNumber parent;
  	OffsetNumber poffset,
***************
*** 980,999 **** _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
  	Page		page;
  	BTPageOpaque opaque;
  
- 	/*
- 	 * In recovery mode, assume the deletion being replayed is valid.  We
- 	 * can't always check it because we won't have a full search stack, and we
- 	 * should complain if there's a problem, anyway.
- 	 */
- 	if (InRecovery)
- 		return true;
- 
  	/* Locate the parent's downlink (updating the stack entry if needed) */
! 	ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
  	pbuf = _bt_getstackbuf(rel, stack, BT_READ);
  	if (pbuf == InvalidBuffer)
  		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
! 			 RelationGetRelationName(rel), target);
  	parent = stack->bts_blkno;
  	poffset = stack->bts_offset;
  
--- 1000,1011 ----
  	Page		page;
  	BTPageOpaque opaque;
  
  	/* Locate the parent's downlink (updating the stack entry if needed) */
! 	ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
  	pbuf = _bt_getstackbuf(rel, stack, BT_READ);
  	if (pbuf == InvalidBuffer)
  		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
! 			 RelationGetRelationName(rel), child);
  	parent = stack->bts_blkno;
  	poffset = stack->bts_offset;
  
***************
*** 1021,1028 **** _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
  				return false;
  			}
  
  			_bt_relbuf(rel, pbuf);
! 			return _bt_parent_deletion_safe(rel, parent, stack->bts_parent);
  		}
  		else
  		{
--- 1033,1044 ----
  				return false;
  			}
  
+ 			*target = child;
+ 			*rightsib = opaque->btpo_next;
+ 
  			_bt_relbuf(rel, pbuf);
! 			return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
! 										  topparent, topoff, target, rightsib);
  		}
  		else
  		{
***************
*** 1034,1040 **** _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
  	else
  	{
  		/* Not rightmost child, so safe to delete */
! 		_bt_relbuf(rel, pbuf);
  		return true;
  	}
  }
--- 1050,1057 ----
  	else
  	{
  		/* Not rightmost child, so safe to delete */
! 		*topparent = pbuf;
! 		*topoff = poffset;
  		return true;
  	}
  }
***************
*** 1057,1063 **** _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
   * to avoid duplicated search effort.
   *
   * Returns the number of pages successfully deleted (zero if page cannot
!  * be deleted now; could be more than one if parent pages were deleted too).
   *
   * NOTE: this leaks memory.  Rather than trying to clean up everything
   * carefully, it's better to run it in a temp context that can be reset
--- 1074,1081 ----
   * to avoid duplicated search effort.
   *
   * Returns the number of pages successfully deleted (zero if page cannot
!  * be deleted now; could be more than one if parent or sibling pages were
!  * deleted too).
   *
   * NOTE: this leaks memory.  Rather than trying to clean up everything
   * carefully, it's better to run it in a temp context that can be reset
***************
*** 1066,1102 **** _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
  int
  _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  {
! 	int			result;
! 	BlockNumber target,
! 				leftsib,
! 				rightsib,
! 				parent;
! 	OffsetNumber poffset,
! 				maxoff;
! 	uint32		targetlevel,
! 				ilevel;
! 	ItemId		itemid;
! 	IndexTuple	targetkey,
! 				itup;
! 	ScanKey		itup_scankey;
! 	Buffer		lbuf,
! 				rbuf,
! 				pbuf;
! 	bool		parent_half_dead;
! 	bool		parent_one_child;
  	bool		rightsib_empty;
- 	Buffer		metabuf = InvalidBuffer;
- 	Page		metapg = NULL;
- 	BTMetaPageData *metad = NULL;
  	Page		page;
  	BTPageOpaque opaque;
  
  	/*
  	 * We can never delete rightmost pages nor root pages.	While at it, check
  	 * that page is not already deleted and is empty.
  	 */
- 	page = BufferGetPage(buf);
- 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
  		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
  	{
--- 1084,1128 ----
  int
  _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  {
! 	int			ndeleted = 0;
! 	BlockNumber rightsib;
  	bool		rightsib_empty;
  	Page		page;
  	BTPageOpaque opaque;
  
+ retry:
+ 	page = BufferGetPage(buf);
+ 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ 
+ 	/*
+ 	 * Internal pages are never deleted directly, only as part of deleting
+ 	 * the whole branch all the way down to leaf level.
+ 	 */
+ 	if (!P_ISLEAF(opaque))
+ 	{
+ 		/*
+ 		 * Pre-9.4 page deletion only marked internal pages as half-dead, but
+ 		 * now we only use that flag on leaf pages. The old algorithm was
+ 		 * never supposed to leave half-dead pages in the tree, it was just
+ 		 * a transient state, but it was nevertheless possible in error
+ 		 * scenarios. We don't know how to deal with them here. They are
+ 		 * harmless as far as searches are considered, but inserts into the
+ 		 * deleted keyspace could add out-of-order downlinks in the upper
+ 		 * levels. Log a notice, hopefully the admin will notice and reindex.
+ 		 */
+ 		if (P_ISHALFDEAD(opaque))
+ 			ereport(LOG,
+ 					(errcode(ERRCODE_INDEX_CORRUPTED),
+ 					 errmsg("index \"%s\" contains a half-dead internal page",
+ 							RelationGetRelationName(rel)),
+ 					 errhint("This can be caused by an interrupt VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
+ 		return 0;
+ 	}
+ 
  	/*
  	 * We can never delete rightmost pages nor root pages.	While at it, check
  	 * that page is not already deleted and is empty.
  	 */
  	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
  		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
  	{
***************
*** 1108,1127 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	}
  
  	/*
  	 * Save info about page, including a copy of its high key (it must have
  	 * one, being non-rightmost).
  	 */
! 	target = BufferGetBlockNumber(buf);
! 	targetlevel = opaque->btpo.level;
! 	leftsib = opaque->btpo_prev;
  	itemid = PageGetItemId(page, P_HIKEY);
  	targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
  
  	/*
! 	 * To avoid deadlocks, we'd better drop the target page lock before going
  	 * further.
  	 */
! 	_bt_relbuf(rel, buf);
  
  	/*
  	 * We need an approximate pointer to the page's parent page.  We use the
--- 1134,1226 ----
  	}
  
  	/*
+ 	 * First, remove downlink pointing to the page (or a parent of the page,
+ 	 * if we are going to delete a taller branch), and mark the page as
+ 	 * half-dead.
+ 	 */
+ 	if (!P_ISHALFDEAD(opaque))
+ 	{
+ 		if (!_bt_mark_page_halfdead(rel, buf, stack))
+ 		{
+ 			_bt_relbuf(rel, buf);
+ 			return 0;
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Then unlink it from its siblings. Each call to _bt_unlink_halfdead_pages
+ 	 * unlinks a single page from the branch, making it shallower. Iterate
+ 	 * until the leaf page is gone.
+ 	 */
+ 	rightsib_empty = false;
+ 	while (P_ISHALFDEAD(opaque))
+ 	{
+ 		if (_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
+ 			ndeleted++;
+ 		else
+ 			break;
+ 	}
+ 
+ 	rightsib = opaque->btpo_next;
+ 
+ 	_bt_relbuf(rel, buf);
+ 
+ 	/*
+ 	 * The page has now been deleted. If its right sibling is completely
+ 	 * empty, it's possible that the reason we haven't deleted it earlier is
+ 	 * that it was the rightmost child of the parent. Now that we removed the
+ 	 * downlink for this page, the right sibling might now be the only child
+ 	 * of the parent, and could be removed. It would be picked up by the next
+ 	 * vacuum anyway, but might as well try to remove it now.
+ 	 *
+ 	 * XXX: this could recurse very deeply. Turn this into iteration to
+ 	 * avoid running out of stack.
+ 	 */
+ 	if (rightsib_empty)
+ 	{
+ 		buf = _bt_getbuf(rel, rightsib, BT_WRITE);
+ 		goto retry;
+ 	}
+ 
+ 	return ndeleted;
+ }
+ 
+ static bool
+ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
+ {
+ 	BlockNumber	leafblkno;
+ 	BlockNumber leafrightsib;
+ 	BlockNumber	target;
+ 	BlockNumber rightsib;
+ 	ItemId		itemid;
+ 	Page		page;
+ 	BTPageOpaque opaque;
+ 	Buffer		topparent;
+ 	OffsetNumber topoff;
+ 	OffsetNumber nextoffset;
+ 	IndexTuple	itup;
+ 	IndexTuple	targetkey;
+ 
+ 	page = BufferGetPage(leafbuf);
+ 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ 
+ 	Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && !P_ISDELETED(opaque) &&
+ 		   !P_ISHALFDEAD(opaque) && P_ISLEAF(opaque) &&
+ 		   P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
+ 
+ 	/*
  	 * Save info about page, including a copy of its high key (it must have
  	 * one, being non-rightmost).
  	 */
! 	leafblkno = BufferGetBlockNumber(leafbuf);
  	itemid = PageGetItemId(page, P_HIKEY);
  	targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
  
  	/*
! 	 * To avoid deadlocks, we'd better drop the leaf page lock before going
  	 * further.
  	 */
! 	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
  
  	/*
  	 * We need an approximate pointer to the page's parent page.  We use the
***************
*** 1133,1201 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	 */
  	if (stack == NULL)
  	{
! 		if (!InRecovery)
! 		{
! 			/* we need an insertion scan key to do our search, so build one */
! 			itup_scankey = _bt_mkscankey(rel, targetkey);
! 			/* find the leftmost leaf page containing this key */
! 			stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
! 							   &lbuf, BT_READ);
! 			/* don't need a pin on that either */
! 			_bt_relbuf(rel, lbuf);
  
! 			/*
! 			 * If we are trying to delete an interior page, _bt_search did
! 			 * more than we needed.  Locate the stack item pointing to our
! 			 * parent level.
! 			 */
! 			ilevel = 0;
! 			for (;;)
! 			{
! 				if (stack == NULL)
! 					elog(ERROR, "not enough stack items");
! 				if (ilevel == targetlevel)
! 					break;
! 				stack = stack->bts_parent;
! 				ilevel++;
! 			}
! 		}
! 		else
! 		{
! 			/*
! 			 * During WAL recovery, we can't use _bt_search (for one reason,
! 			 * it might invoke user-defined comparison functions that expect
! 			 * facilities not available in recovery mode).	Instead, just set
! 			 * up a dummy stack pointing to the left end of the parent tree
! 			 * level, from which _bt_getstackbuf will walk right to the parent
! 			 * page.  Painful, but we don't care too much about performance in
! 			 * this scenario.
! 			 */
! 			pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
! 			stack = (BTStack) palloc(sizeof(BTStackData));
! 			stack->bts_blkno = BufferGetBlockNumber(pbuf);
! 			stack->bts_offset = InvalidOffsetNumber;
! 			/* bts_btentry will be initialized below */
! 			stack->bts_parent = NULL;
! 			_bt_relbuf(rel, pbuf);
! 		}
  	}
  
  	/*
  	 * We cannot delete a page that is the rightmost child of its immediate
  	 * parent, unless it is the only child --- in which case the parent has to
  	 * be deleted too, and the same condition applies recursively to it. We
! 	 * have to check this condition all the way up before trying to delete. We
! 	 * don't need to re-test when deleting a non-leaf page, though.
  	 */
! 	if (targetlevel == 0 &&
! 		!_bt_parent_deletion_safe(rel, target, stack))
! 		return 0;
  
  	/*
  	 * We have to lock the pages we need to modify in the standard order:
  	 * moving right, then up.  Else we will deadlock against other writers.
  	 *
! 	 * So, we need to find and write-lock the current left sibling of the
  	 * target page.  The sibling that was current a moment ago could have
  	 * split, so we may have to move right.  This search could fail if either
  	 * the sibling or the target page was deleted by someone else meanwhile;
--- 1232,1474 ----
  	 */
  	if (stack == NULL)
  	{
! 		ScanKey itup_scankey;
! 		Buffer lbuf;
! 
! 		/* we need an insertion scan key to do our search, so build one */
! 		itup_scankey = _bt_mkscankey(rel, targetkey);
! 		/* find the leftmost leaf page containing this key */
! 		stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
! 						   &lbuf, BT_READ);
! 		/* don't need a pin on that either */
! 		_bt_relbuf(rel, lbuf);
! 	}
  
! 	/*
! 	 * Re-acquire lock on the leaf page, and check that it can still be
! 	 * deleted.
! 	 */
! 	LockBuffer(leafbuf, BT_WRITE);
! 
! 	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
! 		P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque) ||
! 		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
! 	{
! 		return false;
  	}
  
  	/*
+ 	 * Save info about page, including a copy of its high key (it must have
+ 	 * one, being non-rightmost).
+ 	 */
+ 	leafrightsib = opaque->btpo_next;
+ 
+ 	/*
  	 * We cannot delete a page that is the rightmost child of its immediate
  	 * parent, unless it is the only child --- in which case the parent has to
  	 * be deleted too, and the same condition applies recursively to it. We
! 	 * have to check this condition all the way up before trying to delete,
! 	 * and lock the final parent of the to-be-deleted branch.
  	 */
! 	rightsib = leafrightsib;
! 	target = leafblkno;
! 	if (!_bt_lock_branch_parent(rel, leafblkno, stack,
! 								&topparent, &topoff, &target, &rightsib))
! 		return false;
! 
! 	/*
! 	 * Check that the parent-page index items we're about to delete/overwrite
! 	 * contain what we expect.	This can fail if the index has become corrupt
! 	 * for some reason.  We want to throw any error before entering the
! 	 * critical section --- otherwise it'd be a PANIC.
! 	 *
! 	 * The test on the target item is just an Assert because
! 	 * _bt_lock_branch_parent should have guaranteed it has the expected
! 	 * contents.  The test on the next-child downlink is known to sometimes
! 	 * fail in the field, though.
! 	 */
! 	page = BufferGetPage(topparent);
! 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! 
! #ifdef USE_ASSERT_CHECKING
! 	itemid = PageGetItemId(page, topoff);
! 	itup = (IndexTuple) PageGetItem(page, itemid);
! 	Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
! #endif
! 
! 	nextoffset = OffsetNumberNext(topoff);
! 	itemid = PageGetItemId(page, nextoffset);
! 	itup = (IndexTuple) PageGetItem(page, itemid);
! 	if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
! 		elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
! 			 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
! 			 BufferGetBlockNumber(topparent), RelationGetRelationName(rel));
! 
! 	/*
! 	 * Any insert which would have gone on the target block will now go to the
! 	 * right sibling block.
! 	 */
! 	PredicateLockPageCombine(rel, leafblkno, leafrightsib);
! 
! 	/* No ereport(ERROR) until changes are logged */
! 	START_CRIT_SECTION();
! 
! 	/*
! 	 * Update parent.  The normal case is a tad tricky because we want to
! 	 * delete the target's downlink and the *following* key.  Easiest way is
! 	 * to copy the right sibling's downlink over the target downlink, and then
! 	 * delete the following item.
! 	 */
! 	page = BufferGetPage(topparent);
! 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! 
! 	itemid = PageGetItemId(page, topoff);
! 	itup = (IndexTuple) PageGetItem(page, itemid);
! 	ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
! 
! 	nextoffset = OffsetNumberNext(topoff);
! 	PageIndexTupleDelete(page, nextoffset);
! 
! 	/*
! 	 * Mark the leaf page as half-dead, and stamp it with a pointer to the
! 	 * highest internal page in the branch we're deleting. We use the tid of
! 	 * the high key to store it.
! 	 */
! 	page = BufferGetPage(leafbuf);
! 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! 	opaque->btpo_flags |= BTP_HALF_DEAD;
! 
! 	itemid = PageGetItemId(page, P_HIKEY);
! 	itup = (IndexTuple) PageGetItem(page, itemid);
! 	if (target == leafblkno)
! 		ItemPointerSetInvalid(&(itup->t_tid));
! 	else
! 		ItemPointerSet(&(itup->t_tid), target, P_HIKEY);
! 
! 	/* Must mark buffers dirty before XLogInsert */
! 	MarkBufferDirty(topparent);
! 	MarkBufferDirty(leafbuf);
! 
! 	/* XLOG stuff */
! 	if (RelationNeedsWAL(rel))
! 	{
! 		xl_btree_mark_page_halfdead xlrec;
! 		XLogRecPtr	recptr;
! 		XLogRecData rdata[2];
! 
! 		xlrec.target.node = rel->rd_node;
! 		ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(topparent), topoff);
! 		xlrec.leafblk = leafblkno;
! 		xlrec.downlink = target;
! 
! 		page = BufferGetPage(leafbuf);
! 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! 		xlrec.leftblk = opaque->btpo_prev;
! 		xlrec.rightblk = opaque->btpo_next;
! 
! 		rdata[0].data = (char *) &xlrec;
! 		rdata[0].len = SizeOfBtreeMarkPageHalfDead;
! 		rdata[0].buffer = InvalidBuffer;
! 		rdata[0].next = &(rdata[1]);
! 
! 		rdata[1].data = NULL;
! 		rdata[1].len = 0;
! 		rdata[1].buffer = topparent;
! 		rdata[1].buffer_std = true;
! 		rdata[1].next = NULL;
! 
! 		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD, rdata);
! 
! 		page = BufferGetPage(topparent);
! 		PageSetLSN(page, recptr);
! 		page = BufferGetPage(leafbuf);
! 		PageSetLSN(page, recptr);
! 	}
! 
! 	END_CRIT_SECTION();
! 
! 	_bt_relbuf(rel, topparent);
! 	return true;
! }
! 
! /*
!  * Unlink a page in a branch of half-dead pages from its siblings.
!  *
!  * If the leaf page still has a downlink pointing to it, unlinks the highest
!  * parent in the to-be-deleted branch instead, and returns false. To get rid
!  * of the whole branch, including the leaf page itself, iterate until returns
!  * true.
!  */
! static bool
! _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
! {
! 	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
! 	BlockNumber	target;
! 	BlockNumber	leftsib;
! 	BlockNumber	rightsib;
! 	Buffer		lbuf = InvalidBuffer;
! 	Buffer		buf;
! 	Buffer		rbuf;
! 	Buffer		metabuf = InvalidBuffer;
! 	Page		metapg;
! 	BTMetaPageData *metad;
! 	ItemId		itemid;
! 	Page		page;
! 	BTPageOpaque opaque;
! 	bool		rightsib_is_rightmost;
! 	int			targetlevel;
! 	ItemPointer leafhikey;
! 	BlockNumber nextchild;
! 	BlockNumber	topblkno;
! 
! 	page = BufferGetPage(leafbuf);
! 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! 
! 	Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
! 
! 	/*
! 	 * If the leaf page still has a parent pointing to it (or a chain of
! 	 * parents), we don't unlink the leaf page yet, but the topmost remaining
! 	 * parent in the branch.
! 	 */
! 	itemid = PageGetItemId(page, P_HIKEY);
! 	leafhikey = &((IndexTuple) PageGetItem(page, itemid))->t_tid;
! 
! 	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
! 
! 	if (ItemPointerIsValid(leafhikey))
! 	{
! 		topblkno = ItemPointerGetBlockNumber(leafhikey);
! 		target = topblkno;
! 
! 		buf = _bt_getbuf(rel, topblkno, BT_READ);
! 		page = BufferGetPage(buf);
! 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! 		leftsib = opaque->btpo_prev;
! 		targetlevel = opaque->btpo.level;
! 
! 		/*
! 		 * To avoid deadlocks, we'd better drop the target page lock before
! 		 * going further.
! 		 */
! 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
! 	}
! 	else
! 	{
! 		topblkno = InvalidBlockNumber;
! 		target = leafblkno;
! 		buf = leafbuf;
! 		targetlevel = 0;
! 	}
! 
! 	leftsib = opaque->btpo_prev;
  
  	/*
  	 * We have to lock the pages we need to modify in the standard order:
  	 * moving right, then up.  Else we will deadlock against other writers.
  	 *
! 	 * So, first lock the leaf page, if it's not the target.
! 	 * Then find and write-lock the current left sibling of the
  	 * target page.  The sibling that was current a moment ago could have
  	 * split, so we may have to move right.  This search could fail if either
  	 * the sibling or the target page was deleted by someone else meanwhile;
***************
*** 1203,1208 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
--- 1476,1483 ----
  	 * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
  	 * concurrently on the same table.)
  	 */
+ 	if (target != leafblkno)
+ 		LockBuffer(leafbuf, BT_WRITE);
  	if (leftsib != P_NONE)
  	{
  		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
***************
*** 1217,1223 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  			{
  				elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
  					 RelationGetRelationName(rel));
! 				return 0;
  			}
  			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
  			page = BufferGetPage(lbuf);
--- 1492,1498 ----
  			{
  				elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
  					 RelationGetRelationName(rel));
! 				return false;
  			}
  			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
  			page = BufferGetPage(lbuf);
***************
*** 1232,1258 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	 * a write lock not a superexclusive lock, since no scans would stop on an
  	 * empty page.
  	 */
! 	buf = _bt_getbuf(rel, target, BT_WRITE);
  	page = BufferGetPage(buf);
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  
  	/*
! 	 * Check page is still empty etc, else abandon deletion.  The empty check
! 	 * is necessary since someone else might have inserted into it while we
! 	 * didn't have it locked; the others are just for paranoia's sake.
  	 */
! 	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
! 		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
  	{
! 		_bt_relbuf(rel, buf);
! 		if (BufferIsValid(lbuf))
! 			_bt_relbuf(rel, lbuf);
! 		return 0;
  	}
  	if (opaque->btpo_prev != leftsib)
  		elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
  			 target, RelationGetRelationName(rel));
  
  	/*
  	 * And next write-lock the (current) right sibling.
  	 */
--- 1507,1550 ----
  	 * a write lock not a superexclusive lock, since no scans would stop on an
  	 * empty page.
  	 */
! 	LockBuffer(buf, BT_WRITE);
  	page = BufferGetPage(buf);
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  
  	/*
! 	 * Check page is still empty etc, else abandon deletion.  This is just
! 	 * for paranoia's sake; a half-dead page cannot resurrect because there
! 	 * can be only one vacuum process running at a time.
  	 */
! 	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
  	{
! 		elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
! 			 target, RelationGetRelationName(rel));
  	}
  	if (opaque->btpo_prev != leftsib)
  		elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
  			 target, RelationGetRelationName(rel));
  
+ 	if (target == leafblkno)
+ 	{
+ 		if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+ 			!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
+ 			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
+ 				 target, RelationGetRelationName(rel));
+ 		nextchild = InvalidBlockNumber;
+ 	}
+ 	else
+ 	{
+ 		if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
+ 			P_ISLEAF(opaque))
+ 			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
+ 				 target, RelationGetRelationName(rel));
+ 
+ 		/* remember the next child down in the branch. */
+ 		itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
+ 		nextchild = ItemPointerGetBlockNumber(&((IndexTuple) PageGetItem(page, itemid))->t_tid);
+ 	}
+ 
  	/*
  	 * And next write-lock the (current) right sibling.
  	 */
***************
*** 1265,1314 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  			 "block %u links to %u instead of expected %u in index \"%s\"",
  			 rightsib, opaque->btpo_prev, target,
  			 RelationGetRelationName(rel));
! 
! 	/*
! 	 * Any insert which would have gone on the target block will now go to the
! 	 * right sibling block.
! 	 */
! 	PredicateLockPageCombine(rel, target, rightsib);
! 
! 	/*
! 	 * Next find and write-lock the current parent of the target page. This is
! 	 * essentially the same as the corresponding step of splitting.
! 	 */
! 	ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
! 	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
! 	if (pbuf == InvalidBuffer)
! 		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
! 			 RelationGetRelationName(rel), target);
! 	parent = stack->bts_blkno;
! 	poffset = stack->bts_offset;
! 
! 	/*
! 	 * If the target is the rightmost child of its parent, then we can't
! 	 * delete, unless it's also the only child --- in which case the parent
! 	 * changes to half-dead status.  The "can't delete" case should have been
! 	 * detected by _bt_parent_deletion_safe, so complain if we see it now.
! 	 */
! 	page = BufferGetPage(pbuf);
! 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
! 	maxoff = PageGetMaxOffsetNumber(page);
! 	parent_half_dead = false;
! 	parent_one_child = false;
! 	if (poffset >= maxoff)
! 	{
! 		if (poffset == P_FIRSTDATAKEY(opaque))
! 			parent_half_dead = true;
! 		else
! 			elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
! 				 target, parent, RelationGetRelationName(rel));
! 	}
! 	else
! 	{
! 		/* Will there be exactly one child left in this parent? */
! 		if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
! 			parent_one_child = true;
! 	}
  
  	/*
  	 * If we are deleting the next-to-last page on the target's level, then
--- 1557,1564 ----
  			 "block %u links to %u instead of expected %u in index \"%s\"",
  			 rightsib, opaque->btpo_prev, target,
  			 RelationGetRelationName(rel));
! 	rightsib_is_rightmost = P_RIGHTMOST(opaque);
! 	*rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
  
  	/*
  	 * If we are deleting the next-to-last page on the target's level, then
***************
*** 1322,1332 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	 * We can safely acquire a lock on the metapage here --- see comments for
  	 * _bt_newroot().
  	 */
! 	if (leftsib == P_NONE && !parent_half_dead)
  	{
  		page = BufferGetPage(rbuf);
  		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- 		Assert(opaque->btpo.level == targetlevel);
  		if (P_RIGHTMOST(opaque))
  		{
  			/* rightsib will be the only one left on the level */
--- 1572,1581 ----
  	 * We can safely acquire a lock on the metapage here --- see comments for
  	 * _bt_newroot().
  	 */
! 	if (leftsib == P_NONE && rightsib_is_rightmost)
  	{
  		page = BufferGetPage(rbuf);
  		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  		if (P_RIGHTMOST(opaque))
  		{
  			/* rightsib will be the only one left on the level */
***************
*** 1349,1386 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	}
  
  	/*
- 	 * Check that the parent-page index items we're about to delete/overwrite
- 	 * contain what we expect.	This can fail if the index has become corrupt
- 	 * for some reason.  We want to throw any error before entering the
- 	 * critical section --- otherwise it'd be a PANIC.
- 	 *
- 	 * The test on the target item is just an Assert because _bt_getstackbuf
- 	 * should have guaranteed it has the expected contents.  The test on the
- 	 * next-child downlink is known to sometimes fail in the field, though.
- 	 */
- 	page = BufferGetPage(pbuf);
- 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- 
- #ifdef USE_ASSERT_CHECKING
- 	itemid = PageGetItemId(page, poffset);
- 	itup = (IndexTuple) PageGetItem(page, itemid);
- 	Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
- #endif
- 
- 	if (!parent_half_dead)
- 	{
- 		OffsetNumber nextoffset;
- 
- 		nextoffset = OffsetNumberNext(poffset);
- 		itemid = PageGetItemId(page, nextoffset);
- 		itup = (IndexTuple) PageGetItem(page, itemid);
- 		if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
- 			elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
- 				 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
- 				 parent, RelationGetRelationName(rel));
- 	}
- 
- 	/*
  	 * Here we begin doing the deletion.
  	 */
  
--- 1598,1603 ----
***************
*** 1388,1416 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	START_CRIT_SECTION();
  
  	/*
- 	 * Update parent.  The normal case is a tad tricky because we want to
- 	 * delete the target's downlink and the *following* key.  Easiest way is
- 	 * to copy the right sibling's downlink over the target downlink, and then
- 	 * delete the following item.
- 	 */
- 	if (parent_half_dead)
- 	{
- 		PageIndexTupleDelete(page, poffset);
- 		opaque->btpo_flags |= BTP_HALF_DEAD;
- 	}
- 	else
- 	{
- 		OffsetNumber nextoffset;
- 
- 		itemid = PageGetItemId(page, poffset);
- 		itup = (IndexTuple) PageGetItem(page, itemid);
- 		ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
- 
- 		nextoffset = OffsetNumberNext(poffset);
- 		PageIndexTupleDelete(page, nextoffset);
- 	}
- 
- 	/*
  	 * Update siblings' side-links.  Note the target page's side-links will
  	 * continue to point to the siblings.  Asserts here are just rechecking
  	 * things we already verified above.
--- 1605,1610 ----
***************
*** 1426,1432 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  	Assert(opaque->btpo_prev == target);
  	opaque->btpo_prev = leftsib;
! 	rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
  
  	/*
  	 * Mark the page itself deleted.  It can be recycled when all current
--- 1620,1640 ----
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  	Assert(opaque->btpo_prev == target);
  	opaque->btpo_prev = leftsib;
! 
! 	/*
! 	 * If we deleted a parent of the targeted leaf page, instead of the leaf
! 	 * itself, update the leaf to point to the next remaining child in the
! 	 * branch.
! 	 */
! 	if (target != leafblkno)
! 	{
! 		if (nextchild == leafblkno)
! 			ItemPointerSetInvalid(leafhikey);
! 		else
! 			ItemPointerSet(leafhikey, nextchild, P_HIKEY);
! 
! 		MarkBufferDirty(leafbuf);
! 	}
  
  	/*
  	 * Mark the page itself deleted.  It can be recycled when all current
***************
*** 1453,1459 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	}
  
  	/* Must mark buffers dirty before XLogInsert */
- 	MarkBufferDirty(pbuf);
  	MarkBufferDirty(rbuf);
  	MarkBufferDirty(buf);
  	if (BufferIsValid(lbuf))
--- 1661,1666 ----
***************
*** 1462,1483 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  	/* XLOG stuff */
  	if (RelationNeedsWAL(rel))
  	{
! 		xl_btree_delete_page xlrec;
  		xl_btree_metadata xlmeta;
  		uint8		xlinfo;
  		XLogRecPtr	recptr;
! 		XLogRecData rdata[5];
  		XLogRecData *nextrdata;
  
! 		xlrec.target.node = rel->rd_node;
! 		ItemPointerSet(&(xlrec.target.tid), parent, poffset);
  		xlrec.deadblk = target;
  		xlrec.leftblk = leftsib;
  		xlrec.rightblk = rightsib;
  		xlrec.btpo_xact = opaque->btpo.xact;
  
  		rdata[0].data = (char *) &xlrec;
! 		rdata[0].len = SizeOfBtreeDeletePage;
  		rdata[0].buffer = InvalidBuffer;
  		rdata[0].next = nextrdata = &(rdata[1]);
  
--- 1669,1691 ----
  	/* XLOG stuff */
  	if (RelationNeedsWAL(rel))
  	{
! 		xl_btree_unlink_page xlrec;
  		xl_btree_metadata xlmeta;
  		uint8		xlinfo;
  		XLogRecPtr	recptr;
! 		XLogRecData rdata[4];
  		XLogRecData *nextrdata;
  
! 		xlrec.node = rel->rd_node;
  		xlrec.deadblk = target;
  		xlrec.leftblk = leftsib;
  		xlrec.rightblk = rightsib;
+ 		xlrec.leafblk = leafblkno;
+ 		xlrec.downlink = nextchild;
  		xlrec.btpo_xact = opaque->btpo.xact;
  
  		rdata[0].data = (char *) &xlrec;
! 		rdata[0].len = SizeOfBtreeUnlinkPage;
  		rdata[0].buffer = InvalidBuffer;
  		rdata[0].next = nextrdata = &(rdata[1]);
  
***************
*** 1493,1511 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  			nextrdata->buffer = InvalidBuffer;
  			nextrdata->next = nextrdata + 1;
  			nextrdata++;
! 			xlinfo = XLOG_BTREE_DELETE_PAGE_META;
  		}
- 		else if (parent_half_dead)
- 			xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
  		else
! 			xlinfo = XLOG_BTREE_DELETE_PAGE;
! 
! 		nextrdata->data = NULL;
! 		nextrdata->len = 0;
! 		nextrdata->next = nextrdata + 1;
! 		nextrdata->buffer = pbuf;
! 		nextrdata->buffer_std = true;
! 		nextrdata++;
  
  		nextrdata->data = NULL;
  		nextrdata->len = 0;
--- 1701,1710 ----
  			nextrdata->buffer = InvalidBuffer;
  			nextrdata->next = nextrdata + 1;
  			nextrdata++;
! 			xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
  		}
  		else
! 			xlinfo = XLOG_BTREE_UNLINK_PAGE;
  
  		nextrdata->data = NULL;
  		nextrdata->len = 0;
***************
*** 1530,1537 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  		{
  			PageSetLSN(metapg, recptr);
  		}
- 		page = BufferGetPage(pbuf);
- 		PageSetLSN(page, recptr);
  		page = BufferGetPage(rbuf);
  		PageSetLSN(page, recptr);
  		page = BufferGetPage(buf);
--- 1729,1734 ----
***************
*** 1551,1596 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
  		CacheInvalidateRelcache(rel);
  		_bt_relbuf(rel, metabuf);
  	}
! 	/* can always release leftsib immediately */
  	if (BufferIsValid(lbuf))
  		_bt_relbuf(rel, lbuf);
  
! 	/*
! 	 * If parent became half dead, recurse to delete it. Otherwise, if right
! 	 * sibling is empty and is now the last child of the parent, recurse to
! 	 * try to delete it.  (These cases cannot apply at the same time, though
! 	 * the second case might itself recurse to the first.)
! 	 *
! 	 * When recursing to parent, we hold the lock on the target page until
! 	 * done.  This delays any insertions into the keyspace that was just
! 	 * effectively reassigned to the parent's right sibling.  If we allowed
! 	 * that, and there were enough such insertions before we finish deleting
! 	 * the parent, page splits within that keyspace could lead to inserting
! 	 * out-of-order keys into the grandparent level.  It is thought that that
! 	 * wouldn't have any serious consequences, but it still seems like a
! 	 * pretty bad idea.
! 	 */
! 	if (parent_half_dead)
! 	{
! 		/* recursive call will release pbuf */
! 		_bt_relbuf(rel, rbuf);
! 		result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1;
  		_bt_relbuf(rel, buf);
- 	}
- 	else if (parent_one_child && rightsib_empty)
- 	{
- 		_bt_relbuf(rel, pbuf);
- 		_bt_relbuf(rel, buf);
- 		/* recursive call will release rbuf */
- 		result = _bt_pagedel(rel, rbuf, stack) + 1;
- 	}
- 	else
- 	{
- 		_bt_relbuf(rel, pbuf);
- 		_bt_relbuf(rel, buf);
- 		_bt_relbuf(rel, rbuf);
- 		result = 1;
- 	}
  
! 	return result;
  }
--- 1748,1760 ----
  		CacheInvalidateRelcache(rel);
  		_bt_relbuf(rel, metabuf);
  	}
! 	/* release siblings */
  	if (BufferIsValid(lbuf))
  		_bt_relbuf(rel, lbuf);
+ 	_bt_relbuf(rel, rbuf);
  
! 	if (target != leafblkno)
  		_bt_relbuf(rel, buf);
  
! 	return true;
  }
*** a/src/backend/access/nbtree/nbtxlog.c
--- b/src/backend/access/nbtree/nbtxlog.c
***************
*** 25,65 ****
   * them manually if they are not seen in the WAL log during replay.  This
   * makes it safe for page insertion to be a multiple-WAL-action process.
   *
-  * Similarly, deletion of an only child page and deletion of its parent page
-  * form multiple WAL log entries, and we have to be prepared to follow through
-  * with the deletion if the log ends between.
-  *
   * The data structure is a simple linked list --- this should be good enough,
!  * since we don't expect a page split or multi deletion to remain incomplete
!  * for long.  In any case we need to respect the order of operations.
   */
! typedef struct bt_incomplete_action
  {
  	RelFileNode node;			/* the index */
- 	bool		is_split;		/* T = pending split, F = pending delete */
- 	/* these fields are for a split: */
  	bool		is_root;		/* we split the root */
  	BlockNumber leftblk;		/* left half of split */
  	BlockNumber rightblk;		/* right half of split */
! 	/* these fields are for a delete: */
! 	BlockNumber delblk;			/* parent block to be deleted */
! } bt_incomplete_action;
  
! static List *incomplete_actions;
  
  
  static void
  log_incomplete_split(RelFileNode node, BlockNumber leftblk,
  					 BlockNumber rightblk, bool is_root)
  {
! 	bt_incomplete_action *action = palloc(sizeof(bt_incomplete_action));
  
  	action->node = node;
- 	action->is_split = true;
  	action->is_root = is_root;
  	action->leftblk = leftblk;
  	action->rightblk = rightblk;
! 	incomplete_actions = lappend(incomplete_actions, action);
  }
  
  static void
--- 25,56 ----
   * them manually if they are not seen in the WAL log during replay.  This
   * makes it safe for page insertion to be a multiple-WAL-action process.
   *
   * The data structure is a simple linked list --- this should be good enough,
!  * since we don't expect a page split to remain incomplete for long.  In any
!  * case we need to respect the order of operations.
   */
! typedef struct bt_incomplete_split
  {
  	RelFileNode node;			/* the index */
  	bool		is_root;		/* we split the root */
  	BlockNumber leftblk;		/* left half of split */
  	BlockNumber rightblk;		/* right half of split */
! } bt_incomplete_split;
  
! static List *incomplete_splits;
  
  
  static void
  log_incomplete_split(RelFileNode node, BlockNumber leftblk,
  					 BlockNumber rightblk, bool is_root)
  {
! 	bt_incomplete_split *action = palloc(sizeof(bt_incomplete_split));
  
  	action->node = node;
  	action->is_root = is_root;
  	action->leftblk = leftblk;
  	action->rightblk = rightblk;
! 	incomplete_splits = lappend(incomplete_splits, action);
  }
  
  static void
***************
*** 67,115 **** forget_matching_split(RelFileNode node, BlockNumber downlink, bool is_root)
  {
  	ListCell   *l;
  
! 	foreach(l, incomplete_actions)
  	{
! 		bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
  
  		if (RelFileNodeEquals(node, action->node) &&
- 			action->is_split &&
  			downlink == action->rightblk)
  		{
  			if (is_root != action->is_root)
  				elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)",
  					 action->is_root, is_root);
! 			incomplete_actions = list_delete_ptr(incomplete_actions, action);
! 			pfree(action);
! 			break;				/* need not look further */
! 		}
! 	}
! }
! 
! static void
! log_incomplete_deletion(RelFileNode node, BlockNumber delblk)
! {
! 	bt_incomplete_action *action = palloc(sizeof(bt_incomplete_action));
! 
! 	action->node = node;
! 	action->is_split = false;
! 	action->delblk = delblk;
! 	incomplete_actions = lappend(incomplete_actions, action);
! }
! 
! static void
! forget_matching_deletion(RelFileNode node, BlockNumber delblk)
! {
! 	ListCell   *l;
! 
! 	foreach(l, incomplete_actions)
! 	{
! 		bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
! 
! 		if (RelFileNodeEquals(node, action->node) &&
! 			!action->is_split &&
! 			delblk == action->delblk)
! 		{
! 			incomplete_actions = list_delete_ptr(incomplete_actions, action);
  			pfree(action);
  			break;				/* need not look further */
  		}
--- 58,74 ----
  {
  	ListCell   *l;
  
! 	foreach(l, incomplete_splits)
  	{
! 		bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l);
  
  		if (RelFileNodeEquals(node, action->node) &&
  			downlink == action->rightblk)
  		{
  			if (is_root != action->is_root)
  				elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)",
  					 action->is_root, is_root);
! 			incomplete_splits = list_delete_ptr(incomplete_splits, action);
  			pfree(action);
  			break;				/* need not look further */
  		}
***************
*** 779,799 **** btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
  }
  
  static void
! btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
  {
! 	xl_btree_delete_page *xlrec = (xl_btree_delete_page *) XLogRecGetData(record);
  	BlockNumber parent;
- 	BlockNumber target;
- 	BlockNumber leftsib;
- 	BlockNumber rightsib;
  	Buffer		buffer;
  	Page		page;
  	BTPageOpaque pageop;
  
  	parent = ItemPointerGetBlockNumber(&(xlrec->target.tid));
- 	target = xlrec->deadblk;
- 	leftsib = xlrec->leftblk;
- 	rightsib = xlrec->rightblk;
  
  	/*
  	 * In normal operation, we would lock all the pages this WAL record
--- 738,753 ----
  }
  
  static void
! btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record)
  {
! 	xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
  	BlockNumber parent;
  	Buffer		buffer;
  	Page		page;
  	BTPageOpaque pageop;
+ 	IndexTupleData trunctuple;
  
  	parent = ItemPointerGetBlockNumber(&(xlrec->target.tid));
  
  	/*
  	 * In normal operation, we would lock all the pages this WAL record
***************
*** 813,861 **** btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
  		{
  			page = (Page) BufferGetPage(buffer);
  			pageop = (BTPageOpaque) PageGetSpecialPointer(page);
! 			if (lsn <= PageGetLSN(page))
! 			{
! 				UnlockReleaseBuffer(buffer);
! 			}
! 			else
  			{
  				OffsetNumber poffset;
  
  				poffset = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
! 				if (poffset >= PageGetMaxOffsetNumber(page))
! 				{
! 					Assert(info == XLOG_BTREE_DELETE_PAGE_HALF);
! 					Assert(poffset == P_FIRSTDATAKEY(pageop));
! 					PageIndexTupleDelete(page, poffset);
! 					pageop->btpo_flags |= BTP_HALF_DEAD;
! 				}
! 				else
! 				{
! 					ItemId		itemid;
! 					IndexTuple	itup;
! 					OffsetNumber nextoffset;
! 
! 					Assert(info != XLOG_BTREE_DELETE_PAGE_HALF);
! 					itemid = PageGetItemId(page, poffset);
! 					itup = (IndexTuple) PageGetItem(page, itemid);
! 					ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
! 					nextoffset = OffsetNumberNext(poffset);
! 					PageIndexTupleDelete(page, nextoffset);
! 				}
  
  				PageSetLSN(page, lsn);
  				MarkBufferDirty(buffer);
- 				UnlockReleaseBuffer(buffer);
  			}
  		}
  	}
  
  	/* Fix left-link of right sibling */
! 	if (record->xl_info & XLR_BKP_BLOCK(1))
! 		(void) RestoreBackupBlock(lsn, record, 1, false, false);
  	else
  	{
! 		buffer = XLogReadBuffer(xlrec->target.node, rightsib, false);
  		if (BufferIsValid(buffer))
  		{
  			page = (Page) BufferGetPage(buffer);
--- 767,856 ----
  		{
  			page = (Page) BufferGetPage(buffer);
  			pageop = (BTPageOpaque) PageGetSpecialPointer(page);
! 			if (lsn > PageGetLSN(page))
  			{
  				OffsetNumber poffset;
+ 				ItemId		itemid;
+ 				IndexTuple	itup;
+ 				OffsetNumber nextoffset;
+ 				BlockNumber	rightsib;
  
  				poffset = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
! 
! 				nextoffset = OffsetNumberNext(poffset);
! 				itemid = PageGetItemId(page, nextoffset);
! 				itup = (IndexTuple) PageGetItem(page, itemid);
! 				rightsib = ItemPointerGetBlockNumber(&itup->t_tid);
! 
! 				itemid = PageGetItemId(page, poffset);
! 				itup = (IndexTuple) PageGetItem(page, itemid);
! 				ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
! 				nextoffset = OffsetNumberNext(poffset);
! 				PageIndexTupleDelete(page, nextoffset);
  
  				PageSetLSN(page, lsn);
  				MarkBufferDirty(buffer);
  			}
+ 			UnlockReleaseBuffer(buffer);
  		}
  	}
  
+ 	/* Rewrite the leaf page as a halfdead page */
+ 	buffer = XLogReadBuffer(xlrec->target.node, xlrec->leafblk, true);
+ 	Assert(BufferIsValid(buffer));
+ 	page = (Page) BufferGetPage(buffer);
+ 
+ 	_bt_pageinit(page, BufferGetPageSize(buffer));
+ 	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ 
+ 	pageop->btpo_prev = xlrec->leftblk;
+ 	pageop->btpo_next = xlrec->rightblk;
+ 	pageop->btpo.level = 0;
+ 	pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
+ 	pageop->btpo_cycleid = 0;
+ 
+ 	/* a dummy hikey that points to the next parent to be deleted (if any) */
+ 	trunctuple.t_info = sizeof(IndexTupleData);
+ 	ItemPointerSet(&trunctuple.t_tid, xlrec->downlink, P_HIKEY);
+ 	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
+ 					false, false) == InvalidOffsetNumber)
+ 		elog(ERROR, "could not add dummy high key to half-dead page");
+ 
+ 	PageSetLSN(page, lsn);
+ 	MarkBufferDirty(buffer);
+ 	UnlockReleaseBuffer(buffer);
+ }
+ 
+ 
+ static void
+ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
+ 	BlockNumber target;
+ 	BlockNumber leftsib;
+ 	BlockNumber rightsib;
+ 	Buffer		buffer;
+ 	Page		page;
+ 	BTPageOpaque pageop;
+ 
+ 	target = xlrec->deadblk;
+ 	leftsib = xlrec->leftblk;
+ 	rightsib = xlrec->rightblk;
+ 
+ 	/*
+ 	 * In normal operation, we would lock all the pages this WAL record
+ 	 * touches before changing any of them.  In WAL replay, it should be okay
+ 	 * to lock just one page at a time, since no concurrent index updates can
+ 	 * be happening, and readers should not care whether they arrive at the
+ 	 * target page or not (since it's surely empty).
+ 	 */
+ 
  	/* Fix left-link of right sibling */
! 	if (record->xl_info & XLR_BKP_BLOCK(0))
! 		(void) RestoreBackupBlock(lsn, record, 0, false, false);
  	else
  	{
! 		buffer = XLogReadBuffer(xlrec->node, rightsib, false);
  		if (BufferIsValid(buffer))
  		{
  			page = (Page) BufferGetPage(buffer);
***************
*** 876,888 **** btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
  	}
  
  	/* Fix right-link of left sibling, if any */
! 	if (record->xl_info & XLR_BKP_BLOCK(2))
! 		(void) RestoreBackupBlock(lsn, record, 2, false, false);
  	else
  	{
  		if (leftsib != P_NONE)
  		{
! 			buffer = XLogReadBuffer(xlrec->target.node, leftsib, false);
  			if (BufferIsValid(buffer))
  			{
  				page = (Page) BufferGetPage(buffer);
--- 871,883 ----
  	}
  
  	/* Fix right-link of left sibling, if any */
! 	if (record->xl_info & XLR_BKP_BLOCK(1))
! 		(void) RestoreBackupBlock(lsn, record, 1, false, false);
  	else
  	{
  		if (leftsib != P_NONE)
  		{
! 			buffer = XLogReadBuffer(xlrec->node, leftsib, false);
  			if (BufferIsValid(buffer))
  			{
  				page = (Page) BufferGetPage(buffer);
***************
*** 904,910 **** btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
  	}
  
  	/* Rewrite target page as empty deleted page */
! 	buffer = XLogReadBuffer(xlrec->target.node, target, true);
  	Assert(BufferIsValid(buffer));
  	page = (Page) BufferGetPage(buffer);
  
--- 899,905 ----
  	}
  
  	/* Rewrite target page as empty deleted page */
! 	buffer = XLogReadBuffer(xlrec->node, target, true);
  	Assert(BufferIsValid(buffer));
  	page = (Page) BufferGetPage(buffer);
  
***************
*** 921,944 **** btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
  	MarkBufferDirty(buffer);
  	UnlockReleaseBuffer(buffer);
  
  	/* Update metapage if needed */
! 	if (info == XLOG_BTREE_DELETE_PAGE_META)
  	{
  		xl_btree_metadata md;
  
! 		memcpy(&md, (char *) xlrec + SizeOfBtreeDeletePage,
  			   sizeof(xl_btree_metadata));
! 		_bt_restore_meta(xlrec->target.node, lsn,
  						 md.root, md.level,
  						 md.fastroot, md.fastlevel);
  	}
- 
- 	/* Forget any completed deletion */
- 	forget_matching_deletion(xlrec->target.node, target);
- 
- 	/* If parent became half-dead, remember it for deletion */
- 	if (info == XLOG_BTREE_DELETE_PAGE_HALF)
- 		log_incomplete_deletion(xlrec->target.node, parent);
  }
  
  static void
--- 916,934 ----
  	MarkBufferDirty(buffer);
  	UnlockReleaseBuffer(buffer);
  
+ 	/* Update leaf block */
+ 
  	/* Update metapage if needed */
! 	if (info == XLOG_BTREE_UNLINK_PAGE_META)
  	{
  		xl_btree_metadata md;
  
! 		memcpy(&md, (char *) xlrec + SizeOfBtreeUnlinkPage,
  			   sizeof(xl_btree_metadata));
! 		_bt_restore_meta(xlrec->node, lsn,
  						 md.root, md.level,
  						 md.fastroot, md.fastlevel);
  	}
  }
  
  static void
***************
*** 1053,1062 **** btree_redo(XLogRecPtr lsn, XLogRecord *record)
  		case XLOG_BTREE_DELETE:
  			btree_xlog_delete(lsn, record);
  			break;
! 		case XLOG_BTREE_DELETE_PAGE:
! 		case XLOG_BTREE_DELETE_PAGE_META:
! 		case XLOG_BTREE_DELETE_PAGE_HALF:
! 			btree_xlog_delete_page(info, lsn, record);
  			break;
  		case XLOG_BTREE_NEWROOT:
  			btree_xlog_newroot(lsn, record);
--- 1043,1054 ----
  		case XLOG_BTREE_DELETE:
  			btree_xlog_delete(lsn, record);
  			break;
! 		case XLOG_BTREE_MARK_PAGE_HALFDEAD:
! 			btree_xlog_mark_page_halfdead(info, lsn, record);
! 			break;
! 		case XLOG_BTREE_UNLINK_PAGE:
! 		case XLOG_BTREE_UNLINK_PAGE_META:
! 			btree_xlog_unlink_page(info, lsn, record);
  			break;
  		case XLOG_BTREE_NEWROOT:
  			btree_xlog_newroot(lsn, record);
***************
*** 1072,1078 **** btree_redo(XLogRecPtr lsn, XLogRecord *record)
  void
  btree_xlog_startup(void)
  {
! 	incomplete_actions = NIL;
  }
  
  void
--- 1064,1070 ----
  void
  btree_xlog_startup(void)
  {
! 	incomplete_splits = NIL;
  }
  
  void
***************
*** 1080,1146 **** btree_xlog_cleanup(void)
  {
  	ListCell   *l;
  
! 	foreach(l, incomplete_actions)
  	{
! 		bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
! 
! 		if (action->is_split)
! 		{
! 			/* finish an incomplete split */
! 			Buffer		lbuf,
! 						rbuf;
! 			Page		lpage,
! 						rpage;
! 			BTPageOpaque lpageop,
! 						rpageop;
! 			bool		is_only;
! 			Relation	reln;
! 
! 			lbuf = XLogReadBuffer(action->node, action->leftblk, false);
! 			/* failure is impossible because we wrote this page earlier */
! 			if (!BufferIsValid(lbuf))
! 				elog(PANIC, "btree_xlog_cleanup: left block unfound");
! 			lpage = (Page) BufferGetPage(lbuf);
! 			lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
! 			rbuf = XLogReadBuffer(action->node, action->rightblk, false);
! 			/* failure is impossible because we wrote this page earlier */
! 			if (!BufferIsValid(rbuf))
! 				elog(PANIC, "btree_xlog_cleanup: right block unfound");
! 			rpage = (Page) BufferGetPage(rbuf);
! 			rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
! 
! 			/* if the pages are all of their level, it's a only-page split */
! 			is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
! 
! 			reln = CreateFakeRelcacheEntry(action->node);
! 			_bt_insert_parent(reln, lbuf, rbuf, NULL,
! 							  action->is_root, is_only);
! 			FreeFakeRelcacheEntry(reln);
! 		}
! 		else
! 		{
! 			/* finish an incomplete deletion (of a half-dead page) */
! 			Buffer		buf;
! 
! 			buf = XLogReadBuffer(action->node, action->delblk, false);
! 			if (BufferIsValid(buf))
! 			{
! 				Relation	reln;
! 
! 				reln = CreateFakeRelcacheEntry(action->node);
! 				if (_bt_pagedel(reln, buf, NULL) == 0)
! 					elog(PANIC, "btree_xlog_cleanup: _bt_pagedel failed");
! 				FreeFakeRelcacheEntry(reln);
! 			}
! 		}
  	}
! 	incomplete_actions = NIL;
  }
  
  bool
  btree_safe_restartpoint(void)
  {
! 	if (incomplete_actions)
  		return false;
  	return true;
  }
--- 1072,1118 ----
  {
  	ListCell   *l;
  
! 	foreach(l, incomplete_splits)
  	{
! 		/* finish an incomplete split */
! 		bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l);
! 		Buffer		lbuf,
! 					rbuf;
! 		Page		lpage,
! 					rpage;
! 		BTPageOpaque lpageop,
! 					rpageop;
! 		bool		is_only;
! 		Relation	reln;
! 
! 		lbuf = XLogReadBuffer(action->node, action->leftblk, false);
! 		/* failure is impossible because we wrote this page earlier */
! 		if (!BufferIsValid(lbuf))
! 			elog(PANIC, "btree_xlog_cleanup: left block unfound");
! 		lpage = (Page) BufferGetPage(lbuf);
! 		lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
! 		rbuf = XLogReadBuffer(action->node, action->rightblk, false);
! 		/* failure is impossible because we wrote this page earlier */
! 		if (!BufferIsValid(rbuf))
! 			elog(PANIC, "btree_xlog_cleanup: right block unfound");
! 		rpage = (Page) BufferGetPage(rbuf);
! 		rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
! 
! 		/* if the pages are all of their level, it's a only-page split */
! 		is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
! 
! 		reln = CreateFakeRelcacheEntry(action->node);
! 		_bt_insert_parent(reln, lbuf, rbuf, NULL,
! 						  action->is_root, is_only);
! 		FreeFakeRelcacheEntry(reln);
  	}
! 	incomplete_splits = NIL;
  }
  
  bool
  btree_safe_restartpoint(void)
  {
! 	if (incomplete_splits)
  		return false;
  	return true;
  }
*** a/src/backend/access/rmgrdesc/nbtdesc.c
--- b/src/backend/access/rmgrdesc/nbtdesc.c
***************
*** 124,139 **** btree_desc(StringInfo buf, uint8 xl_info, char *rec)
  								 xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode);
  				break;
  			}
! 		case XLOG_BTREE_DELETE_PAGE:
! 		case XLOG_BTREE_DELETE_PAGE_META:
! 		case XLOG_BTREE_DELETE_PAGE_HALF:
  			{
! 				xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;
  
! 				appendStringInfoString(buf, "delete_page: ");
  				out_target(buf, &(xlrec->target));
! 				appendStringInfo(buf, "; dead %u; left %u; right %u",
! 							xlrec->deadblk, xlrec->leftblk, xlrec->rightblk);
  				break;
  			}
  		case XLOG_BTREE_NEWROOT:
--- 124,149 ----
  								 xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode);
  				break;
  			}
! 		case XLOG_BTREE_MARK_PAGE_HALFDEAD:
  			{
! 				xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) rec;
  
! 				appendStringInfoString(buf, "mark_page_halfdead: ");
  				out_target(buf, &(xlrec->target));
! 				appendStringInfo(buf, "; downlink %u; leaf %u; left %u; right %u",
! 								 xlrec->leafblk, xlrec->leftblk, xlrec->rightblk, xlrec->downlink);
! 				break;
! 			}
! 		case XLOG_BTREE_UNLINK_PAGE_META:
! 		case XLOG_BTREE_UNLINK_PAGE:
! 			{
! 				xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec;
! 
! 				appendStringInfoString(buf, "unlink_page: ");
! 				appendStringInfo(buf, "rel %u/%u/%u; ",
! 								 xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
! 				appendStringInfo(buf, "; dead %u; left %u; right %u; leaf %u; downlink %u, btpo_xact %u",
! 								 xlrec->deadblk, xlrec->leftblk, xlrec->rightblk, xlrec->leafblk, xlrec->downlink, xlrec->btpo_xact);
  				break;
  			}
  		case XLOG_BTREE_NEWROOT:
*** a/src/include/access/nbtree.h
--- b/src/include/access/nbtree.h
***************
*** 215,225 **** typedef struct BTMetaPageData
  #define XLOG_BTREE_SPLIT_L_ROOT 0x50	/* add tuple with split of root */
  #define XLOG_BTREE_SPLIT_R_ROOT 0x60	/* as above, new item on right */
  #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
! #define XLOG_BTREE_DELETE_PAGE	0x80	/* delete an entire page */
! #define XLOG_BTREE_DELETE_PAGE_META 0x90		/* same, and update metapage */
  #define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
! #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0		/* page deletion that makes
! 												 * parent half-dead */
  #define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
  										 * vacuum */
  #define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
--- 215,224 ----
  #define XLOG_BTREE_SPLIT_L_ROOT 0x50	/* add tuple with split of root */
  #define XLOG_BTREE_SPLIT_R_ROOT 0x60	/* as above, new item on right */
  #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
! #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
! #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
  #define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
! #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */
  #define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
  										 * vacuum */
  #define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
***************
*** 370,392 **** typedef struct xl_btree_vacuum
  #define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
  
  /*
!  * This is what we need to know about deletion of a btree page.  The target
!  * identifies the tuple removed from the parent page (note that we remove
!  * this tuple's downlink and the *following* tuple's key).	Note we do not
!  * store any content for the deleted page --- it is just rewritten as empty
!  * during recovery, apart from resetting the btpo.xact.
   */
! typedef struct xl_btree_delete_page
  {
  	xl_btreetid target;			/* deleted tuple id in parent page */
! 	BlockNumber deadblk;		/* child block being deleted */
! 	BlockNumber leftblk;		/* child block's left sibling, if any */
! 	BlockNumber rightblk;		/* child block's right sibling */
  	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
! 	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
! } xl_btree_delete_page;
  
! #define SizeOfBtreeDeletePage	(offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))
  
  /*
   * New root log record.  There are zero tuples if this is to establish an
--- 369,410 ----
  #define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
  
  /*
!  * This is what we need to know about marking an empty branch for deletion.
!  * The target identifies the tuple removed from the parent page (note that we
!  * remove this tuple's downlink and the *following* tuple's key).  Note that
!  * the leaf page is empty, so we don't need to store its content --- it is
!  * just reinitialized during recovery using the rest of the fields.
   */
! typedef struct xl_btree_mark_page_halfdead
  {
  	xl_btreetid target;			/* deleted tuple id in parent page */
! 	BlockNumber leafblk;		/* leaf block ultimately being deleted */
! 	BlockNumber leftblk;		/* leaf block's left sibling, if any */
! 	BlockNumber rightblk;		/* leaf block's right sibling */
! 	BlockNumber downlink;		/* next child down in the branch */
! } xl_btree_mark_page_halfdead;
! 
! #define SizeOfBtreeMarkPageHalfDead	(offsetof(xl_btree_mark_page_halfdead, leafblk) + sizeof(BlockNumber))
! 
! /*
!  * This is what we need to know about deletion of a btree page.  Note we do
!  * not store any content for the deleted page --- it is just rewritten as empty
!  * during recovery, apart from resetting the btpo.xact.
!  */
! typedef struct xl_btree_unlink_page
! {
! 	RelFileNode	node;
! 	BlockNumber deadblk;		/* target block being deleted */
! 	BlockNumber leftblk;		/* taregt block's left sibling, if any */
! 	BlockNumber rightblk;		/* target block's right sibling */
! 	BlockNumber	leafblk;		/* if target is aninternal page, the leaf of
! 								 * the branch */
! 	BlockNumber	downlink;		/* next child down in the branch */
  	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
! 	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
! } xl_btree_unlink_page;
  
! #define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
  
  /*
   * New root log record.  There are zero tuples if this is to establish an
