Index: nbtpage.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v
retrieving revision 1.89
diff -c -r1.89 nbtpage.c
*** nbtpage.c	6 Nov 2005 19:29:00 -0000	1.89
--- nbtpage.c	7 Nov 2005 22:53:41 -0000
***************
*** 205,223 ****
  		if (metad->btm_root != P_NONE)
  		{
  			/*
! 			 * Metadata initialized by someone else.  In order to guarantee no
! 			 * deadlocks, we have to release the metadata page and start all
! 			 * over again.	(Is that really true? But it's hardly worth trying
! 			 * to optimize this case.)
  			 */
  			_bt_relbuf(rel, metabuf);
  			return _bt_getroot(rel, access);
  		}
  
  		/*
! 		 * Get, initialize, write, and leave a lock of the appropriate type on
! 		 * the new root page.  Since this is the first page in the tree, it's
! 		 * a leaf as well as the root.
  		 */
  		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
  		rootblkno = BufferGetBlockNumber(rootbuf);
--- 205,223 ----
  		if (metad->btm_root != P_NONE)
  		{
  			/*
! 			 * Metadata initialized by someone else.  In order to guarantee
! 			 * no deadlocks, we have to release the metadata page and start
! 			 * all over again.	(Is that really true? But it's hardly worth
! 			 * trying to optimize this case.)
  			 */
  			_bt_relbuf(rel, metabuf);
  			return _bt_getroot(rel, access);
  		}
  
  		/*
! 		 * Get, initialize, write, and leave a lock of the appropriate type
! 		 * on the new root page.  Since this is the first page in the tree,
! 		 * it's a leaf as well as the root.
  		 */
  		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
  		rootblkno = BufferGetBlockNumber(rootbuf);
***************
*** 412,427 ****
  	Page		page = BufferGetPage(buf);
  
  	/*
! 	 * ReadBuffer verifies that every newly-read page passes PageHeaderIsValid,
! 	 * which means it either contains a reasonably sane page header or is
! 	 * all-zero.  We have to defend against the all-zero case, however.
  	 */
  	if (PageIsNew(page))
  		ereport(ERROR,
  				(errcode(ERRCODE_INDEX_CORRUPTED),
! 				 errmsg("index \"%s\" contains unexpected zero page at block %u",
! 						RelationGetRelationName(rel),
! 						BufferGetBlockNumber(buf)),
  				 errhint("Please REINDEX it.")));
  
  	/*
--- 412,428 ----
  	Page		page = BufferGetPage(buf);
  
  	/*
! 	 * ReadBuffer verifies that every newly-read page passes
! 	 * PageHeaderIsValid, which means it either contains a reasonably sane
! 	 * page header or is all-zero.	We have to defend against the all-zero
! 	 * case, however.
  	 */
  	if (PageIsNew(page))
  		ereport(ERROR,
  				(errcode(ERRCODE_INDEX_CORRUPTED),
! 			errmsg("index \"%s\" contains unexpected zero page at block %u",
! 				   RelationGetRelationName(rel),
! 				   BufferGetBlockNumber(buf)),
  				 errhint("Please REINDEX it.")));
  
  	/*
***************
*** 440,446 ****
  /*
   *	_bt_getbuf() -- Get a buffer by block number for read or write.
   *
!  *		blkno == P_NEW means to get an unallocated index page.  The page
   *		will be initialized before returning it.
   *
   *		When this routine returns, the appropriate lock is set on the
--- 441,447 ----
  /*
   *	_bt_getbuf() -- Get a buffer by block number for read or write.
   *
!  *		blkno == P_NEW means to get an unallocated index page.	The page
   *		will be initialized before returning it.
   *
   *		When this routine returns, the appropriate lock is set on the
***************
*** 480,495 ****
  		 * it, or even worse our own caller does, we could deadlock.  (The
  		 * own-caller scenario is actually not improbable. Consider an index
  		 * on a serial or timestamp column.  Nearly all splits will be at the
! 		 * rightmost page, so it's entirely likely that _bt_split will call us
! 		 * while holding a lock on the page most recently acquired from FSM.
! 		 * A VACUUM running concurrently with the previous split could well
! 		 * have placed that page back in FSM.)
  		 *
! 		 * To get around that, we ask for only a conditional lock on the reported
! 		 * page.  If we fail, then someone else is using the page, and we may
! 		 * reasonably assume it's not free.  (If we happen to be wrong, the
! 		 * worst consequence is the page will be lost to use till the next
! 		 * VACUUM, which is no big problem.)
  		 */
  		for (;;)
  		{
--- 481,496 ----
  		 * it, or even worse our own caller does, we could deadlock.  (The
  		 * own-caller scenario is actually not improbable. Consider an index
  		 * on a serial or timestamp column.  Nearly all splits will be at the
! 		 * rightmost page, so it's entirely likely that _bt_split will call
! 		 * us while holding a lock on the page most recently acquired from
! 		 * FSM. A VACUUM running concurrently with the previous split could
! 		 * well have placed that page back in FSM.)
  		 *
! 		 * To get around that, we ask for only a conditional lock on the
! 		 * reported page.  If we fail, then someone else is using the page,
! 		 * and we may reasonably assume it's not free.  (If we happen to be
! 		 * wrong, the worst consequence is the page will be lost to use till
! 		 * the next VACUUM, which is no big problem.)
  		 */
  		for (;;)
  		{
***************
*** 649,658 ****
  	BTPageOpaque opaque;
  
  	/*
! 	 * It's possible to find an all-zeroes page in an index --- for example, a
! 	 * backend might successfully extend the relation one page and then crash
! 	 * before it is able to make a WAL entry for adding the page. If we find a
! 	 * zeroed page then reclaim it.
  	 */
  	if (PageIsNew(page))
  		return true;
--- 650,659 ----
  	BTPageOpaque opaque;
  
  	/*
! 	 * It's possible to find an all-zeroes page in an index --- for example,
! 	 * a backend might successfully extend the relation one page and then
! 	 * crash before it is able to make a WAL entry for adding the page. If we
! 	 * find a zeroed page then reclaim it.
  	 */
  	if (PageIsNew(page))
  		return true;
***************
*** 782,789 ****
  	BTPageOpaque opaque;
  
  	/*
! 	 * We can never delete rightmost pages nor root pages.	While at it, check
! 	 * that page is not already deleted and is empty.
  	 */
  	page = BufferGetPage(buf);
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
--- 783,790 ----
  	BTPageOpaque opaque;
  
  	/*
! 	 * We can never delete rightmost pages nor root pages.	While at it,
! 	 * check that page is not already deleted and is empty.
  	 */
  	page = BufferGetPage(buf);
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
***************
*** 808,815 ****
  	 * We need to get an approximate pointer to the page's parent page. Use
  	 * the standard search mechanism to search for the page's high key; this
  	 * will give us a link to either the current parent or someplace to its
! 	 * left (if there are multiple equal high keys).  To avoid deadlocks, we'd
! 	 * better drop the target page lock first.
  	 */
  	_bt_relbuf(rel, buf);
  	/* we need a scan key to do our search, so build one */
--- 809,816 ----
  	 * We need to get an approximate pointer to the page's parent page. Use
  	 * the standard search mechanism to search for the page's high key; this
  	 * will give us a link to either the current parent or someplace to its
! 	 * left (if there are multiple equal high keys).  To avoid deadlocks,
! 	 * we'd better drop the target page lock first.
  	 */
  	_bt_relbuf(rel, buf);
  	/* we need a scan key to do our search, so build one */
***************
*** 843,851 ****
  	 * page.  The sibling that was current a moment ago could have split, so
  	 * we may have to move right.  This search could fail if either the
  	 * sibling or the target page was deleted by someone else meanwhile; if
! 	 * so, give up.  (Right now, that should never happen, since page deletion
! 	 * is only done in VACUUM and there shouldn't be multiple VACUUMs
! 	 * concurrently on the same table.)
  	 */
  	if (leftsib != P_NONE)
  	{
--- 844,852 ----
  	 * page.  The sibling that was current a moment ago could have split, so
  	 * we may have to move right.  This search could fail if either the
  	 * sibling or the target page was deleted by someone else meanwhile; if
! 	 * so, give up.  (Right now, that should never happen, since page
! 	 * deletion is only done in VACUUM and there shouldn't be multiple
! 	 * VACUUMs concurrently on the same table.)
  	 */
  	if (leftsib != P_NONE)
  	{
***************
*** 872,880 ****
  		lbuf = InvalidBuffer;
  
  	/*
! 	 * Next write-lock the target page itself.	It should be okay to take just
! 	 * a write lock not a superexclusive lock, since no scans would stop on an
! 	 * empty page.
  	 */
  	buf = _bt_getbuf(rel, target, BT_WRITE);
  	page = BufferGetPage(buf);
--- 873,881 ----
  		lbuf = InvalidBuffer;
  
  	/*
! 	 * Next write-lock the target page itself.	It should be okay to take
! 	 * just a write lock not a superexclusive lock, since no scans would stop
! 	 * on an empty page.
  	 */
  	buf = _bt_getbuf(rel, target, BT_WRITE);
  	page = BufferGetPage(buf);
***************
*** 904,911 ****
  	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
  
  	/*
! 	 * Next find and write-lock the current parent of the target page. This is
! 	 * essentially the same as the corresponding step of splitting.
  	 */
  	ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
  				   target, P_HIKEY);
--- 905,912 ----
  	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
  
  	/*
! 	 * Next find and write-lock the current parent of the target page. This
! 	 * is essentially the same as the corresponding step of splitting.
  	 */
  	ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
  				   target, P_HIKEY);
***************
*** 949,957 ****
  
  	/*
  	 * If we are deleting the next-to-last page on the target's level, then
! 	 * the rightsib is a candidate to become the new fast root. (In theory, it
! 	 * might be possible to push the fast root even further down, but the odds
! 	 * of doing so are slim, and the locking considerations daunting.)
  	 *
  	 * We can safely acquire a lock on the metapage here --- see comments for
  	 * _bt_newroot().
--- 950,958 ----
  
  	/*
  	 * If we are deleting the next-to-last page on the target's level, then
! 	 * the rightsib is a candidate to become the new fast root. (In theory,
! 	 * it might be possible to push the fast root even further down, but the
! 	 * odds of doing so are slim, and the locking considerations daunting.)
  	 *
  	 * We can safely acquire a lock on the metapage here --- see comments for
  	 * _bt_newroot().
***************
*** 992,999 ****
  	/*
  	 * Update parent.  The normal case is a tad tricky because we want to
  	 * delete the target's downlink and the *following* key.  Easiest way is
! 	 * to copy the right sibling's downlink over the target downlink, and then
! 	 * delete the following item.
  	 */
  	page = BufferGetPage(pbuf);
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
--- 993,1000 ----
  	/*
  	 * Update parent.  The normal case is a tad tricky because we want to
  	 * delete the target's downlink and the *following* key.  Easiest way is
! 	 * to copy the right sibling's downlink over the target downlink, and
! 	 * then delete the following item.
  	 */
  	page = BufferGetPage(pbuf);
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
***************
*** 1154,1162 ****
  
  	/*
  	 * If parent became half dead, recurse to try to delete it. Otherwise, if
! 	 * right sibling is empty and is now the last child of the parent, recurse
! 	 * to try to delete it.  (These cases cannot apply at the same time,
! 	 * though the second case might itself recurse to the first.)
  	 */
  	if (parent_half_dead)
  	{
--- 1155,1163 ----
  
  	/*
  	 * If parent became half dead, recurse to try to delete it. Otherwise, if
! 	 * right sibling is empty and is now the last child of the parent,
! 	 * recurse to try to delete it.  (These cases cannot apply at the same
! 	 * time, though the second case might itself recurse to the first.)
  	 */
  	if (parent_half_dead)
  	{
