Reuse the dead item on unique index.

Started by Atsushi Ogawaover 20 years ago3 messages
#1Atsushi Ogawa
atsushi.ogawa@gmail.com
1 attachment(s)

When _bt_check_unique finds a dead item that has same data as new
item, LP_DEAD is set to the item. Can we reuse this dead item instead
of inserting new item? If it is possible, the growth of btree index
can be reduced.

I think it is effective in the table like branches table of pgbench to
which the same item is updated many times.

An attached patch is test implementation of this idea. This patch is
not complete, but it might be useful to show this idea.

regards,

--- Atsushi Ogawa

Attachments:

uniq_index.patchtext/x-patch; charset=us-ascii; name=uniq_index.patchDownload
*** ./src/backend/access/nbtree/nbtinsert.c.orig	2005-10-08 09:57:33.316123472 +0900
--- ./src/backend/access/nbtree/nbtinsert.c	2005-10-08 11:44:39.725160800 +0900
***************
*** 40,46 ****
  
  static TransactionId _bt_check_unique(Relation rel, BTItem btitem,
  				 Relation heapRel, Buffer buf,
! 				 ScanKey itup_scankey);
  static void _bt_insertonpg(Relation rel, Buffer buf,
  			   BTStack stack,
  			   int keysz, ScanKey scankey,
--- 40,46 ----
  
  static TransactionId _bt_check_unique(Relation rel, BTItem btitem,
  				 Relation heapRel, Buffer buf,
! 				 ScanKey itup_scankey, OffsetNumber *reuse_off);
  static void _bt_insertonpg(Relation rel, Buffer buf,
  			   BTStack stack,
  			   int keysz, ScanKey scankey,
***************
*** 120,128 ****
  	 */
  	if (index_is_unique)
  	{
! 		TransactionId xwait;
  
! 		xwait = _bt_check_unique(rel, btitem, heapRel, buf, itup_scankey);
  
  		if (TransactionIdIsValid(xwait))
  		{
--- 120,130 ----
  	 */
  	if (index_is_unique)
  	{
! 		TransactionId	xwait;
! 		OffsetNumber	reuse_off = InvalidOffsetNumber;
  
! 		xwait = _bt_check_unique(rel, btitem, heapRel, buf, itup_scankey,
! 								 &reuse_off);
  
  		if (TransactionIdIsValid(xwait))
  		{
***************
*** 133,138 ****
--- 135,168 ----
  			_bt_freestack(stack);
  			goto top;
  		}
+ 
+ 		/*
+ 		 * When the reusable item was found, change TID of this item to
+ 		 * new value instead of executing _bt_insertonpg.
+ 		 */
+ 		if (reuse_off != InvalidOffsetNumber)
+ 		{
+ 			Page		page = BufferGetPage(buf);
+ 			ItemId		reuse_itemid = PageGetItemId(page, reuse_off);
+ 			BTItem		reuse_btitem = (BTItem)PageGetItem(page, reuse_itemid);
+ 
+ 			Assert(ItemIdDeleted(reuse_itemid));
+ 			Assert(_bt_isequal(RelationGetDescr(rel), page, reuse_off,
+ 							   natts, itup_scankey));
+ 
+ 			reuse_itemid->lp_flags &= ~LP_DELETE;
+ 			reuse_btitem->bti_itup.t_tid = btitem->bti_itup.t_tid;
+ 
+ 			_bt_wrtbuf(rel, buf);
+ 
+ 			/* be tidy */
+ 			_bt_freestack(stack);
+ 			_bt_freeskey(itup_scankey);
+ 
+ 			/* XXX: WAL stuff is necessary */
+ 
+ 			return;
+ 		}
  	}
  
  	/* do the insertion */
***************
*** 152,158 ****
   */
  static TransactionId
  _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
! 				 Buffer buf, ScanKey itup_scankey)
  {
  	TupleDesc	itupdesc = RelationGetDescr(rel);
  	int			natts = rel->rd_rel->relnatts;
--- 182,188 ----
   */
  static TransactionId
  _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
! 				 Buffer buf, ScanKey itup_scankey, OffsetNumber *reuse_off)
  {
  	TupleDesc	itupdesc = RelationGetDescr(rel);
  	int			natts = rel->rd_rel->relnatts;
***************
*** 162,167 ****
--- 192,199 ----
  	BTPageOpaque opaque;
  	Buffer		nbuf = InvalidBuffer;
  
+ 	Assert(*reuse_off == InvalidOffsetNumber);
+ 
  	page = BufferGetPage(buf);
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  	maxoff = PageGetMaxOffsetNumber(page);
***************
*** 262,272 ****
--- 294,330 ----
  					{
  						curitemid->lp_flags |= LP_DELETE;
  						SetBufferCommitInfoNeedsSave(buf);
+ 
+ 						if (*reuse_off == InvalidOffsetNumber &&
+ 							page == BufferGetPage(buf))
+ 						{
+ 							/*
+ 							 * The reusable item was found on the current
+ 							 * write locking page. 
+ 							 */
+ 							*reuse_off = offset;
+ 						}
  					}
  					LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK);
  				}
  				ReleaseBuffer(hbuffer);
  			}
+ 			else
+ 			{
+ 				if (*reuse_off == InvalidOffsetNumber &&
+ 					page == BufferGetPage(buf))
+ 				{
+ 					if (!_bt_isequal(itupdesc, page, offset, natts,
+ 									 itup_scankey))
+ 						break;		/* we're past all the equal tuples */
+ 
+ 					/*
+ 					 * The reusable item was found on the current
+ 					 * write locking page. 
+ 					 */
+ 					*reuse_off = offset;
+ 				}
+ 			}
  		}
  
  		/*
#2Tom Lane
tgl@sss.pgh.pa.us
In reply to: Atsushi Ogawa (#1)
Re: Reuse the dead item on unique index.

Atsushi Ogawa <atsushi.ogawa@gmail.com> writes:

When _bt_check_unique finds a dead item that has same data as new
item, LP_DEAD is set to the item. Can we reuse this dead item instead
of inserting new item?

This strikes me as a pretty bad idea for the same reason pointed out
recently in other threads: the notion of equality embodied in a btree
opclass' equals function may have little or nothing to do with true
identity. So your assumption that it's the "same" data is faulty.

Also, I'm dubious about the assumption that "can be marked LP_DELETED"
is the same as "can be physically removed right now". The side-effects
on indexscans happening concurrently with yours could be bad. At the
very least you'd need to obtain super-exclusive lock (cf btbulkdelete)
before doing the replacement.

regards, tom lane

#3Atsushi Ogawa
atsushi.ogawa@gmail.com
In reply to: Tom Lane (#2)
Re: Reuse the dead item on unique index.

Tom Lane <tgl@sss.pgh.pa.us>:

Atsushi Ogawa <atsushi.ogawa@gmail.com> writes:

When _bt_check_unique finds a dead item that has same data as new
item, LP_DEAD is set to the item. Can we reuse this dead item instead
of inserting new item?

This strikes me as a pretty bad idea for the same reason pointed out
recently in other threads: the notion of equality embodied in a btree
opclass' equals function may have little or nothing to do with true
identity. So your assumption that it's the "same" data is faulty.

Thanks, I understand the problem.
When the size of new item and dead item is the equal, the new item can
be overwrited at the position of the dead item.

Also, I'm dubious about the assumption that "can be marked LP_DELETED"
is the same as "can be physically removed right now". The side-effects
on indexscans happening concurrently with yours could be bad. At the
very least you'd need to obtain super-exclusive lock (cf btbulkdelete)
before doing the replacement.

I agree. I will add code that checks the refcount of buffer. If refcount
is 1, current process has super-exclusive lock, and we can overwrite the
dead item. If refcount > 1, I use _bt_insertonpg.

regards,

--- Atsushi Ogawa