2Q implementaion for PostgreSQL buffer replacement.

Started by Yutaka tanidaover 22 years ago5 messages
#1Yutaka tanida
yutaka@nonsensecorner.com
3 attachment(s)

Hi.

I implement 2Q algorithm to PostgreSQL for buffer management , instead
of LRU.
It's known as low overhead and high performance than LRU. If you have
some interests , see following URL.

http://www.vldb.org/conf/1994/P439.PDF

In my test (pgbench -S) , it improves 4% cache hit rate and 2% up
performance comparing from LRU.

Do you have any interest about this patch?

--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/

Attachments:

qq_1.patchapplication/octet-stream; name=qq_1.patchDownload
*** postgresql-7.3.2/src/include/storage/buf_internals.h	Thu Sep  5 05:31:45 2002
--- /tmp/postgresql-7.3.2.tar/postgresql-7.3.2/src/include/storage/buf_internals.h	Tue Jun 17 13:09:24 2003
***************
*** 80,85 ****
--- 80,87 ----
  {
  	Buffer		freeNext;		/* links for freelist chain */
  	Buffer		freePrev;
+   int         freeState;       /* used for any buffer replacement strategy*/
+   
  	SHMEM_OFFSET data;			/* pointer to data in buf pool */
  
  	/* tag and id must be together for table lookup (still true?) */
buf_init.c.diffapplication/octet-stream; name=buf_init.c.diffDownload
*** buf_init.c.orig	Thu Sep 26 05:31:40 2002
--- buf_init.c	Wed Jun 18 14:29:08 2003
***************
*** 35,40 ****
--- 35,42 ----
  #include "utils/memutils.h"
  
  
+ #define FREELIST_QUEUE_SIZE 3
+ 
  /*
   *	if BMTRACE is defined, we trace the last 200 buffer allocations and
   *	deallocations in a circular buffer in shared memory.
***************
*** 50,55 ****
--- 52,58 ----
  int			Data_Descriptors;
  int			Free_List_Descriptor;
  int			Lookup_List_Descriptor;
+ int         AmQueue_List_Descriptor;
  int			Num_Descriptors;
  
  BufferDesc *BufferDescriptors;
***************
*** 134,141 ****
  
  	Data_Descriptors = NBuffers;
  	Free_List_Descriptor = Data_Descriptors;
! 	Lookup_List_Descriptor = Data_Descriptors + 1;
! 	Num_Descriptors = Data_Descriptors + 1;
  
  	/*
  	 * It's probably not really necessary to grab the lock --- if there's
--- 137,145 ----
  
  	Data_Descriptors = NBuffers;
  	Free_List_Descriptor = Data_Descriptors;
! 	AmQueue_List_Descriptor = Data_Descriptors+1;
! 	Lookup_List_Descriptor = Data_Descriptors +2;
! 	Num_Descriptors = Data_Descriptors + FREELIST_QUEUE_SIZE+1;
  
  	/*
  	 * It's probably not really necessary to grab the lock --- if there's
***************
*** 186,197 ****
  
  			buf->freeNext = i + 1;
  			buf->freePrev = i - 1;
! 
  			CLEAR_BUFFERTAG(&(buf->tag));
  			buf->buf_id = i;
  
  			buf->data = MAKE_OFFSET(block);
  			buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
  			buf->refcount = 0;
  			buf->io_in_progress_lock = LWLockAssign();
  			buf->cntx_lock = LWLockAssign();
--- 190,202 ----
  
  			buf->freeNext = i + 1;
  			buf->freePrev = i - 1;
! 			buf->freeState=-1; 
  			CLEAR_BUFFERTAG(&(buf->tag));
  			buf->buf_id = i;
  
  			buf->data = MAKE_OFFSET(block);
  			buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
+ 			
  			buf->refcount = 0;
  			buf->io_in_progress_lock = LWLockAssign();
  			buf->cntx_lock = LWLockAssign();
***************
*** 258,264 ****
  	size += hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
  
  	/* size of buffer descriptors */
! 	size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc));
  
  	/* size of data pages */
  	size += NBuffers * MAXALIGN(BLCKSZ);
--- 263,269 ----
  	size += hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
  
  	/* size of buffer descriptors */
! 	size += MAXALIGN((NBuffers + FREELIST_QUEUE_SIZE) * sizeof(BufferDesc));
  
  	/* size of data pages */
  	size += NBuffers * MAXALIGN(BLCKSZ);
***************
*** 272,274 ****
--- 277,289 ----
  
  	return size;
  }
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
freelist.c.diffapplication/octet-stream; name=freelist.c.diffDownload
*** freelist.c.orig	Fri Jun 21 05:29:34 2002
--- freelist.c	Wed Jun 18 16:16:28 2003
***************
*** 32,58 ****
  #include "storage/ipc.h"
  #include "storage/proc.h"
  
  
! static BufferDesc *SharedFreeList;
! 
  /*
   * State-checking macros
   */
  
  #define IsInQueue(bf) \
  ( \
! 	AssertMacro((bf->freeNext != INVALID_DESCRIPTOR)), \
! 	AssertMacro((bf->freePrev != INVALID_DESCRIPTOR)), \
  	AssertMacro((bf->flags & BM_FREE)) \
  )
  
  #define IsNotInQueue(bf) \
  ( \
! 	AssertMacro((bf->freeNext == INVALID_DESCRIPTOR)), \
! 	AssertMacro((bf->freePrev == INVALID_DESCRIPTOR)), \
  	AssertMacro(! (bf->flags & BM_FREE)) \
  )
  
  
  /*
   * AddBufferToFreelist
--- 32,64 ----
  #include "storage/ipc.h"
  #include "storage/proc.h"
  
+ extern long int AmQueue_List_Descriptor;
+ extern long int A1outQueue_List_Descriptor;
  
! static BufferDesc *A1QueueFreeList;
! static BufferDesc *A1OutQueueFreeList;
! static BufferDesc *AmQueueFreeList;
  /*
   * State-checking macros
   */
  
  #define IsInQueue(bf) \
  ( \
!  	AssertMacro((bf->freeNext != INVALID_DESCRIPTOR)), \
!  	AssertMacro((bf->freePrev != INVALID_DESCRIPTOR)), \
  	AssertMacro((bf->flags & BM_FREE)) \
  )
  
  #define IsNotInQueue(bf) \
  ( \
!  	AssertMacro((bf->freeNext == INVALID_DESCRIPTOR)), \
!  	AssertMacro((bf->freePrev == INVALID_DESCRIPTOR)), \
  	AssertMacro(! (bf->flags & BM_FREE)) \
  )
  
+ static int addAm=0;
+ static int addAll=0;
+ static int addA1=0;
  
  /*
   * AddBufferToFreelist
***************
*** 65,82 ****
  static void
  AddBufferToFreelist(BufferDesc *bf)
  {
  #ifdef BMTRACE
  	_bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
  			  BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
  #endif   /* BMTRACE */
  	IsNotInQueue(bf);
  
- 	/* change bf so it points to inFrontOfNew and its successor */
- 	bf->freePrev = SharedFreeList->freePrev;
- 	bf->freeNext = Free_List_Descriptor;
  
! 	/* insert new into chain */
! 	BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
  	BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
  }
  
--- 71,132 ----
  static void
  AddBufferToFreelist(BufferDesc *bf)
  {
+   BufferDesc *buf;
  #ifdef BMTRACE
  	_bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
  			  BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
  #endif   /* BMTRACE */
  	IsNotInQueue(bf);
+ 	/* in a1in queue or not in  */
+ 	addAll++;
+ 	if(bf->freeState == -1 ) { /* PUT INTO IN A1IN_QUEUE */
+ 	  /* change bf so it points to inFrontOfNew and its successor */
+ 	/* insert new into chain */
+ 	  bf->freePrev = A1QueueFreeList->freePrev;
+ 	  bf->freeNext = Free_List_Descriptor;
  
  
! 	  A1QueueFreeList->freeState++;
! 	  addA1++;
! 	  bf->freeState=0; /* in A1 queue */
!     }else  { /* was in a1out queue or am queue*/
! 	  /* change bf so it points to inFrontOfNew and its successor */
! 	  /* add to Am Queue*/
! 	  bf->freePrev = AmQueueFreeList->freePrev;
! 	  bf->freeNext = AmQueue_List_Descriptor;
! 
! 	  AmQueueFreeList->freeState++;
! 	  addAm++;
! 		/* if there are so many entry in Am queue ,moving am queue to a1 queue */
!       if (AmQueueFreeList->freeState >= A1QueueFreeList->freeState) {
! 
! 		/* determine buffer removing from Am queue */
! 
!  	    buf=&(BufferDescriptors[AmQueueFreeList->freeNext]);
! 
!         /* remove from Am */
! 	    BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
! 	    BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
! 	    AmQueueFreeList->freeState--;
! 
! 
! 	    /* add to A1*/
! 		
! 	    buf->freePrev=A1QueueFreeList->freePrev;
! 	    buf->freeNext=Free_List_Descriptor;
! 	    BufferDescriptors[buf->freeNext].freePrev = buf->buf_id;
! 	    BufferDescriptors[buf->freePrev].freeNext = buf->buf_id;
! 	    A1QueueFreeList->freeState++;
! 		
! 		buf->freeState=0; /* in A1 queue */
! 	  }
! 	  bf->freeState=1; /* in Am queue */
! 	}
! 	if(addAll%256==0) {
! 	  //elog(NOTICE,"all %d  A1 : %d  Am : %d",addAll,addA1,addAm);
! 	}
! 
!     BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
  	BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
  }
  
***************
*** 98,107 ****
  		IsInQueue(buf);
  
  		/* remove from freelist queue */
  		BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
  		BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
  		buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
! 
  		/* mark buffer as no longer free */
  		buf->flags &= ~BM_FREE;
  	}
--- 148,163 ----
  		IsInQueue(buf);
  
  		/* remove from freelist queue */
+ 		
  		BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
  		BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
+ 		
  		buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
! 		if(buf->freeState==0) {
! 		  A1QueueFreeList->freeState--;
! 		}else if(buf->freeState==1) {
! 		  AmQueueFreeList->freeState--;
! 		}
  		/* mark buffer as no longer free */
  		buf->flags &= ~BM_FREE;
  	}
***************
*** 187,215 ****
  }
  #endif
  
  /*
   * GetFreeBuffer() -- get the 'next' buffer from the freelist.
   */
  BufferDesc *
  GetFreeBuffer(void)
  {
! 	BufferDesc *buf;
! 
! 	if (Free_List_Descriptor == SharedFreeList->freeNext)
! 	{
! 		/* queue is empty. All buffers in the buffer pool are pinned. */
! 		elog(ERROR, "out of free buffers: time to abort!");
! 		return NULL;
  	}
- 	buf = &(BufferDescriptors[SharedFreeList->freeNext]);
  
! 	/* remove from freelist queue */
  	BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
  	BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
  	buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
  
  	buf->flags &= ~(BM_FREE);
  
  	return buf;
  }
  
--- 243,284 ----
  }
  #endif
  
+ 
  /*
   * GetFreeBuffer() -- get the 'next' buffer from the freelist.
   */
  BufferDesc *
  GetFreeBuffer(void)
  {
!   BufferDesc *buf;
! 	if (Free_List_Descriptor != A1QueueFreeList->freeNext) {
! 	  /* A1 queue is not empty. get from it.*/
! 	  buf = &(BufferDescriptors[A1QueueFreeList->freeNext]);
! 	  A1QueueFreeList->freeState--;
! 	}else if (AmQueue_List_Descriptor != AmQueueFreeList->freeNext) {
! 	  /* A1 queue is empty but Am queue is not empty get from it */
! 	  buf = &(BufferDescriptors[AmQueueFreeList->freeNext]);
! 	  AmQueueFreeList->freeState--;
! 	}else if (Free_List_Descriptor == A1QueueFreeList->freeNext  &&
! 			  AmQueue_List_Descriptor != AmQueueFreeList->freeNext) {
! 	  /* all queue is empty. All buffers in the buffer pool are pinned. */
! 	  elog(ERROR, "out of free buffers: time to abort!");
! 	  return NULL;
! 	}else {
! 	  elog(ERROR, "will never occur");
! 	  return NULL;
  	}
  
! 	  /* remove from freelist queue */
  	BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
  	BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
  	buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
  
  	buf->flags &= ~(BM_FREE);
  
+ 
+ 
+     buf->freeState=-1; /* free */
  	return buf;
  }
  
***************
*** 224,245 ****
  void
  InitFreeList(bool init)
  {
! 	SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]);
  
  	if (init)
  	{
  		/* we only do this once, normally in the postmaster */
! 		SharedFreeList->data = INVALID_OFFSET;
! 		SharedFreeList->flags = 0;
! 		SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
! 		SharedFreeList->buf_id = Free_List_Descriptor;
  
  		/* insert it into a random spot in the circular queue */
! 		SharedFreeList->freeNext = BufferDescriptors[0].freeNext;
! 		SharedFreeList->freePrev = 0;
! 		BufferDescriptors[SharedFreeList->freeNext].freePrev =
! 			BufferDescriptors[SharedFreeList->freePrev].freeNext =
  			Free_List_Descriptor;
  	}
  }
  
--- 293,330 ----
  void
  InitFreeList(bool init)
  {
!   Buffer i;
! 	A1QueueFreeList = &(BufferDescriptors[Free_List_Descriptor]);
! 	AmQueueFreeList = &(BufferDescriptors[AmQueue_List_Descriptor]);
  
  	if (init)
  	{
  		/* we only do this once, normally in the postmaster */
! 		A1QueueFreeList->data = INVALID_OFFSET;
! 		A1QueueFreeList->flags = 0;
! 		A1QueueFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
! 		A1QueueFreeList->buf_id = Free_List_Descriptor;
  
  		/* insert it into a random spot in the circular queue */
! 		A1QueueFreeList->freeNext = BufferDescriptors[0].freeNext;
! 		A1QueueFreeList->freePrev = 0;
! 		//		A1QueueFreeList->freeState= ;
! 		BufferDescriptors[A1QueueFreeList->freeNext].freePrev =
! 			BufferDescriptors[A1QueueFreeList->freePrev].freeNext =
  			Free_List_Descriptor;
+ 		for(i=A1QueueFreeList->freeNext;i!=Free_List_Descriptor;i=BufferDescriptors[i].freeNext) {
+ 		  A1QueueFreeList->freeState++;
+ 		}
+ 
+ 		AmQueueFreeList->data= INVALID_OFFSET;
+ 		AmQueueFreeList->flags=0;
+ 		AmQueueFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
+ 		AmQueueFreeList->buf_id = AmQueue_List_Descriptor;
+ 
+ 		/* Am Queue was initilized as empty.*/
+ 		AmQueueFreeList->freeNext = AmQueue_List_Descriptor;
+ 		AmQueueFreeList->freePrev = AmQueue_List_Descriptor;
+ 		AmQueueFreeList->freeState=0;
  	}
  }
  
***************
*** 254,265 ****
  	int			i;
  	BufferDesc *buf;
  
! 	buf = &(BufferDescriptors[SharedFreeList->freeNext]);
  	for (i = 0; i < nfree; i++, buf = &(BufferDescriptors[buf->freeNext]))
  	{
  		if (!(buf->flags & (BM_FREE)))
  		{
! 			if (buf != SharedFreeList)
  				printf("\tfree list corrupted: %d flags %x\n",
  					   buf->buf_id, buf->flags);
  			else
--- 339,350 ----
  	int			i;
  	BufferDesc *buf;
  
! 	buf = &(BufferDescriptors[A1QueueFreeList->freeNext]);
  	for (i = 0; i < nfree; i++, buf = &(BufferDescriptors[buf->freeNext]))
  	{
  		if (!(buf->flags & (BM_FREE)))
  		{
! 			if (buf != A1QueueFreeList)
  				printf("\tfree list corrupted: %d flags %x\n",
  					   buf->buf_id, buf->flags);
  			else
***************
*** 271,277 ****
  			printf("\tfree list links corrupted: %d %ld %ld\n",
  				   buf->buf_id, buf->freePrev, buf->freeNext);
  	}
! 	if (buf != SharedFreeList)
  		printf("\tfree list corrupted: %d-th buffer is %d\n",
  			   nfree, buf->buf_id);
  }
--- 356,362 ----
  			printf("\tfree list links corrupted: %d %ld %ld\n",
  				   buf->buf_id, buf->freePrev, buf->freeNext);
  	}
! 	if (buf != A1QueueFreeList)
  		printf("\tfree list corrupted: %d-th buffer is %d\n",
  			   nfree, buf->buf_id);
  }
***************
*** 287,299 ****
  {
  	BufferDesc *buf;
  
! 	if (SharedFreeList->freeNext == Free_List_Descriptor)
  	{
  		printf("free list is empty.\n");
  		return;
  	}
  
! 	buf = &(BufferDescriptors[SharedFreeList->freeNext]);
  	for (;;)
  	{
  		int			i = (buf - BufferDescriptors);
--- 372,384 ----
  {
  	BufferDesc *buf;
  
! 	if (A1QueueFreeList->freeNext == Free_List_Descriptor)
  	{
  		printf("free list is empty.\n");
  		return;
  	}
  
! 	buf = &(BufferDescriptors[A1QueueFreeList->freeNext]);
  	for (;;)
  	{
  		int			i = (buf - BufferDescriptors);
#2Bruce Momjian
pgman@candle.pha.pa.us
In reply to: Yutaka tanida (#1)
Re: 2Q implementaion for PostgreSQL buffer replacement.

Looks good to me --- we will include it in 7.4.

Your patch has been added to the PostgreSQL unapplied patches list at:

http://momjian.postgresql.org/cgi-bin/pgpatches

I will try to apply it within the next 48 hours.

---------------------------------------------------------------------------

Yutaka tanida wrote:

Hi.

I implement 2Q algorithm to PostgreSQL for buffer management , instead
of LRU.
It's known as low overhead and high performance than LRU. If you have
some interests , see following URL.

http://www.vldb.org/conf/1994/P439.PDF

In my test (pgbench -S) , it improves 4% cache hit rate and 2% up
performance comparing from LRU.

Do you have any interest about this patch?

--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/

[ Attachment, skipping... ]

[ Attachment, skipping... ]

[ Attachment, skipping... ]

---------------------------(end of broadcast)---------------------------
TIP 4: Don't 'kill -9' the postmaster

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
#3Yutaka tanida
yutaka@nonsensecorner.com
In reply to: Bruce Momjian (#2)
Re: 2Q implementaion for PostgreSQL buffer replacement.

On Mon, 23 Jun 2003 23:49:17 -0400 (EDT)
Bruce Momjian <pgman@candle.pha.pa.us> wrote:

Looks good to me --- we will include it in 7.4.

Thanks.But please note it is not completed yet. I must implement more ,
and move configurable parameter to postgresql.conf file.

--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/

#4Bruce Momjian
pgman@candle.pha.pa.us
In reply to: Yutaka tanida (#3)
Re: 2Q implementaion for PostgreSQL buffer replacement.

OK, thanks. I will remove it from the queue, and someone suggested a
different algorithm today:

I was researching on cache replacement strategy as well. 2Q has one
disadvantage see this exellent paper:
http://www.almaden.ibm.com/cs/people/dmodha/#ARC see the paper
"ARC: A Self-Tuning, Low Overhead Replacement Cache" for theory and "One
Up on LRU" for implementation details. ARC requires no tuning and can
switch fast between chaging patterns. Best of all is it is resistant to a
"sequential scan" pattern. and i think it's even easier to implement then
2q :)

---------------------------------------------------------------------------

Yutaka tanida wrote:

On Mon, 23 Jun 2003 23:49:17 -0400 (EDT)
Bruce Momjian <pgman@candle.pha.pa.us> wrote:

Looks good to me --- we will include it in 7.4.

Thanks.But please note it is not completed yet. I must implement more ,
and move configurable parameter to postgresql.conf file.

--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/

---------------------------(end of broadcast)---------------------------
TIP 9: the planner will ignore your desire to choose an index scan if your
joining column's datatypes do not match

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
#5Bruce Momjian
pgman@candle.pha.pa.us
In reply to: Yutaka tanida (#1)
Re: 2Q implementaion for PostgreSQL buffer replacement.

Patch removed at author's request.

---------------------------------------------------------------------------

Yutaka tanida wrote:

Hi.

I implement 2Q algorithm to PostgreSQL for buffer management , instead
of LRU.
It's known as low overhead and high performance than LRU. If you have
some interests , see following URL.

http://www.vldb.org/conf/1994/P439.PDF

In my test (pgbench -S) , it improves 4% cache hit rate and 2% up
performance comparing from LRU.

Do you have any interest about this patch?

--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/

[ Attachment, skipping... ]

[ Attachment, skipping... ]

[ Attachment, skipping... ]

---------------------------(end of broadcast)---------------------------
TIP 4: Don't 'kill -9' the postmaster

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073