2Q implementaion for PostgreSQL buffer replacement.
Hi.
I implement 2Q algorithm to PostgreSQL for buffer management , instead
of LRU.
It's known as low overhead and high performance than LRU. If you have
some interests , see following URL.
http://www.vldb.org/conf/1994/P439.PDF
In my test (pgbench -S) , it improves 4% cache hit rate and 2% up
performance comparing from LRU.
Do you have any interest about this patch?
--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/
Attachments:
qq_1.patchapplication/octet-stream; name=qq_1.patchDownload
*** postgresql-7.3.2/src/include/storage/buf_internals.h Thu Sep 5 05:31:45 2002
--- /tmp/postgresql-7.3.2.tar/postgresql-7.3.2/src/include/storage/buf_internals.h Tue Jun 17 13:09:24 2003
***************
*** 80,85 ****
--- 80,87 ----
{
Buffer freeNext; /* links for freelist chain */
Buffer freePrev;
+ int freeState; /* used for any buffer replacement strategy*/
+
SHMEM_OFFSET data; /* pointer to data in buf pool */
/* tag and id must be together for table lookup (still true?) */
buf_init.c.diffapplication/octet-stream; name=buf_init.c.diffDownload
*** buf_init.c.orig Thu Sep 26 05:31:40 2002
--- buf_init.c Wed Jun 18 14:29:08 2003
***************
*** 35,40 ****
--- 35,42 ----
#include "utils/memutils.h"
+ #define FREELIST_QUEUE_SIZE 3
+
/*
* if BMTRACE is defined, we trace the last 200 buffer allocations and
* deallocations in a circular buffer in shared memory.
***************
*** 50,55 ****
--- 52,58 ----
int Data_Descriptors;
int Free_List_Descriptor;
int Lookup_List_Descriptor;
+ int AmQueue_List_Descriptor;
int Num_Descriptors;
BufferDesc *BufferDescriptors;
***************
*** 134,141 ****
Data_Descriptors = NBuffers;
Free_List_Descriptor = Data_Descriptors;
! Lookup_List_Descriptor = Data_Descriptors + 1;
! Num_Descriptors = Data_Descriptors + 1;
/*
* It's probably not really necessary to grab the lock --- if there's
--- 137,145 ----
Data_Descriptors = NBuffers;
Free_List_Descriptor = Data_Descriptors;
! AmQueue_List_Descriptor = Data_Descriptors+1;
! Lookup_List_Descriptor = Data_Descriptors +2;
! Num_Descriptors = Data_Descriptors + FREELIST_QUEUE_SIZE+1;
/*
* It's probably not really necessary to grab the lock --- if there's
***************
*** 186,197 ****
buf->freeNext = i + 1;
buf->freePrev = i - 1;
!
CLEAR_BUFFERTAG(&(buf->tag));
buf->buf_id = i;
buf->data = MAKE_OFFSET(block);
buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
buf->refcount = 0;
buf->io_in_progress_lock = LWLockAssign();
buf->cntx_lock = LWLockAssign();
--- 190,202 ----
buf->freeNext = i + 1;
buf->freePrev = i - 1;
! buf->freeState=-1;
CLEAR_BUFFERTAG(&(buf->tag));
buf->buf_id = i;
buf->data = MAKE_OFFSET(block);
buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
+
buf->refcount = 0;
buf->io_in_progress_lock = LWLockAssign();
buf->cntx_lock = LWLockAssign();
***************
*** 258,264 ****
size += hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
/* size of buffer descriptors */
! size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc));
/* size of data pages */
size += NBuffers * MAXALIGN(BLCKSZ);
--- 263,269 ----
size += hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
/* size of buffer descriptors */
! size += MAXALIGN((NBuffers + FREELIST_QUEUE_SIZE) * sizeof(BufferDesc));
/* size of data pages */
size += NBuffers * MAXALIGN(BLCKSZ);
***************
*** 272,274 ****
--- 277,289 ----
return size;
}
+
+
+
+
+
+
+
+
+
+
freelist.c.diffapplication/octet-stream; name=freelist.c.diffDownload
*** freelist.c.orig Fri Jun 21 05:29:34 2002
--- freelist.c Wed Jun 18 16:16:28 2003
***************
*** 32,58 ****
#include "storage/ipc.h"
#include "storage/proc.h"
! static BufferDesc *SharedFreeList;
!
/*
* State-checking macros
*/
#define IsInQueue(bf) \
( \
! AssertMacro((bf->freeNext != INVALID_DESCRIPTOR)), \
! AssertMacro((bf->freePrev != INVALID_DESCRIPTOR)), \
AssertMacro((bf->flags & BM_FREE)) \
)
#define IsNotInQueue(bf) \
( \
! AssertMacro((bf->freeNext == INVALID_DESCRIPTOR)), \
! AssertMacro((bf->freePrev == INVALID_DESCRIPTOR)), \
AssertMacro(! (bf->flags & BM_FREE)) \
)
/*
* AddBufferToFreelist
--- 32,64 ----
#include "storage/ipc.h"
#include "storage/proc.h"
+ extern long int AmQueue_List_Descriptor;
+ extern long int A1outQueue_List_Descriptor;
! static BufferDesc *A1QueueFreeList;
! static BufferDesc *A1OutQueueFreeList;
! static BufferDesc *AmQueueFreeList;
/*
* State-checking macros
*/
#define IsInQueue(bf) \
( \
! AssertMacro((bf->freeNext != INVALID_DESCRIPTOR)), \
! AssertMacro((bf->freePrev != INVALID_DESCRIPTOR)), \
AssertMacro((bf->flags & BM_FREE)) \
)
#define IsNotInQueue(bf) \
( \
! AssertMacro((bf->freeNext == INVALID_DESCRIPTOR)), \
! AssertMacro((bf->freePrev == INVALID_DESCRIPTOR)), \
AssertMacro(! (bf->flags & BM_FREE)) \
)
+ static int addAm=0;
+ static int addAll=0;
+ static int addA1=0;
/*
* AddBufferToFreelist
***************
*** 65,82 ****
static void
AddBufferToFreelist(BufferDesc *bf)
{
#ifdef BMTRACE
_bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
#endif /* BMTRACE */
IsNotInQueue(bf);
- /* change bf so it points to inFrontOfNew and its successor */
- bf->freePrev = SharedFreeList->freePrev;
- bf->freeNext = Free_List_Descriptor;
! /* insert new into chain */
! BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
}
--- 71,132 ----
static void
AddBufferToFreelist(BufferDesc *bf)
{
+ BufferDesc *buf;
#ifdef BMTRACE
_bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
#endif /* BMTRACE */
IsNotInQueue(bf);
+ /* in a1in queue or not in */
+ addAll++;
+ if(bf->freeState == -1 ) { /* PUT INTO IN A1IN_QUEUE */
+ /* change bf so it points to inFrontOfNew and its successor */
+ /* insert new into chain */
+ bf->freePrev = A1QueueFreeList->freePrev;
+ bf->freeNext = Free_List_Descriptor;
! A1QueueFreeList->freeState++;
! addA1++;
! bf->freeState=0; /* in A1 queue */
! }else { /* was in a1out queue or am queue*/
! /* change bf so it points to inFrontOfNew and its successor */
! /* add to Am Queue*/
! bf->freePrev = AmQueueFreeList->freePrev;
! bf->freeNext = AmQueue_List_Descriptor;
!
! AmQueueFreeList->freeState++;
! addAm++;
! /* if there are so many entry in Am queue ,moving am queue to a1 queue */
! if (AmQueueFreeList->freeState >= A1QueueFreeList->freeState) {
!
! /* determine buffer removing from Am queue */
!
! buf=&(BufferDescriptors[AmQueueFreeList->freeNext]);
!
! /* remove from Am */
! BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
! BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
! AmQueueFreeList->freeState--;
!
!
! /* add to A1*/
!
! buf->freePrev=A1QueueFreeList->freePrev;
! buf->freeNext=Free_List_Descriptor;
! BufferDescriptors[buf->freeNext].freePrev = buf->buf_id;
! BufferDescriptors[buf->freePrev].freeNext = buf->buf_id;
! A1QueueFreeList->freeState++;
!
! buf->freeState=0; /* in A1 queue */
! }
! bf->freeState=1; /* in Am queue */
! }
! if(addAll%256==0) {
! //elog(NOTICE,"all %d A1 : %d Am : %d",addAll,addA1,addAm);
! }
!
! BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
}
***************
*** 98,107 ****
IsInQueue(buf);
/* remove from freelist queue */
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
!
/* mark buffer as no longer free */
buf->flags &= ~BM_FREE;
}
--- 148,163 ----
IsInQueue(buf);
/* remove from freelist queue */
+
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
+
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
! if(buf->freeState==0) {
! A1QueueFreeList->freeState--;
! }else if(buf->freeState==1) {
! AmQueueFreeList->freeState--;
! }
/* mark buffer as no longer free */
buf->flags &= ~BM_FREE;
}
***************
*** 187,215 ****
}
#endif
/*
* GetFreeBuffer() -- get the 'next' buffer from the freelist.
*/
BufferDesc *
GetFreeBuffer(void)
{
! BufferDesc *buf;
!
! if (Free_List_Descriptor == SharedFreeList->freeNext)
! {
! /* queue is empty. All buffers in the buffer pool are pinned. */
! elog(ERROR, "out of free buffers: time to abort!");
! return NULL;
}
- buf = &(BufferDescriptors[SharedFreeList->freeNext]);
! /* remove from freelist queue */
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
buf->flags &= ~(BM_FREE);
return buf;
}
--- 243,284 ----
}
#endif
+
/*
* GetFreeBuffer() -- get the 'next' buffer from the freelist.
*/
BufferDesc *
GetFreeBuffer(void)
{
! BufferDesc *buf;
! if (Free_List_Descriptor != A1QueueFreeList->freeNext) {
! /* A1 queue is not empty. get from it.*/
! buf = &(BufferDescriptors[A1QueueFreeList->freeNext]);
! A1QueueFreeList->freeState--;
! }else if (AmQueue_List_Descriptor != AmQueueFreeList->freeNext) {
! /* A1 queue is empty but Am queue is not empty get from it */
! buf = &(BufferDescriptors[AmQueueFreeList->freeNext]);
! AmQueueFreeList->freeState--;
! }else if (Free_List_Descriptor == A1QueueFreeList->freeNext &&
! AmQueue_List_Descriptor != AmQueueFreeList->freeNext) {
! /* all queue is empty. All buffers in the buffer pool are pinned. */
! elog(ERROR, "out of free buffers: time to abort!");
! return NULL;
! }else {
! elog(ERROR, "will never occur");
! return NULL;
}
! /* remove from freelist queue */
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
buf->flags &= ~(BM_FREE);
+
+
+ buf->freeState=-1; /* free */
return buf;
}
***************
*** 224,245 ****
void
InitFreeList(bool init)
{
! SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]);
if (init)
{
/* we only do this once, normally in the postmaster */
! SharedFreeList->data = INVALID_OFFSET;
! SharedFreeList->flags = 0;
! SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
! SharedFreeList->buf_id = Free_List_Descriptor;
/* insert it into a random spot in the circular queue */
! SharedFreeList->freeNext = BufferDescriptors[0].freeNext;
! SharedFreeList->freePrev = 0;
! BufferDescriptors[SharedFreeList->freeNext].freePrev =
! BufferDescriptors[SharedFreeList->freePrev].freeNext =
Free_List_Descriptor;
}
}
--- 293,330 ----
void
InitFreeList(bool init)
{
! Buffer i;
! A1QueueFreeList = &(BufferDescriptors[Free_List_Descriptor]);
! AmQueueFreeList = &(BufferDescriptors[AmQueue_List_Descriptor]);
if (init)
{
/* we only do this once, normally in the postmaster */
! A1QueueFreeList->data = INVALID_OFFSET;
! A1QueueFreeList->flags = 0;
! A1QueueFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
! A1QueueFreeList->buf_id = Free_List_Descriptor;
/* insert it into a random spot in the circular queue */
! A1QueueFreeList->freeNext = BufferDescriptors[0].freeNext;
! A1QueueFreeList->freePrev = 0;
! // A1QueueFreeList->freeState= ;
! BufferDescriptors[A1QueueFreeList->freeNext].freePrev =
! BufferDescriptors[A1QueueFreeList->freePrev].freeNext =
Free_List_Descriptor;
+ for(i=A1QueueFreeList->freeNext;i!=Free_List_Descriptor;i=BufferDescriptors[i].freeNext) {
+ A1QueueFreeList->freeState++;
+ }
+
+ AmQueueFreeList->data= INVALID_OFFSET;
+ AmQueueFreeList->flags=0;
+ AmQueueFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
+ AmQueueFreeList->buf_id = AmQueue_List_Descriptor;
+
+ /* Am Queue was initilized as empty.*/
+ AmQueueFreeList->freeNext = AmQueue_List_Descriptor;
+ AmQueueFreeList->freePrev = AmQueue_List_Descriptor;
+ AmQueueFreeList->freeState=0;
}
}
***************
*** 254,265 ****
int i;
BufferDesc *buf;
! buf = &(BufferDescriptors[SharedFreeList->freeNext]);
for (i = 0; i < nfree; i++, buf = &(BufferDescriptors[buf->freeNext]))
{
if (!(buf->flags & (BM_FREE)))
{
! if (buf != SharedFreeList)
printf("\tfree list corrupted: %d flags %x\n",
buf->buf_id, buf->flags);
else
--- 339,350 ----
int i;
BufferDesc *buf;
! buf = &(BufferDescriptors[A1QueueFreeList->freeNext]);
for (i = 0; i < nfree; i++, buf = &(BufferDescriptors[buf->freeNext]))
{
if (!(buf->flags & (BM_FREE)))
{
! if (buf != A1QueueFreeList)
printf("\tfree list corrupted: %d flags %x\n",
buf->buf_id, buf->flags);
else
***************
*** 271,277 ****
printf("\tfree list links corrupted: %d %ld %ld\n",
buf->buf_id, buf->freePrev, buf->freeNext);
}
! if (buf != SharedFreeList)
printf("\tfree list corrupted: %d-th buffer is %d\n",
nfree, buf->buf_id);
}
--- 356,362 ----
printf("\tfree list links corrupted: %d %ld %ld\n",
buf->buf_id, buf->freePrev, buf->freeNext);
}
! if (buf != A1QueueFreeList)
printf("\tfree list corrupted: %d-th buffer is %d\n",
nfree, buf->buf_id);
}
***************
*** 287,299 ****
{
BufferDesc *buf;
! if (SharedFreeList->freeNext == Free_List_Descriptor)
{
printf("free list is empty.\n");
return;
}
! buf = &(BufferDescriptors[SharedFreeList->freeNext]);
for (;;)
{
int i = (buf - BufferDescriptors);
--- 372,384 ----
{
BufferDesc *buf;
! if (A1QueueFreeList->freeNext == Free_List_Descriptor)
{
printf("free list is empty.\n");
return;
}
! buf = &(BufferDescriptors[A1QueueFreeList->freeNext]);
for (;;)
{
int i = (buf - BufferDescriptors);
Looks good to me --- we will include it in 7.4.
Your patch has been added to the PostgreSQL unapplied patches list at:
http://momjian.postgresql.org/cgi-bin/pgpatches
I will try to apply it within the next 48 hours.
---------------------------------------------------------------------------
Yutaka tanida wrote:
Hi.
I implement 2Q algorithm to PostgreSQL for buffer management , instead
of LRU.
It's known as low overhead and high performance than LRU. If you have
some interests , see following URL.http://www.vldb.org/conf/1994/P439.PDF
In my test (pgbench -S) , it improves 4% cache hit rate and 2% up
performance comparing from LRU.Do you have any interest about this patch?
--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/
[ Attachment, skipping... ]
[ Attachment, skipping... ]
[ Attachment, skipping... ]
---------------------------(end of broadcast)---------------------------
TIP 4: Don't 'kill -9' the postmaster
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 359-1001
+ If your life is a hard drive, | 13 Roberts Road
+ Christ can be your backup. | Newtown Square, Pennsylvania 19073
On Mon, 23 Jun 2003 23:49:17 -0400 (EDT)
Bruce Momjian <pgman@candle.pha.pa.us> wrote:
Looks good to me --- we will include it in 7.4.
Thanks.But please note it is not completed yet. I must implement more ,
and move configurable parameter to postgresql.conf file.
--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/
OK, thanks. I will remove it from the queue, and someone suggested a
different algorithm today:
I was researching on cache replacement strategy as well. 2Q has one
disadvantage see this exellent paper:
http://www.almaden.ibm.com/cs/people/dmodha/#ARC see the paper
"ARC: A Self-Tuning, Low Overhead Replacement Cache" for theory and "One
Up on LRU" for implementation details. ARC requires no tuning and can
switch fast between chaging patterns. Best of all is it is resistant to a
"sequential scan" pattern. and i think it's even easier to implement then
2q :)
---------------------------------------------------------------------------
Yutaka tanida wrote:
On Mon, 23 Jun 2003 23:49:17 -0400 (EDT)
Bruce Momjian <pgman@candle.pha.pa.us> wrote:Looks good to me --- we will include it in 7.4.
Thanks.But please note it is not completed yet. I must implement more ,
and move configurable parameter to postgresql.conf file.--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/---------------------------(end of broadcast)---------------------------
TIP 9: the planner will ignore your desire to choose an index scan if your
joining column's datatypes do not match
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 359-1001
+ If your life is a hard drive, | 13 Roberts Road
+ Christ can be your backup. | Newtown Square, Pennsylvania 19073
Patch removed at author's request.
---------------------------------------------------------------------------
Yutaka tanida wrote:
Hi.
I implement 2Q algorithm to PostgreSQL for buffer management , instead
of LRU.
It's known as low overhead and high performance than LRU. If you have
some interests , see following URL.http://www.vldb.org/conf/1994/P439.PDF
In my test (pgbench -S) , it improves 4% cache hit rate and 2% up
performance comparing from LRU.Do you have any interest about this patch?
--
Yutaka tanida <yutaka@nonsensecorner.com>
http://www.nonsensecorner.com/
[ Attachment, skipping... ]
[ Attachment, skipping... ]
[ Attachment, skipping... ]
---------------------------(end of broadcast)---------------------------
TIP 4: Don't 'kill -9' the postmaster
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 359-1001
+ If your life is a hard drive, | 13 Roberts Road
+ Christ can be your backup. | Newtown Square, Pennsylvania 19073