scan_recycle_buffers
Patch to implement buffer cache recycling for scans, as being discussed
on pgsql-hackers.
Applies cleanly to cvstip, passes make installcheck when used by default
for all SeqScans. Tested with scan_recycle_buffers = 1,4,8,16
Should be regarded as WIP. Presumably there are some failure conditions
that require the buffer to be reset; these have not yet been considered.
No docs.
SET scan_recycle_buffers = N
default = 0
8 <= N <= 64 would yield benefits according to earlier results
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
Attachments:
scan_recycle_buffers.v1.patchtext/x-patch; charset=UTF-8; name=scan_recycle_buffers.v1.patchDownload
Index: src/backend/executor/nodeSeqscan.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSeqscan.c,v
retrieving revision 1.63
diff -c -r1.63 nodeSeqscan.c
*** src/backend/executor/nodeSeqscan.c 5 Jan 2007 22:19:28 -0000 1.63
--- src/backend/executor/nodeSeqscan.c 9 Mar 2007 17:36:22 -0000
***************
*** 24,29 ****
--- 24,30 ----
*/
#include "postgres.h"
+ #include "miscadmin.h"
#include "access/heapam.h"
#include "executor/execdebug.h"
#include "executor/nodeSeqscan.h"
***************
*** 150,155 ****
--- 151,159 ----
currentRelation = ExecOpenScanRelation(estate,
((SeqScan *) node->ps.plan)->scanrelid);
+ if (NScanRecycleBuffers > 0 && RelationGetNumberOfBlocks(currentRelation) > NBuffers)
+ StrategyHintRecycleBuffers(NScanRecycleBuffers);
+
currentScanDesc = heap_beginscan(currentRelation,
estate->es_snapshot,
0,
***************
*** 272,277 ****
--- 276,283 ----
* close the heap relation.
*/
ExecCloseScanRelation(relation);
+
+ StrategyHintRecycleBuffers(0);
}
/* ----------------------------------------------------------------
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.215
diff -c -r1.215 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c 1 Feb 2007 19:10:27 -0000 1.215
--- src/backend/storage/buffer/bufmgr.c 9 Mar 2007 17:36:22 -0000
***************
*** 320,325 ****
--- 320,326 ----
int buf_id;
volatile BufferDesc *buf;
bool valid;
+ bool lock_held = false;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln, blockNum);
***************
*** 384,390 ****
* it would be bad to hold the spinlock while possibly waking up other
* processes.
*/
! buf = StrategyGetBuffer();
Assert(buf->refcount == 0);
--- 385,391 ----
* it would be bad to hold the spinlock while possibly waking up other
* processes.
*/
! buf = StrategyGetBuffer(&lock_held);
Assert(buf->refcount == 0);
***************
*** 395,401 ****
PinBuffer_Locked(buf);
/* Now it's safe to release the freelist lock */
! LWLockRelease(BufFreelistLock);
/*
* If the buffer was dirty, try to write it out. There is a race
--- 396,403 ----
PinBuffer_Locked(buf);
/* Now it's safe to release the freelist lock */
! if (lock_held)
! LWLockRelease(BufFreelistLock);
/*
* If the buffer was dirty, try to write it out. There is a race
***************
*** 884,891 ****
PrivateRefCount[b]--;
if (PrivateRefCount[b] == 0)
{
- bool immed_free_buffer = false;
-
/* I'd better not still hold any locks on the buffer */
Assert(!LWLockHeldByMe(buf->content_lock));
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
--- 886,891 ----
***************
*** 899,915 ****
/* Update buffer usage info, unless this is an internal access */
if (normalAccess)
{
if (!strategy_hint_vacuum)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
buf->usage_count++;
}
- else
- {
- /* VACUUM accesses don't bump usage count, instead... */
- if (buf->refcount == 0 && buf->usage_count == 0)
- immed_free_buffer = true;
- }
}
if ((buf->flags & BM_PIN_COUNT_WAITER) &&
--- 899,910 ----
/* Update buffer usage info, unless this is an internal access */
if (normalAccess)
{
+ /* VACUUM accesses don't bump usage count, instead... */
if (!strategy_hint_vacuum)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
buf->usage_count++;
}
}
if ((buf->flags & BM_PIN_COUNT_WAITER) &&
***************
*** 924,937 ****
}
else
UnlockBufHdr(buf);
-
- /*
- * If VACUUM is releasing an otherwise-unused buffer, send it to the
- * freelist for near-term reuse. We put it at the tail so that it
- * won't be used before any invalid buffers that may exist.
- */
- if (immed_free_buffer)
- StrategyFreeBuffer(buf, false);
}
}
--- 919,924 ----
Index: src/backend/storage/buffer/freelist.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v
retrieving revision 1.58
diff -c -r1.58 freelist.c
*** src/backend/storage/buffer/freelist.c 5 Jan 2007 22:19:37 -0000 1.58
--- src/backend/storage/buffer/freelist.c 9 Mar 2007 17:36:22 -0000
***************
*** 39,47 ****
/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;
/* Backend-local state about whether currently vacuuming */
bool strategy_hint_vacuum = false;
!
/*
* StrategyGetBuffer
--- 39,57 ----
/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;
+ /* Buffer Recycling */
+ #define MAX_RECYCLE_BUF_IDS 128
+ static volatile int LocalRecycleBufIds[MAX_RECYCLE_BUF_IDS];
+
+ #define BUF_ID_NOT_SET -1
+ int nextVictimRecycleBufId = BUF_ID_NOT_SET;
+
+ int NScanRecycleBuffers = 0;
+ int ThisScanRecycleBuffers = 0;
+
/* Backend-local state about whether currently vacuuming */
bool strategy_hint_vacuum = false;
! bool strategy_hint_recycle = false;
/*
* StrategyGetBuffer
***************
*** 56,66 ****
* the caller must release that lock once the spinlock is dropped.
*/
volatile BufferDesc *
! StrategyGetBuffer(void)
{
volatile BufferDesc *buf;
int trycounter;
LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
/*
--- 66,118 ----
* the caller must release that lock once the spinlock is dropped.
*/
volatile BufferDesc *
! StrategyGetBuffer(bool *lock_held)
{
volatile BufferDesc *buf;
int trycounter;
+ if (strategy_hint_recycle)
+ {
+ /*
+ * Get the next buffer from our local cyclic cache.
+ * Note that we don't need to hold the BufFreelistLock
+ * to get this buffer, because we aren't accessing any
+ * shared memory.
+ *
+ * Run private "clock cycle"
+ */
+ if (++nextVictimRecycleBufId >= ThisScanRecycleBuffers)
+ nextVictimRecycleBufId = 0;
+
+ /*
+ * If that slot hasn't been filled yet, use a new buffer
+ * allocated via the main shared buffer allocation strategy
+ */
+ if (LocalRecycleBufIds[nextVictimRecycleBufId] != BUF_ID_NOT_SET)
+ {
+ buf = &BufferDescriptors[LocalRecycleBufIds[nextVictimRecycleBufId]];
+ /*
+ * If the buffer is pinned we cannot use it in any circumstance.
+ * If usage_count == 0 then the buffer is fair game.
+ *
+ * We also choose this buffer if usage_count == 1. Strictly, this
+ * might sometimes be the wrong thing to do, but we rely on the
+ * high probability that it was this process that last touched
+ * the buffer. We do have to pick a victim, so it may as well be
+ * this one as any of the seldom touched blocks in the buffer pool.
+ */
+ *lock_held = false;
+ LockBufHdr(buf);
+ if (buf->refcount == 0 && buf->usage_count <= 1)
+ return buf;
+ UnlockBufHdr(buf);
+ }
+ }
+
+ /*
+ * If our selected buffer wasn't available, pick another...
+ */
+ *lock_held = true;
LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
/*
***************
*** 86,96 ****
*/
LockBufHdr(buf);
if (buf->refcount == 0 && buf->usage_count == 0)
return buf;
UnlockBufHdr(buf);
}
! /* Nothing on the freelist, so run the "clock sweep" algorithm */
trycounter = NBuffers;
for (;;)
{
--- 138,152 ----
*/
LockBufHdr(buf);
if (buf->refcount == 0 && buf->usage_count == 0)
+ {
+ if (strategy_hint_recycle)
+ LocalRecycleBufIds[nextVictimRecycleBufId] = buf->buf_id;
return buf;
+ }
UnlockBufHdr(buf);
}
! /* Nothing on the freelist, so run the shared "clock sweep" algorithm */
trycounter = NBuffers;
for (;;)
{
***************
*** 105,111 ****
--- 161,171 ----
*/
LockBufHdr(buf);
if (buf->refcount == 0 && buf->usage_count == 0)
+ {
+ if (strategy_hint_recycle)
+ LocalRecycleBufIds[nextVictimRecycleBufId] = buf->buf_id;
return buf;
+ }
if (buf->usage_count > 0)
{
buf->usage_count--;
***************
*** 197,204 ****
--- 257,309 ----
StrategyHintVacuum(bool vacuum_active)
{
strategy_hint_vacuum = vacuum_active;
+ if (vacuum_active)
+ StrategyHintRecycleBuffers(NScanRecycleBuffers);
+ else
+ StrategyHintRecycleBuffers(0);
}
+ /*
+ * StrategyHintRecycleOwnBuffers -- tell us whether to recycle buffers
+ * originally filled by this process. This is intended for use by
+ * callers who access blocks in a sequential pattern. Non-sequential
+ * access patterns could be disrupted severely by using this hint.
+ *
+ * Initial data suggests
+ * nRecycleBuffers = 16 for read-only scans (Mark Kirkwood)
+ * nRecycleBuffers = 32 for VACUUMs (Itagaki Takahiro)
+ * probably
+ * nRecycleBuffers >=128 for normal write-intensive tasks
+ * to allow for bgwriter activity
+ */
+ void
+ StrategyHintRecycleBuffers(int nRecycleBuffers)
+ {
+ int i;
+
+ if (nRecycleBuffers < 0 || nRecycleBuffers > NBuffers
+ || nRecycleBuffers > NScanRecycleBuffers)
+ nRecycleBuffers = 0;
+
+ if (nRecycleBuffers > NScanRecycleBuffers)
+ ThisScanRecycleBuffers = NScanRecycleBuffers;
+ else
+ ThisScanRecycleBuffers = nRecycleBuffers;
+
+ if (ThisScanRecycleBuffers > 0)
+ {
+ strategy_hint_recycle = true;
+
+ /* just before 1st element, to allow for preincrement */
+ nextVictimRecycleBufId = -1;
+
+ /* prepare the cyclic buffer */
+ for (i = 0; i < MAX_RECYCLE_BUF_IDS; i++)
+ LocalRecycleBufIds[i] = BUF_ID_NOT_SET;
+ }
+ else
+ strategy_hint_recycle = false;
+ }
/*
* StrategyShmemSize
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.379
diff -c -r1.379 guc.c
*** src/backend/utils/misc/guc.c 6 Mar 2007 02:06:14 -0000 1.379
--- src/backend/utils/misc/guc.c 9 Mar 2007 17:36:24 -0000
***************
*** 1184,1189 ****
--- 1184,1199 ----
},
{
+ {"scan_recycle_buffers", PGC_USERSET, RESOURCES_MEM,
+ gettext_noop("Sets the number of buffers to recycle during scans"),
+ NULL,
+ GUC_UNIT_BLOCKS
+ },
+ &NScanRecycleBuffers,
+ 0, 0, 128, NULL, NULL
+ },
+
+ {
{"port", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
gettext_noop("Sets the TCP port the server listens on."),
NULL
Index: src/include/miscadmin.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/miscadmin.h,v
retrieving revision 1.193
diff -c -r1.193 miscadmin.h
*** src/include/miscadmin.h 1 Mar 2007 14:52:04 -0000 1.193
--- src/include/miscadmin.h 9 Mar 2007 17:36:25 -0000
***************
*** 128,133 ****
--- 128,134 ----
extern DLLIMPORT char *DataDir;
extern DLLIMPORT int NBuffers;
+ extern DLLIMPORT int NScanRecycleBuffers;
extern int MaxBackends;
extern DLLIMPORT int MyProcPid;
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.89
diff -c -r1.89 buf_internals.h
*** src/include/storage/buf_internals.h 5 Jan 2007 22:19:57 -0000 1.89
--- src/include/storage/buf_internals.h 9 Mar 2007 17:36:28 -0000
***************
*** 184,190 ****
*/
/* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(void);
extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
extern int StrategySyncStart(void);
extern Size StrategyShmemSize(void);
--- 184,190 ----
*/
/* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(bool *lock_held);
extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
extern int StrategySyncStart(void);
extern Size StrategyShmemSize(void);
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.102
diff -c -r1.102 bufmgr.h
*** src/include/storage/bufmgr.h 5 Jan 2007 22:19:57 -0000 1.102
--- src/include/storage/bufmgr.h 9 Mar 2007 17:36:28 -0000
***************
*** 133,138 ****
--- 133,140 ----
extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
BlockNumber firstDelBlock);
extern void DropDatabaseBuffers(Oid dbid);
+ extern void DropAllSharedBuffers(void);
+
#ifdef NOT_USED
extern void PrintPinnedBufs(void);
***************
*** 157,161 ****
--- 159,164 ----
/* in freelist.c */
extern void StrategyHintVacuum(bool vacuum_active);
+ extern void StrategyHintRecycleBuffers(int nRecycleBuffers);
#endif
Simon Riggs wrote:
Patch to implement buffer cache recycling for scans, as being discussed
on pgsql-hackers.
A few questions come to mind:
How does it behave with Jeff's synchronized seq scans patch?
I wonder if calling RelationGetNumberOfBlocks on every seq scan becomes
a performance issue for tiny tables with for example just 1 page. It
performs an lseek, which isn't free.
What happens if multiple backends choose overlapping sets of buffers to
recycle?
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
On Fri, 2007-03-09 at 20:08 +0000, Heikki Linnakangas wrote:
Simon Riggs wrote:
Patch to implement buffer cache recycling for scans, as being discussed
on pgsql-hackers. A few questions come to mind:
Good questions. I don't expect this will go through easily, so we need
to examine these thoughts thoroughly.
How does it behave with Jeff's synchronized seq scans patch?
I've offered Jeff lots of support throughout that patch's development
and its a feature I'd like to see. The current synch scan patch relies
upon the cache spoiling effect to gain its benefit. I think that can be
tightened up, so that we can make both work.
Currently synch scans help DSS apps but not OLTP. This patch reduces the
negative effects of VACUUM on OLTP workloads, as well as helping DSS.
I wonder if calling RelationGetNumberOfBlocks on every seq scan becomes
a performance issue for tiny tables with for example just 1 page. It
performs an lseek, which isn't free.
Jeff's patch does this also, for similar reasons.
What happens if multiple backends choose overlapping sets of buffers to
recycle?
They won't. If a buffer is pinned, it will fall out of the list of
buffers being recycled and not be reused. So they will each tend towards
a unique list of buffers.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
Heikki Linnakangas <heikki@enterprisedb.com> writes:
I wonder if calling RelationGetNumberOfBlocks on every seq scan becomes
a performance issue for tiny tables with for example just 1 page. It
performs an lseek, which isn't free.
We do that anyway; but certainly Simon's patch ought not be injecting
an additional one.
regards, tom lane
On Fri, 2007-03-09 at 16:45 -0500, Tom Lane wrote:
Heikki Linnakangas <heikki@enterprisedb.com> writes:
I wonder if calling RelationGetNumberOfBlocks on every seq scan becomes
a performance issue for tiny tables with for example just 1 page. It
performs an lseek, which isn't free. We do that anyway; but certainly Simon's patch ought not be injecting
an additional one.
It should be possible to pass that down from the planner to the
executor, in certain cases. Or at least pass down the possibility that
such a check might be worthwhile.
Another approach might be to make the call after the first ~10 I/Os on a
SeqScan, after which an lseek will be just noise. That way an
all-in-cache scan would never need it at all. That's easy to arrange
because the hint is invoked from the exec nodes themselves.
We probably need to get some measurements for the main benefit of the
patch before we look further into those thoughts.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
"Simon Riggs" <simon@2ndquadrant.com> writes:
On Fri, 2007-03-09 at 16:45 -0500, Tom Lane wrote:
We do that anyway; but certainly Simon's patch ought not be injecting
an additional one.
It should be possible to pass that down from the planner to the
executor, in certain cases.
Huh? See HeapScanDesc->rs_nblocks.
regards, tom lane
On Fri, 2007-03-09 at 20:37 +0000, Simon Riggs wrote:
I wonder if calling RelationGetNumberOfBlocks on every seq scan becomes
a performance issue for tiny tables with for example just 1 page. It
performs an lseek, which isn't free. Jeff's patch does this also, for similar reasons.
As Tom pointed out, the value is already in memory by the time it gets
to my code. My code just reads that value from memory.
Regards,
Jeff Davis
On Fri, 2007-03-09 at 20:08 +0000, Heikki Linnakangas wrote:
Simon Riggs wrote:
Patch to implement buffer cache recycling for scans, as being discussed
on pgsql-hackers. A few questions come to mind:
How does it behave with Jeff's synchronized seq scans patch?
I will test it and post my results. I would expect that the CPU usage
will increase, but it might not make a big difference in the overall
cache hit rate if you count OS buffer cache hits.
Regards,
Jeff Davis
On Fri, 2007-03-09 at 18:05 -0500, Tom Lane wrote:
"Simon Riggs" <simon@2ndquadrant.com> writes:
On Fri, 2007-03-09 at 16:45 -0500, Tom Lane wrote:
We do that anyway; but certainly Simon's patch ought not be injecting
an additional one. It should be possible to pass that down from the planner to the
executor, in certain cases. Huh? See HeapScanDesc->rs_nblocks.
Many thanks.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
On Sat, 2007-03-10 at 07:59 +0000, Simon Riggs wrote:
On Fri, 2007-03-09 at 18:05 -0500, Tom Lane wrote:
"Simon Riggs" <simon@2ndquadrant.com> writes:
On Fri, 2007-03-09 at 16:45 -0500, Tom Lane wrote:
We do that anyway; but certainly Simon's patch ought not be injecting
an additional one. It should be possible to pass that down from the planner to the
executor, in certain cases. Huh? See HeapScanDesc->rs_nblocks.
Many thanks.
New patch enclosed, implementation as you've requested.
Not ready to apply yet, but good for testing.
COPY command now also uses this hint, to allow test results and
discussion. Others could also, perhaps needing different values.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
Attachments:
scan_recycle_buffers.v2.patchtext/x-patch; charset=UTF-8; name=scan_recycle_buffers.v2.patchDownload
Index: src/backend/commands/copy.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.277
diff -c -r1.277 copy.c
*** src/backend/commands/copy.c 3 Mar 2007 19:32:54 -0000 1.277
--- src/backend/commands/copy.c 10 Mar 2007 08:45:50 -0000
***************
*** 1865,1870 ****
--- 1865,1872 ----
&& !XLogArchivingActive())
use_wal = false;
+ StrategyHintRecycleBuffers(NScanRecycleBuffers);
+
/* Initialize state variables */
cstate->fe_eof = false;
cstate->eol_type = EOL_UNKNOWN;
***************
*** 2115,2120 ****
--- 2117,2124 ----
}
}
+ StrategyHintRecycleBuffers(0);
+
/*
* If we skipped writing WAL for heaps, then we need to sync
*/
Index: src/backend/executor/nodeSeqscan.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSeqscan.c,v
retrieving revision 1.63
diff -c -r1.63 nodeSeqscan.c
*** src/backend/executor/nodeSeqscan.c 5 Jan 2007 22:19:28 -0000 1.63
--- src/backend/executor/nodeSeqscan.c 10 Mar 2007 08:45:51 -0000
***************
*** 24,29 ****
--- 24,30 ----
*/
#include "postgres.h"
+ #include "miscadmin.h"
#include "access/heapam.h"
#include "executor/execdebug.h"
#include "executor/nodeSeqscan.h"
***************
*** 154,159 ****
--- 155,166 ----
estate->es_snapshot,
0,
NULL);
+ /*
+ * For scans larger than shared buffer cache enable
+ * buffer recycling, if the user has requested it
+ */
+ if (currentScanDesc->rs_nblocks > NBuffers && NScanRecycleBuffers > 0)
+ StrategyHintRecycleBuffers(NScanRecycleBuffers);
node->ss_currentRelation = currentRelation;
node->ss_currentScanDesc = currentScanDesc;
***************
*** 272,277 ****
--- 279,286 ----
* close the heap relation.
*/
ExecCloseScanRelation(relation);
+
+ StrategyHintRecycleBuffers(0);
}
/* ----------------------------------------------------------------
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.215
diff -c -r1.215 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c 1 Feb 2007 19:10:27 -0000 1.215
--- src/backend/storage/buffer/bufmgr.c 10 Mar 2007 08:45:55 -0000
***************
*** 320,325 ****
--- 320,326 ----
int buf_id;
volatile BufferDesc *buf;
bool valid;
+ bool lock_held = false;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln, blockNum);
***************
*** 384,390 ****
* it would be bad to hold the spinlock while possibly waking up other
* processes.
*/
! buf = StrategyGetBuffer();
Assert(buf->refcount == 0);
--- 385,391 ----
* it would be bad to hold the spinlock while possibly waking up other
* processes.
*/
! buf = StrategyGetBuffer(&lock_held);
Assert(buf->refcount == 0);
***************
*** 395,401 ****
PinBuffer_Locked(buf);
/* Now it's safe to release the freelist lock */
! LWLockRelease(BufFreelistLock);
/*
* If the buffer was dirty, try to write it out. There is a race
--- 396,403 ----
PinBuffer_Locked(buf);
/* Now it's safe to release the freelist lock */
! if (lock_held)
! LWLockRelease(BufFreelistLock);
/*
* If the buffer was dirty, try to write it out. There is a race
***************
*** 884,891 ****
PrivateRefCount[b]--;
if (PrivateRefCount[b] == 0)
{
- bool immed_free_buffer = false;
-
/* I'd better not still hold any locks on the buffer */
Assert(!LWLockHeldByMe(buf->content_lock));
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
--- 886,891 ----
***************
*** 899,915 ****
/* Update buffer usage info, unless this is an internal access */
if (normalAccess)
{
if (!strategy_hint_vacuum)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
buf->usage_count++;
}
- else
- {
- /* VACUUM accesses don't bump usage count, instead... */
- if (buf->refcount == 0 && buf->usage_count == 0)
- immed_free_buffer = true;
- }
}
if ((buf->flags & BM_PIN_COUNT_WAITER) &&
--- 899,910 ----
/* Update buffer usage info, unless this is an internal access */
if (normalAccess)
{
+ /* VACUUM accesses don't bump usage count, instead... */
if (!strategy_hint_vacuum)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
buf->usage_count++;
}
}
if ((buf->flags & BM_PIN_COUNT_WAITER) &&
***************
*** 924,937 ****
}
else
UnlockBufHdr(buf);
-
- /*
- * If VACUUM is releasing an otherwise-unused buffer, send it to the
- * freelist for near-term reuse. We put it at the tail so that it
- * won't be used before any invalid buffers that may exist.
- */
- if (immed_free_buffer)
- StrategyFreeBuffer(buf, false);
}
}
--- 919,924 ----
Index: src/backend/storage/buffer/freelist.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v
retrieving revision 1.58
diff -c -r1.58 freelist.c
*** src/backend/storage/buffer/freelist.c 5 Jan 2007 22:19:37 -0000 1.58
--- src/backend/storage/buffer/freelist.c 10 Mar 2007 08:45:55 -0000
***************
*** 39,47 ****
/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;
/* Backend-local state about whether currently vacuuming */
bool strategy_hint_vacuum = false;
!
/*
* StrategyGetBuffer
--- 39,57 ----
/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;
+ /* Buffer Recycling */
+ #define MAX_RECYCLE_BUF_IDS 128
+ static volatile int LocalRecycleBufIds[MAX_RECYCLE_BUF_IDS];
+
+ #define BUF_ID_NOT_SET -1
+ int nextVictimRecycleBufId = BUF_ID_NOT_SET;
+
+ int NScanRecycleBuffers = 0;
+ int ThisScanRecycleBuffers = 0;
+
/* Backend-local state about whether currently vacuuming */
bool strategy_hint_vacuum = false;
! bool strategy_hint_recycle = false;
/*
* StrategyGetBuffer
***************
*** 56,66 ****
* the caller must release that lock once the spinlock is dropped.
*/
volatile BufferDesc *
! StrategyGetBuffer(void)
{
volatile BufferDesc *buf;
int trycounter;
LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
/*
--- 66,118 ----
* the caller must release that lock once the spinlock is dropped.
*/
volatile BufferDesc *
! StrategyGetBuffer(bool *lock_held)
{
volatile BufferDesc *buf;
int trycounter;
+ if (strategy_hint_recycle)
+ {
+ /*
+ * Get the next buffer from our local cyclic cache.
+ * Note that we don't need to hold the BufFreelistLock
+ * to get this buffer, because we aren't accessing any
+ * shared memory.
+ *
+ * Run private "clock cycle"
+ */
+ if (++nextVictimRecycleBufId >= ThisScanRecycleBuffers)
+ nextVictimRecycleBufId = 0;
+
+ /*
+ * If that slot hasn't been filled yet, use a new buffer
+ * allocated via the main shared buffer allocation strategy
+ */
+ if (LocalRecycleBufIds[nextVictimRecycleBufId] != BUF_ID_NOT_SET)
+ {
+ buf = &BufferDescriptors[LocalRecycleBufIds[nextVictimRecycleBufId]];
+ /*
+ * If the buffer is pinned we cannot use it in any circumstance.
+ * If usage_count == 0 then the buffer is fair game.
+ *
+ * We also choose this buffer if usage_count == 1. Strictly, this
+ * might sometimes be the wrong thing to do, but we rely on the
+ * high probability that it was this process that last touched
+ * the buffer. We do have to pick a victim, so it may as well be
+ * this one as any of the seldom touched blocks in the buffer pool.
+ */
+ *lock_held = false;
+ LockBufHdr(buf);
+ if (buf->refcount == 0 && buf->usage_count <= 1)
+ return buf;
+ UnlockBufHdr(buf);
+ }
+ }
+
+ /*
+ * If our selected buffer wasn't available, pick another...
+ */
+ *lock_held = true;
LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
/*
***************
*** 86,96 ****
*/
LockBufHdr(buf);
if (buf->refcount == 0 && buf->usage_count == 0)
return buf;
UnlockBufHdr(buf);
}
! /* Nothing on the freelist, so run the "clock sweep" algorithm */
trycounter = NBuffers;
for (;;)
{
--- 138,152 ----
*/
LockBufHdr(buf);
if (buf->refcount == 0 && buf->usage_count == 0)
+ {
+ if (strategy_hint_recycle)
+ LocalRecycleBufIds[nextVictimRecycleBufId] = buf->buf_id;
return buf;
+ }
UnlockBufHdr(buf);
}
! /* Nothing on the freelist, so run the shared "clock sweep" algorithm */
trycounter = NBuffers;
for (;;)
{
***************
*** 105,111 ****
--- 161,171 ----
*/
LockBufHdr(buf);
if (buf->refcount == 0 && buf->usage_count == 0)
+ {
+ if (strategy_hint_recycle)
+ LocalRecycleBufIds[nextVictimRecycleBufId] = buf->buf_id;
return buf;
+ }
if (buf->usage_count > 0)
{
buf->usage_count--;
***************
*** 197,204 ****
--- 257,309 ----
StrategyHintVacuum(bool vacuum_active)
{
strategy_hint_vacuum = vacuum_active;
+ if (vacuum_active)
+ StrategyHintRecycleBuffers(NScanRecycleBuffers);
+ else
+ StrategyHintRecycleBuffers(0);
}
+ /*
+ * StrategyHintRecycleOwnBuffers -- tell us whether to recycle buffers
+ * originally filled by this process. This is intended for use by
+ * callers who access blocks in a sequential pattern. Non-sequential
+ * access patterns could be disrupted severely by using this hint.
+ *
+ * Initial data suggests
+ * nRecycleBuffers = 16 for read-only scans (Mark Kirkwood)
+ * nRecycleBuffers = 32 for VACUUMs (Itagaki Takahiro)
+ * probably
+ * nRecycleBuffers >=128 for normal write-intensive tasks
+ * to allow for bgwriter activity
+ */
+ void
+ StrategyHintRecycleBuffers(int nRecycleBuffers)
+ {
+ int i;
+
+ if (nRecycleBuffers < 0 || nRecycleBuffers > NBuffers
+ || nRecycleBuffers > NScanRecycleBuffers)
+ nRecycleBuffers = 0;
+
+ if (nRecycleBuffers > NScanRecycleBuffers)
+ ThisScanRecycleBuffers = NScanRecycleBuffers;
+ else
+ ThisScanRecycleBuffers = nRecycleBuffers;
+
+ if (ThisScanRecycleBuffers > 0)
+ {
+ strategy_hint_recycle = true;
+
+ /* just before 1st element, to allow for preincrement */
+ nextVictimRecycleBufId = -1;
+
+ /* prepare the cyclic buffer */
+ for (i = 0; i < MAX_RECYCLE_BUF_IDS; i++)
+ LocalRecycleBufIds[i] = BUF_ID_NOT_SET;
+ }
+ else
+ strategy_hint_recycle = false;
+ }
/*
* StrategyShmemSize
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.379
diff -c -r1.379 guc.c
*** src/backend/utils/misc/guc.c 6 Mar 2007 02:06:14 -0000 1.379
--- src/backend/utils/misc/guc.c 10 Mar 2007 08:46:02 -0000
***************
*** 1184,1189 ****
--- 1184,1199 ----
},
{
+ {"scan_recycle_buffers", PGC_USERSET, RESOURCES_MEM,
+ gettext_noop("Sets the number of buffers to recycle during scans"),
+ NULL,
+ GUC_UNIT_BLOCKS
+ },
+ &NScanRecycleBuffers,
+ 0, 0, 128, NULL, NULL
+ },
+
+ {
{"port", PGC_POSTMASTER, CONN_AUTH_SETTINGS,
gettext_noop("Sets the TCP port the server listens on."),
NULL
Index: src/include/miscadmin.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/miscadmin.h,v
retrieving revision 1.193
diff -c -r1.193 miscadmin.h
*** src/include/miscadmin.h 1 Mar 2007 14:52:04 -0000 1.193
--- src/include/miscadmin.h 10 Mar 2007 08:46:02 -0000
***************
*** 128,133 ****
--- 128,134 ----
extern DLLIMPORT char *DataDir;
extern DLLIMPORT int NBuffers;
+ extern DLLIMPORT int NScanRecycleBuffers;
extern int MaxBackends;
extern DLLIMPORT int MyProcPid;
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.89
diff -c -r1.89 buf_internals.h
*** src/include/storage/buf_internals.h 5 Jan 2007 22:19:57 -0000 1.89
--- src/include/storage/buf_internals.h 10 Mar 2007 08:46:14 -0000
***************
*** 184,190 ****
*/
/* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(void);
extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
extern int StrategySyncStart(void);
extern Size StrategyShmemSize(void);
--- 184,190 ----
*/
/* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(bool *lock_held);
extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
extern int StrategySyncStart(void);
extern Size StrategyShmemSize(void);
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.102
diff -c -r1.102 bufmgr.h
*** src/include/storage/bufmgr.h 5 Jan 2007 22:19:57 -0000 1.102
--- src/include/storage/bufmgr.h 10 Mar 2007 08:46:14 -0000
***************
*** 133,138 ****
--- 133,140 ----
extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
BlockNumber firstDelBlock);
extern void DropDatabaseBuffers(Oid dbid);
+ extern void DropAllSharedBuffers(void);
+
#ifdef NOT_USED
extern void PrintPinnedBufs(void);
***************
*** 157,161 ****
--- 159,164 ----
/* in freelist.c */
extern void StrategyHintVacuum(bool vacuum_active);
+ extern void StrategyHintRecycleBuffers(int nRecycleBuffers);
#endif
"Simon Riggs" <simon@2ndquadrant.com> writes:
COPY command now also uses this hint, to allow test results and
discussion. Others could also, perhaps needing different values.
Hm. It occurs to me that different commands may want different size buffer
rings.
As I understand it, the reasons your buffer rings are more than just a single
target buffer are:
1) For sequential scans because it creates a window for synchronized
sequential scans.
2) For dirty buffers because evicting the dirty buffer will force an
XLogFlush and we want to give a chance for the WAL pointer to advance past
the buffer's LSN. Ie, to allow other transactions to do our fsync for us
since it won't cost them much extra if anything.
Can you log whenever your ring buffer provides a dirty buffer whose
LSN requires syncing the WAL log? That will help you figure out how large a
ring buffer you need to guarantee property 2.
--
Gregory Stark
EnterpriseDB http://www.enterprisedb.com
Simon Riggs wrote:
On Sat, 2007-03-10 at 07:59 +0000, Simon Riggs wrote:
On Fri, 2007-03-09 at 18:05 -0500, Tom Lane wrote:
"Simon Riggs" <simon@2ndquadrant.com> writes:
On Fri, 2007-03-09 at 16:45 -0500, Tom Lane wrote:
We do that anyway; but certainly Simon's patch ought not be injecting
an additional one.
It should be possible to pass that down from the planner to the
executor, in certain cases.
Huh? See HeapScanDesc->rs_nblocks.
Many thanks.
New patch enclosed, implementation as you've requested.
Not ready to apply yet, but good for testing.
A quick test using the setup for "Buffer cache is not scan resistant"
thread:
Firstly vanilla 8.3 from 20070310:
Shared Buffers Elapsed vmstat IO rate
-------------- ------- --------------
400MB 101 s 122 MB/s
128KB 79 s 155 MB/s [1]
Now apply cycle scan v2:
Shared Buffers Scan_recycle_buffers Elapsed vmstat IO rate
-------------- -------------------- ------- -------------
400MB 0 101 s 122 MB/s
400MB 8 78 s 155 MB/s
400MB 16 77 s 155 MB/s
400MB 32 78 s 155 MB/s
400MB 64 82 s 148 MB/s
400MB 128 93 s 128 MB/s
Certainly seems to have the desired effect!
Cheers
Mark
[1]: I'm not seeing 166 MB/s like previous 8.2.3 data, however 8.3 PGDATA
is located further toward the end of the disk array - which I suspect is
limiting the IO rate a little.
On Sat, 2007-03-10 at 09:42 +0000, Gregory Stark wrote:
"Simon Riggs" <simon@2ndquadrant.com> writes:
COPY command now also uses this hint, to allow test results and
discussion. Others could also, perhaps needing different values.
Hm. It occurs to me that different commands may want different size buffer
rings.
Yes, that's noted in comments in the patch. scan_recycle_buffers was
designed to allow us to test which types of scan benefit from which
settings.
As I understand it, the reasons your buffer rings are more than just a single
target buffer are:
1) For sequential scans because it creates a window for synchronized
sequential scans.
2) For dirty buffers because evicting the dirty buffer will force an
XLogFlush and we want to give a chance for the WAL pointer to advance past
the buffer's LSN. Ie, to allow other transactions to do our fsync for us
since it won't cost them much extra if anything.
Can you log whenever your ring buffer provides a dirty buffer whose
LSN requires syncing the WAL log? That will help you figure out how large a
ring buffer you need to guarantee property 2.
Hmm, again your thoughts mirrored my own, but this time you're slightly
ahead of me. I was just looking into the possibility of adaptive scans,
to allow synch scans to force the scan_recycle_buffer size higher. I
think having the size of the buffer vary during a scan seems sensible
also, within min and max limits.
I'll post some further thoughts tomorrow.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
On Sat, 2007-03-10 at 23:26 +1300, Mark Kirkwood wrote:
Simon Riggs wrote:
New patch enclosed, implementation as you've requested.
Not ready to apply yet, but good for testing.
A quick test using the setup for "Buffer cache is not scan resistant"
thread:
Firstly vanilla 8.3 from 20070310:
Shared Buffers Elapsed vmstat IO rate
-------------- ------- --------------
400MB 101 s 122 MB/s
128KB 79 s 155 MB/s [1]
Now apply cycle scan v2:
Shared Buffers Scan_recycle_buffers Elapsed vmstat IO rate
-------------- -------------------- ------- -------------
400MB 0 101 s 122 MB/s
400MB 8 78 s 155 MB/s
400MB 16 77 s 155 MB/s
400MB 32 78 s 155 MB/s
400MB 64 82 s 148 MB/s
400MB 128 93 s 128 MB/s
Certainly seems to have the desired effect!
Cheers
Mark
[1] I'm not seeing 166 MB/s like previous 8.2.3 data, however 8.3 PGDATA
is located further toward the end of the disk array - which I suspect is
limiting the IO rate a little.
That's good news, thanks very much for testing that.
Before we can claim success, we need a few more tests on VACUUM, COPY
and a null test case to show it doesn't affect typical workloads, except
to improve vacuuming. I'll see if we can arrange those at EDB on a
reasonable size system.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
Where is the final version of this patch? What patches are stuck in the
patch moderator queue?
---------------------------------------------------------------------------
Simon Riggs wrote:
On Sat, 2007-03-10 at 07:59 +0000, Simon Riggs wrote:
On Fri, 2007-03-09 at 18:05 -0500, Tom Lane wrote:
"Simon Riggs" <simon@2ndquadrant.com> writes:
On Fri, 2007-03-09 at 16:45 -0500, Tom Lane wrote:
We do that anyway; but certainly Simon's patch ought not be injecting
an additional one.
It should be possible to pass that down from the planner to the
executor, in certain cases.
Huh? See HeapScanDesc->rs_nblocks.
Many thanks.
New patch enclosed, implementation as you've requested.
Not ready to apply yet, but good for testing.
COPY command now also uses this hint, to allow test results and
discussion. Others could also, perhaps needing different values.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
[ Attachment, skipping... ]
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://www.enterprisedb.com
+ If your life is a hard drive, Christ can be your backup. +
On Mon, 2007-04-02 at 19:10 -0400, Bruce Momjian wrote:
Where is the final version of this patch? What patches are stuck in the
patch moderator queue?
We already discussed the dependency that exists with this patch and you
accepted that.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
Simon Riggs wrote:
On Mon, 2007-04-02 at 19:10 -0400, Bruce Momjian wrote:
Where is the final version of this patch? What patches are stuck in the
patch moderator queue?
We already discussed the dependency that exists with this patch and you
accepted that.
Oh, that was the patch. I forgot. I am getting confused over which
patches are finished by the authors, and which are on hold because of
merge issues or open community discussion issues.
Rather than ask if patches are "completed", I think "finished" is a
better word, meaning the author has finished working on it, and it is now
up to the community on how to proceed.
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://www.enterprisedb.com
+ If your life is a hard drive, Christ can be your backup. +