[RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

Started by Andres Freund almost 12 years ago - 28 messages
#1 Andres Freund
andres@2ndquadrant.com
1 attachment(s)

Hi,

I've been annoyed at the amount of memory used by the backend local
PrivateRefCount array for a couple of reasons:

a) The performance impact of AtEOXact_Buffers() on Assert() enabled
builds is really, really annoying.
b) On larger nodes, the L1/2/3 cache impact of randomly accessing a
   several-megabyte array at a high frequency is noticeable. I've
   seen access to it be the primary (yes, really) source of
   pipeline stalls.
c) On nodes with significant shared memory the sum of the per-backend
   arrays is a significant amount of memory that could very well be
   used more beneficially (a worked example below).
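
To put a rough number on c): with, say, shared_buffers = 96GB and 8kB
pages, NBuffers is ~12.6 million, so the int32 PrivateRefCount array
costs ~48MB per backend; across 200 backends that is roughly 9.4GB of
tracking state that is almost entirely zeroes.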

So what I have done in the attached proof of concept is to have a small
(8 currently) array of (buffer, pincount) that's searched linearly when
the refcount of a buffer is needed. When more than 8 buffers are pinned,
a hashtable is used to look up the values.
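
To make the scheme concrete, here is a minimal standalone sketch (not
the patch itself; RefCountEntry, get_entry() etc. are illustrative
names, and a toy linear-probing table stands in for PostgreSQL's
dynahash):

#include <stdint.h>
#include <stdio.h>

typedef int32_t Buffer;
#define InvalidBuffer 0
#define REFCOUNT_ARRAY_ENTRIES 8	/* one cacheline of entries */
#define HASH_SLOTS 1024			/* toy overflow table, no resizing */

typedef struct
{
	Buffer		buffer;
	int32_t		refcount;
} RefCountEntry;

static RefCountEntry array_part[REFCOUNT_ARRAY_ENTRIES];
static RefCountEntry hash_part[HASH_SLOTS];

/* toy linear-probing stand-in for the dynahash overflow table */
static RefCountEntry *
hash_entry(Buffer buffer, int create)
{
	uint32_t	i;

	for (i = 0; i < HASH_SLOTS; i++)
	{
		RefCountEntry *e = &hash_part[((uint32_t) buffer + i) % HASH_SLOTS];

		if (e->buffer == buffer)
			return e;
		if (e->buffer == InvalidBuffer)
		{
			if (!create)
				return NULL;
			e->buffer = buffer;
			e->refcount = 0;
			return e;
		}
	}
	return NULL;				/* toy table full */
}

/* the interesting part: linear array scan first, hash lookup second */
static RefCountEntry *
get_entry(Buffer buffer, int create)
{
	RefCountEntry *free_slot = NULL;
	int			i;

	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
	{
		if (array_part[i].buffer == buffer)
			return &array_part[i];
		if (free_slot == NULL && array_part[i].buffer == InvalidBuffer)
			free_slot = &array_part[i];
	}

	if (create && free_slot != NULL)
	{
		free_slot->buffer = buffer;
		free_slot->refcount = 0;
		return free_slot;
	}
	return hash_entry(buffer, create);
}

int
main(void)
{
	Buffer		b;

	/* pin 12 buffers once each: 8 land in the array, 4 spill over */
	for (b = 1; b <= 12; b++)
		get_entry(b, 1)->refcount++;

	printf("refcount of 3: %d\n", get_entry(3, 0)->refcount);
	printf("refcount of 11: %d\n", get_entry(11, 0)->refcount);
	return 0;
}

The actual patch additionally keeps a count of overflowed entries so
the hash probe can be skipped entirely while nothing has spilled.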

That seems to work fairly well. On the few tests I could run on my
laptop - I've done this during a flight - it's a small performance win
in all cases I could test, while saving a fair amount of memory.

Alternatively we could just get rid of the idea of tracking this per
backend, relying on tracking via resource managers...

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

0001-Make-backend-local-tracking-of-buffer-pins-more-effi.patch (text/x-patch; charset=us-ascii)
From 433f248c0f4c3e3d43d1cc955354e5dd5cddfcea Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 20 Mar 2014 21:46:34 +0100
Subject: [PATCH] Make backend local tracking of buffer pins more efficient.

---
 src/backend/storage/buffer/buf_init.c |  25 ---
 src/backend/storage/buffer/bufmgr.c   | 346 ++++++++++++++++++++++++++++------
 src/include/storage/bufmgr.h          |  19 --
 3 files changed, 290 insertions(+), 100 deletions(-)

diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e187242..3b2432d 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -130,31 +130,6 @@ InitBufferPool(void)
 }
 
 /*
- * Initialize access to shared buffer pool
- *
- * This is called during backend startup (whether standalone or under the
- * postmaster).  It sets up for this backend's access to the already-existing
- * buffer pool.
- *
- * NB: this is called before InitProcess(), so we do not have a PGPROC and
- * cannot do LWLockAcquire; hence we can't actually access stuff in
- * shared memory yet.  We are only initializing local data here.
- * (See also InitBufferPoolBackend, over in bufmgr.c.)
- */
-void
-InitBufferPoolAccess(void)
-{
-	/*
-	 * Allocate and zero local arrays of per-buffer info.
-	 */
-	PrivateRefCount = (int32 *) calloc(NBuffers, sizeof(int32));
-	if (!PrivateRefCount)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-}
-
-/*
  * BufferShmemSize
  *
  * compute the size of shared memory for the buffer pool including
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 19eecab..113b7ed 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -86,6 +86,175 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
 
+typedef struct PrivateRefCount
+{
+	Buffer buffer;
+	int32 refcount;
+} PrivateRefCount;
+
+/* one full cacheline */
+#define REFCOUNT_ARRAY_ENTRIES 8
+
+/*
+ * Backend-Private refcount management.
+ */
+static struct PrivateRefCount PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+static HTAB *PrivateRefCountHash = NULL;
+static int32 PrivateRefCountOverflowed = 0;
+
+static PrivateRefCount* GetPrivateRefCountEntry(Buffer buffer, bool create);
+static inline int32 GetPrivateRefCount(Buffer buffer);
+static void ForgetPrivateRefCountEntry(PrivateRefCount *ref);
+
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if create = false is passed and the buffer doesn't have a
+ * PrivateRefCount entry; allocates a new PrivateRefCount entry if currently
+ * none exists and create = true is passed.
+ *
+ * When a returned refcount entry isn't used anymore it has to be forgotten,
+ * using ForgetPrivateRefCountEntry().
+ *
+ * Only works for shared buffers.
+ */
+static PrivateRefCount*
+GetPrivateRefCountEntry(Buffer buffer, bool create)
+{
+	PrivateRefCount *res;
+	PrivateRefCount *free = NULL;
+
+	int i;
+	bool found = false;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * First search for references in the array; that'll be sufficient in the
+	 * majority of cases.
+	 */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	{
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer == buffer)
+			return res;
+
+		/* Remember where to put a new refcount, should it become necessary. */
+		if (create && free == NULL && res->buffer == InvalidBuffer)
+			free = res;
+	}
+
+	/*
+	 * By here we know that the buffer, if already pinned, isn't residing in
+	 * the array.
+	 */
+
+	res = NULL;
+	found = false;
+
+	/*
+	 * Search the hashtable if entries previously overflowed into it, or
+	 * if we need to create an entry and there wasn't a free slot in the
+	 * local array.
+	 */
+	if (PrivateRefCountOverflowed > 0 || (create && free == NULL))
+	{
+		res = hash_search(PrivateRefCountHash,
+						  (void *) &buffer,
+						  (create && free == NULL)  ? HASH_ENTER : HASH_FIND,
+						  &found);
+	}
+
+	if (found)
+	{
+		/* TODO: could relocate to `free' here */
+	}
+	else if (res)
+	{
+		/* a new refcount entry, initialize */
+		PrivateRefCountOverflowed++;
+		res->refcount = 0;
+	}
+	else if (create)
+	{
+		/* buffer wasn't in the hash and we have a free array entry */
+		Assert(free != NULL);
+		free->buffer = buffer;
+		free->refcount = 0;
+		res = free;
+	}
+
+	return res;
+}
+
+/*
+ * Returns how many times the passed buffer is pinned by this backend.
+ *
+ * Only works for shared memory buffers!
+ */
+static inline int32
+GetPrivateRefCount(Buffer buffer)
+{
+	PrivateRefCount *ref;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	ref = GetPrivateRefCountEntry(buffer, false);
+
+	if (ref == NULL)
+		return 0;
+	return ref->refcount;
+}
+
+/*
+ * Stop tracking the refcount of the buffer that `ref' is tracking the
+ * refcount for. (No, there's no circularity here.)
+ */
+static void
+ForgetPrivateRefCountEntry(PrivateRefCount *ref)
+{
+	Assert(ref->refcount == 0);
+
+	if (ref >= &PrivateRefCountArray[0] &&
+		ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+	{
+		ref->buffer = InvalidBuffer;
+	}
+	else
+	{
+		bool found;
+		Buffer buffer = ref->buffer;
+		hash_search(PrivateRefCountHash,
+					(void *) &buffer,
+					HASH_REMOVE,
+					&found);
+		Assert(found);
+		Assert(PrivateRefCountOverflowed > 0);
+		PrivateRefCountOverflowed--;
+	}
+}
+
+/*
+ * BufferIsPinned
+ *		True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ *		NOTE: what we check here is that *this* backend holds a pin on
+ *		the buffer.  We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+	!BufferIsValid(bufnum) ? \
+		false \
+	: \
+		BufferIsLocal(bufnum) ? \
+			(LocalRefCount[-(bufnum) - 1] > 0) \
+		: \
+			(GetPrivateRefCount(bufnum) > 0) \
+)
+
 
 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
 				  ForkNumber forkNum, BlockNumber blockNum,
@@ -940,7 +1109,7 @@ retry:
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
 		/* safety check: should definitely not be our *own* pin */
-		if (PrivateRefCount[buf->buf_id] != 0)
+		if (GetPrivateRefCount(buf->buf_id) > 0)
 			elog(ERROR, "buffer is pinned in InvalidateBuffer");
 		WaitIO(buf);
 		goto retry;
@@ -999,7 +1168,7 @@ MarkBufferDirty(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -1046,9 +1215,9 @@ ReleaseAndReadBuffer(Buffer buffer,
 
 	if (BufferIsValid(buffer))
 	{
+		Assert(BufferIsPinned(buffer));
 		if (BufferIsLocal(buffer))
 		{
-			Assert(LocalRefCount[-buffer - 1] > 0);
 			bufHdr = &LocalBufferDescriptors[-buffer - 1];
 			if (bufHdr->tag.blockNum == blockNum &&
 				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
@@ -1059,7 +1228,6 @@ ReleaseAndReadBuffer(Buffer buffer,
 		}
 		else
 		{
-			Assert(PrivateRefCount[buffer - 1] > 0);
 			bufHdr = &BufferDescriptors[buffer - 1];
 			/* we have pin, so it's ok to examine tag without spinlock */
 			if (bufHdr->tag.blockNum == blockNum &&
@@ -1089,15 +1257,18 @@ ReleaseAndReadBuffer(Buffer buffer,
  * Note that ResourceOwnerEnlargeBuffers must have been done already.
  *
  * Returns TRUE if buffer is BM_VALID, else FALSE.	This provision allows
  * some callers to avoid an extra spinlock cycle.
  */
 static bool
 PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 {
 	int			b = buf->buf_id;
 	bool		result;
+	PrivateRefCount *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, true);
 
-	if (PrivateRefCount[b] == 0)
+	if (ref->refcount == 0)
 	{
 		LockBufHdr(buf);
 		buf->refcount++;
@@ -1119,8 +1290,9 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 		/* If we previously pinned the buffer, it must surely be valid */
 		result = true;
 	}
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 	return result;
@@ -1143,12 +1315,15 @@ static void
 PinBuffer_Locked(volatile BufferDesc *buf)
 {
 	int			b = buf->buf_id;
+	PrivateRefCount *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, true);
 
-	if (PrivateRefCount[b] == 0)
+	if (ref->refcount == 0)
 		buf->refcount++;
 	UnlockBufHdr(buf);
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 }
@@ -1165,14 +1340,18 @@ static void
 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 {
 	int			b = buf->buf_id;
+	PrivateRefCount *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, false);
+	Assert(ref != NULL);
 
 	if (fixOwner)
 		ResourceOwnerForgetBuffer(CurrentResourceOwner,
 								  BufferDescriptorGetBuffer(buf));
 
-	Assert(PrivateRefCount[b] > 0);
-	PrivateRefCount[b]--;
-	if (PrivateRefCount[b] == 0)
+	Assert(ref->refcount > 0);
+	ref->refcount--;
+	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(buf->content_lock));
@@ -1197,6 +1376,8 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 		}
 		else
 			UnlockBufHdr(buf);
+
+		ForgetPrivateRefCountEntry(ref);
 	}
 }
 
@@ -1700,36 +1881,95 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	return result | BUF_WRITTEN;
 }
 
-
 /*
- *		AtEOXact_Buffers - clean up at end of transaction.
+ * Helper Routine for AtProcExit_Buffers() and AtEOXact_Buffers().
  *
- *		As of PostgreSQL 8.0, buffer pins should get released by the
- *		ResourceOwner mechanism.  This routine is just a debugging
- *		cross-check that no pins remain.
+ * Check whether any buffer pins are still held by this backend, but only if
+ * assertions are enabled.
  */
-void
-AtEOXact_Buffers(bool isCommit)
+static void
+AssertNoPinnedBuffers(void)
 {
 #ifdef USE_ASSERT_CHECKING
 	if (assert_enabled)
 	{
 		int			RefCountErrors = 0;
-		Buffer		b;
+		PrivateRefCount *res;
+		int			i;
 
-		for (b = 1; b <= NBuffers; b++)
+		/* check the array */
+		for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
 		{
-			if (PrivateRefCount[b - 1] != 0)
+			res = &PrivateRefCountArray[i];
+
+			if (res->buffer != InvalidBuffer)
+			{
+				PrintBufferLeakWarning(res->buffer);
+				RefCountErrors++;
+			}
+		}
+
+		/* if necessary, search the hash */
+		if (PrivateRefCountOverflowed)
+		{
+			HASH_SEQ_STATUS hstat;
+			hash_seq_init(&hstat, PrivateRefCountHash);
+			while ((res = (PrivateRefCount *) hash_seq_search(&hstat)) != NULL)
 			{
-				PrintBufferLeakWarning(b);
+				PrintBufferLeakWarning(res->buffer);
 				RefCountErrors++;
 			}
+
 		}
+
 		Assert(RefCountErrors == 0);
 	}
 #endif
+}
+
+/*
+ *		AtEOXact_Buffers - clean up at end of transaction.
+ *
+ *		As of PostgreSQL 8.0, buffer pins should get released by the
+ *		ResourceOwner mechanism.  This routine is just a debugging
+ *		cross-check that no pins remain.
+ */
+void
+AtEOXact_Buffers(bool isCommit)
+{
+	AssertNoPinnedBuffers();
 
 	AtEOXact_LocalBuffers(isCommit);
+
+	Assert(PrivateRefCountOverflowed == 0);
+}
+
+/*
+ * Initialize access to shared buffer pool
+ *
+ * This is called during backend startup (whether standalone or under the
+ * postmaster).  It sets up for this backend's access to the already-existing
+ * buffer pool.
+ *
+ * NB: this is called before InitProcess(), so we do not have a PGPROC and
+ * cannot do LWLockAcquire; hence we can't actually access stuff in
+ * shared memory yet.  We are only initializing local data here.
+ * (See also InitBufferPoolBackend)
+ */
+void
+InitBufferPoolAccess(void)
+{
+	HASHCTL		hash_ctl;
+
+	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+
+	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+	hash_ctl.keysize = sizeof(int32);
+	hash_ctl.entrysize = sizeof(PrivateRefCount);
+	hash_ctl.hash = oid_hash; /* a bit more efficient than tag_hash */
+
+	PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+									  HASH_ELEM | HASH_FUNCTION);
 }
 
 /*
@@ -1757,26 +1997,12 @@ AtProcExit_Buffers(int code, Datum arg)
 	AbortBufferIO();
 	UnlockBuffers();
 
-#ifdef USE_ASSERT_CHECKING
-	if (assert_enabled)
-	{
-		int			RefCountErrors = 0;
-		Buffer		b;
-
-		for (b = 1; b <= NBuffers; b++)
-		{
-			if (PrivateRefCount[b - 1] != 0)
-			{
-				PrintBufferLeakWarning(b);
-				RefCountErrors++;
-			}
-		}
-		Assert(RefCountErrors == 0);
-	}
-#endif
+	AssertNoPinnedBuffers();
 
 	/* localbuf.c needs a chance too */
 	AtProcExit_LocalBuffers();
+
+	Assert(PrivateRefCountOverflowed == 0);
 }
 
 /*
@@ -1800,7 +2026,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	else
 	{
 		buf = &BufferDescriptors[buffer - 1];
-		loccount = PrivateRefCount[buffer - 1];
+		loccount = GetPrivateRefCount(buffer);
 		backend = InvalidBackendId;
 	}
 
@@ -2340,7 +2566,7 @@ PrintBufferDescs(void)
 			 i, buf->freeNext,
 		  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
 			 buf->tag.blockNum, buf->flags,
-			 buf->refcount, PrivateRefCount[i]);
+			 buf->refcount, GetPrivateRefCount(i));
 	}
 }
 #endif
@@ -2354,7 +2580,7 @@ PrintPinnedBufs(void)
 
 	for (i = 0; i < NBuffers; ++i, ++buf)
 	{
-		if (PrivateRefCount[i] > 0)
+		if (GetPrivateRefCount(i + 1) > 0)
 		{
 			/* theoretically we should lock the bufhdr here */
 			elog(LOG,
@@ -2363,7 +2589,7 @@ PrintPinnedBufs(void)
 				 i, buf->freeNext,
 				 relpath(buf->tag.rnode, buf->tag.forkNum),
 				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, PrivateRefCount[i]);
+				 buf->refcount, GetPrivateRefCount(i + 1));
 		}
 	}
 }
@@ -2520,6 +2746,7 @@ void
 ReleaseBuffer(Buffer buffer)
 {
 	volatile BufferDesc *bufHdr;
+	PrivateRefCount *ref;
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -2535,10 +2762,12 @@ ReleaseBuffer(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	ref = GetPrivateRefCountEntry(buffer, false);
+	Assert(ref != NULL);
+	Assert(ref->refcount > 0);
 
-	if (PrivateRefCount[buffer - 1] > 1)
-		PrivateRefCount[buffer - 1]--;
+	if (ref->refcount > 1)
+		ref->refcount--;
 	else
 		UnpinBuffer(bufHdr, false);
 }
@@ -2572,7 +2801,12 @@ IncrBufferRefCount(Buffer buffer)
 	if (BufferIsLocal(buffer))
 		LocalRefCount[-buffer - 1]++;
 	else
-		PrivateRefCount[buffer - 1]++;
+	{
+		PrivateRefCount *ref;
+		ref = GetPrivateRefCountEntry(buffer, false);
+		Assert(ref != NULL);
+		ref->refcount++;
+	}
 }
 
 /*
@@ -2606,7 +2840,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -2823,9 +3057,9 @@ LockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	if (PrivateRefCount[buffer - 1] != 1)
+	if (GetPrivateRefCount(buffer) != 1)
 		elog(ERROR, "incorrect local pin count: %d",
-			 PrivateRefCount[buffer - 1]);
+			 GetPrivateRefCount(buffer));
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
@@ -2890,7 +3124,7 @@ HoldingBufferPinThatDelaysRecovery(void)
 	if (bufid < 0)
 		return false;
 
-	if (PrivateRefCount[bufid] > 0)
+	if (GetPrivateRefCount(bufid + 1) > 0)
 		return true;
 
 	return false;
@@ -2920,8 +3154,8 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	Assert(PrivateRefCount[buffer - 1] > 0);
-	if (PrivateRefCount[buffer - 1] != 1)
+	Assert(GetPrivateRefCount(buffer) > 0);
+	if (GetPrivateRefCount(buffer) != 1)
 		return false;
 
 	/* Try to acquire lock */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..42d9120 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -55,7 +55,6 @@ extern int	target_prefetch_pages;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
-extern PGDLLIMPORT int32 *PrivateRefCount;
 
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
@@ -102,24 +101,6 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 )
 
 /*
- * BufferIsPinned
- *		True iff the buffer is pinned (also checks for valid buffer number).
- *
- *		NOTE: what we check here is that *this* backend holds a pin on
- *		the buffer.  We do not care whether some other backend does.
- */
-#define BufferIsPinned(bufnum) \
-( \
-	!BufferIsValid(bufnum) ? \
-		false \
-	: \
-		BufferIsLocal(bufnum) ? \
-			(LocalRefCount[-(bufnum) - 1] > 0) \
-		: \
-			(PrivateRefCount[(bufnum) - 1] > 0) \
-)
-
-/*
  * BufferGetBlock
  *		Returns a reference to a disk page image associated with a buffer.
  *
-- 
1.8.3.251.g1462b67

#2 Simon Riggs
simon@2ndQuadrant.com
In reply to: Andres Freund (#1)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 21 March 2014 14:22, Andres Freund <andres@2ndquadrant.com> wrote:

That seems to work fairly well. On the few tests I could run on my
laptop - I've done this during a flight - it's a small performance win
in all cases I could test, while saving a fair amount of memory.

We've got to the stage now that saving this much memory is essential,
so this patch is a must-have.

The patch does all I would expect and no more, so approach and details
look good to me.

Performance? Discussed many years ago, but I suspect the micro-tuning
of those earlier patches wasn't as good as it is here.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#3 Andres Freund
andres@2ndquadrant.com
In reply to: Simon Riggs (#2)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-04-09 05:34:42 -0400, Simon Riggs wrote:

On 21 March 2014 14:22, Andres Freund <andres@2ndquadrant.com> wrote:

That seems to work fairly well. On the few tests I could run on my
laptop - I've done this during a flight - it's a small performance win
in all cases I could test, while saving a fair amount of memory.

We've got to the stage now that saving this much memory is essential,
so this patch is a must-have.

I think some patch like this is necessary - I am not 100% sure mine is
the one true approach here, but it certainly seems simple enough.

Performance? Discussed many years ago, but I suspect the micro-tuning
of those earlier patches wasn't as good as it is here.

It's a small win on small machines (my laptop, 16GB), so we need to
retest with 128GB shared_buffers or such on bigger ones. There
PrivateRefCount previously was the source of a large portion of the
cache misses...

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#4 Robert Haas
robertmhaas@gmail.com
In reply to: Simon Riggs (#2)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On Wed, Apr 9, 2014 at 5:34 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

We've got to the stage now that saving this much memory is essential,
so this patch is a must-have.

The patch does all I would expect and no more, so approach and details
look good to me.

Performance? Discussed many years ago, but I suspect the micro-tuning
of those earlier patches wasn't as good as it is here.

I think this approach is practically a slam-dunk when the number of
pins is small (as it typically is). I'm less clear what happens when
we overflow from the small array into the hashtable. That certainly
seems like it could be a loss, but how do we construct such a case to
test it? A session with lots of suspended queries? Can we generate a
regression by starting a few suspended queries to use up the array
elements, and then running a scan that pins and unpins many buffers?

One idea is: if we fill up all the array elements and still need
another one, evict all the elements to the hash table and then start
refilling the array. The advantage of that over what's done here is
that the active scan will always be using an array slot rather than
repeated hash table manipulations. I guess you'd still have to probe
the hash table repeatedly, but you'd avoid entering and removing items
frequently.
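
A rough sketch of that idea, for illustration only (hash_spill() is an
assumed helper that enters an entry into the overflow hash table, not
an actual API):

#include <stdint.h>

typedef int32_t Buffer;
#define InvalidBuffer 0
#define REFCOUNT_ARRAY_ENTRIES 8

typedef struct
{
	Buffer		buffer;
	int32_t		refcount;
} RefCountEntry;

static RefCountEntry array_part[REFCOUNT_ARRAY_ENTRIES];

/* assumed helper: enter (buffer, refcount) into the overflow hash table */
extern void hash_spill(Buffer buffer, int32_t refcount);

/* array is full: evict everything to the hash, then refill from slot 0 */
static RefCountEntry *
new_entry_evict_all(Buffer newbuf)
{
	int			i;

	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
	{
		hash_spill(array_part[i].buffer, array_part[i].refcount);
		array_part[i].buffer = InvalidBuffer;
	}

	array_part[0].buffer = newbuf;
	array_part[0].refcount = 0;
	return &array_part[0];
}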

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#5 Andres Freund
andres@2ndquadrant.com
In reply to: Robert Haas (#4)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-04-09 08:22:15 -0400, Robert Haas wrote:

On Wed, Apr 9, 2014 at 5:34 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

We've got to the stage now that saving this much memory is essential,
so this patch is a must-have.

The patch does all I would expect and no more, so approach and details
look good to me.

Performance? Discussed many years ago, but I suspect the micro-tuning
of those earlier patches wasn't as good as it is here.

I think this approach is practically a slam-dunk when the number of
pins is small (as it typically is). I'm less clear what happens when
we overflow from the small array into the hashtable. That certainly
seems like it could be a loss, but how do we construct such a case to
test it? A session with lots of suspended queries? Can we generate a
regression by starting a few suspended queries to use up the array
elements, and then running a scan that pins and unpins many buffers?

I've tried to reproduce problems around this (when I wrote this), but
it's really hard to construct cases that need more than 8 pins. I've
tested performance for those cases by simply not using the array, and
while the performance suffers a bit, it's not that bad.

One idea is: if we fill up all the array elements and still need
another one, evict all the elements to the hash table and then start
refilling the array. The advantage of that over what's done here is
that the active scan will always be using an array slot rather than
repeated hash table manipulations. I guess you'd still have to probe
the hash table repeatedly, but you'd avoid entering and removing items
frequently.

We could do that, but my gut feeling is that it's not necessary. There'd
have to be some heuristic to avoid doing that all the time, otherwise
we'd probably regress.
I think the fact that we pin/unpin very frequently will put frequently
used pins into the array most of the time.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#6 Pavan Deolasee
pavan.deolasee@gmail.com
In reply to: Andres Freund (#5)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On Wed, Apr 9, 2014 at 6:02 PM, Andres Freund <andres@2ndquadrant.com>wrote:

I've tried to reproduce problems around this (when I wrote this), but
it's really hard to construct cases that need more than 8 pins. I've
tested performance for those cases by simply not using the array, and
while the performance suffers a bit, it's not that bad.

AFAIR this was suggested before and got rejected because constructing that
worst case and proving that the approach does not perform too badly was a
challenge. Having said that, I agree it's time to avoid that memory
allocation, especially with large number of backends running with large
shared buffers.

An orthogonal issue I noted is that we never check for overflow in the ref
count itself. While I understand overflowing an int32 counter will take a
large number of pins on the same buffer, it can still happen in the worst
case, no ? Or is there a theoretical limit on the number of pins on the
same buffer by a single backend ?

Thanks,
Pavan

--
Pavan Deolasee
http://www.linkedin.com/in/pavandeolasee

#7 Andres Freund
andres@2ndquadrant.com
In reply to: Pavan Deolasee (#6)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-04-09 18:13:29 +0530, Pavan Deolasee wrote:

On Wed, Apr 9, 2014 at 6:02 PM, Andres Freund <andres@2ndquadrant.com>wrote:

I've tried to reproduce problems around this (when I wrote this), but
it's really hard to construct cases that need more than 8 pins. I've
tested performance for those cases by simply not using the array, and
while the performance suffers a bit, it's not that bad.

AFAIR this was suggested before and got rejected because constructing that
worst case and proving that the approach does not perform too badly was a
challenge. Having said that, I agree its time to avoid that memory
allocation, especially with large number of backends running with large
shared buffers.

Well, I've tested the worst case by making *all* pins go through the
hash table. And it didn't regress too badly, although it *was* visible
in the profile.
I've searched the archive and to my knowledge nobody has actually sent a
patch implementing this sort of scheme for pins, although there's been
talk about various ways to solve this.

An orthogonal issue I noted is that we never check for overflow in the ref
count itself. While I understand overflowing an int32 counter will take a
large number of pins on the same buffer, it can still happen in the worst
case, no ? Or is there a theoretical limit on the number of pins on the
same buffer by a single backend ?

I think we'll die much earlier, because the resource owner array keeping
track of buffer pins will be larger than 1GB.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#8 Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#5)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On Wed, Apr 9, 2014 at 8:32 AM, Andres Freund <andres@2ndquadrant.com> wrote:

I've tried to reproduce problems around this (when I wrote this), but
it's really hard to construct cases that need more than 8 pins. I've
tested performance for those cases by simply not using the array, and
while the performance suffers a bit, it's not that bad.

Suspended queries won't do it?

Also, it would be good to quantify "not that bad". Actually, this
thread is completely lacking any actual benchmark results...

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#9 Andres Freund
andres@2ndquadrant.com
In reply to: Robert Haas (#8)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-04-09 09:17:59 -0400, Robert Haas wrote:

On Wed, Apr 9, 2014 at 8:32 AM, Andres Freund <andres@2ndquadrant.com> wrote:

I've tried to reproduce problems around this (when I wrote this), but
it's really hard to construct cases that need more than 8 pins. I've
tested performance for those cases by simply not using the array, and
while the performance suffers a bit, it's not that bad.

Suspended queries won't do it?

What exactly do you mean by "suspended" queries? Defined and started
portals? Recursive query execution?

Also, it would be good to quantify "not that bad".

The 'not bad' comes from my memory of the benchmarks I'd done after
about 12h of flying around ;).

Yes, it needs real benchmarks. Probably won't get to it the next few
days tho.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#10 Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#9)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On Wed, Apr 9, 2014 at 9:38 AM, Andres Freund <andres@2ndquadrant.com> wrote:

On 2014-04-09 09:17:59 -0400, Robert Haas wrote:

On Wed, Apr 9, 2014 at 8:32 AM, Andres Freund <andres@2ndquadrant.com> wrote:

I've tried to reproduce problems around this (when I wrote this), but
it's really hard to construct cases that need more than 8 pins. I've
tested performance for those cases by simply not using the array, and
while the performance suffers a bit, it's not that bad.

Suspended queries won't do it?

What exactly do you mean by "suspended" queries? Defined and started
portals? Recursive query execution?

Open a cursor and fetch from it; leave it open while doing other things.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#11 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#7)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

Andres Freund <andres@2ndquadrant.com> writes:

On 2014-04-09 18:13:29 +0530, Pavan Deolasee wrote:

An orthogonal issue I noted is that we never check for overflow in the ref
count itself. While I understand overflowing an int32 counter will take a
large number of pins on the same buffer, it can still happen in the worst
case, no ? Or is there a theoretical limit on the number of pins on the
same buffer by a single backend ?

I think we'll die much earlier, because the resource owner array keeping
track of buffer pins will be larger than 1GB.

The number of pins is bounded, more or less, by the number of scan nodes
in your query plan. You'll have run out of memory trying to plan the
query, assuming you live that long.

The resource managers are interesting to bring up in this context.
That mechanism didn't exist when PrivateRefCount was invented.
Is there a way we could lay off the work onto the resource managers?
(I don't see one right at the moment, but I'm under-caffeinated still.)

regards, tom lane


#12 Andres Freund
andres@2ndquadrant.com
In reply to: Tom Lane (#11)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-04-09 10:09:44 -0400, Tom Lane wrote:

The resource managers are interesting to bring up in this context.
That mechanism didn't exist when PrivateRefCount was invented.
Is there a way we could lay off the work onto the resource managers?
(I don't see one right at the moment, but I'm under-caffeinated still.)

Yea, that's something I've also considered, but I couldn't come up with
a performant and sensibly complicated way to do it.
There are some nasty issues with pins held by different ResourceOwners and
such, so even if we could provide sensible random access to check for
existing pins, it wouldn't be a simple thing.

It's not unreasonable to argue that we just shouldn't optimize for
several pins held by the same backend for the same buffer and always
touch the global count. Thanks to resource managers the old reason for
PrivateRefCount, which was the need to be able to clean up remaining
pins in case of error, doesn't exist anymore.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#13 Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#12)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

Andres Freund <andres@2ndquadrant.com> writes:

It's not unreasonable to argue that we just shouldn't optimize for
several pins held by the same backend for the same buffer and always
touch the global count.

NAK. That would be a killer because of increased contention for buffer
headers. The code is full of places where a buffer's PrivateRefCount
jumps up and down a bit, for example when transferring a tuple into a
TupleTableSlot. (I said upthread that the number of pins is bounded by
the number of scan nodes, but actually it's probably some small multiple
of that --- eg a seqscan would hold its own pin on the current buffer,
and there'd be a slot or two holding the current tuple, each with its
own pin count.)

regards, tom lane


#14 Andres Freund
andres@2ndquadrant.com
In reply to: Tom Lane (#13)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-04-09 10:26:25 -0400, Tom Lane wrote:

Andres Freund <andres@2ndquadrant.com> writes:

It's not unreasonable to argue that we just shouldn't optimize for
several pins held by the same backend for the same buffer and always
touch the global count.

NAK.

Note I didn't implement it because I wasn't too convinced either ;)

That would be a killer because of increased contention for buffer
headers. The code is full of places where a buffer's PrivateRefCount
jumps up and down a bit, for example when transferring a tuple into a
TupleTableSlot.

On the other hand in those scenarios the backend is pretty likely to
already have the cacheline locally in exclusive mode...

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#15 Simon Riggs
simon@2ndQuadrant.com
In reply to: Tom Lane (#11)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 9 April 2014 15:09, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Andres Freund <andres@2ndquadrant.com> writes:

On 2014-04-09 18:13:29 +0530, Pavan Deolasee wrote:

An orthogonal issue I noted is that we never check for overflow in the ref
count itself. While I understand overflowing an int32 counter will take a
large number of pins on the same buffer, it can still happen in the worst
case, no ? Or is there a theoretical limit on the number of pins on the
same buffer by a single backend ?

I think we'll die much earlier, because the resource owner array keeping
track of buffer pins will be larger than 1GB.

The number of pins is bounded, more or less, by the number of scan nodes
in your query plan. You'll have run out of memory trying to plan the
query, assuming you live that long.

ISTM that there is a strong possibility that the last buffer pinned
will be the next buffer to be unpinned. We can use that to optimise
this.

If we store the last 8 buffers pinned in the fast array then we will
be very likely to hit the right buffer just by scanning the array.

So if we treat the fast array as a circular LRU, we get
* pinning a new buffer when array has an empty slot is O(1)
* pinning a new buffer when array is full causes us to move the LRU
into the hash table and then use that element
* unpinning a buffer will most often be O(1), which then leaves an
empty slot for next pin

Doing it that way means all usage is O(1) apart from when we use >8
pins concurrently and that usage does not follow the regular pattern.

The resource managers are interesting to bring up in this context.
That mechanism didn't exist when PrivateRefCount was invented.
Is there a way we could lay off the work onto the resource managers?
(I don't see one right at the moment, but I'm under-caffeinated still.)

Me neither. Good idea, but I think it would take a lot of refactoring
to do that.

We need to do something about this. We have complaints (via Heikki)
that we are using too much memory in idle backends and small configs,
plus we know we are using too much memory in larger servers. Reducing
the memory usage here will reduce CPU L2 cache churn as well as
increase available RAM.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#16 Andres Freund
andres@2ndquadrant.com
In reply to: Simon Riggs (#15)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-06-22 12:38:04 +0100, Simon Riggs wrote:

On 9 April 2014 15:09, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Andres Freund <andres@2ndquadrant.com> writes:

On 2014-04-09 18:13:29 +0530, Pavan Deolasee wrote:

An orthogonal issue I noted is that we never check for overflow in the ref
count itself. While I understand overflowing an int32 counter will take a
large number of pins on the same buffer, it can still happen in the worst
case, no ? Or is there a theoretical limit on the number of pins on the
same buffer by a single backend ?

I think we'll die much earlier, because the resource owner array keeping
track of buffer pins will be larger than 1GB.

The number of pins is bounded, more or less, by the number of scan nodes
in your query plan. You'll have run out of memory trying to plan the
query, assuming you live that long.

ISTM that there is a strong possibility that the last buffer pinned
will be the next buffer to be unpinned. We can use that to optimise
this.

If we store the last 8 buffers pinned in the fast array then we will
be very likely to hit the right buffer just by scanning the array.

So if we treat the fast array as a circular LRU, we get
* pinning a new buffer when array has an empty slot is O(1)
* pinning a new buffer when array is full causes us to move the LRU
into the hash table and then use that element
* unpinning a buffer will most often be O(1), which then leaves an
empty slot for next pin

Doing it that way means all usage is O(1) apart from when we use >8
pins concurrently and that usage does not follow the regular pattern.

Even that case is O(1) in the average case since insertion into a
hashtable is O(1) on average...

I've started working on a patch that pretty much works like that. It
doesn't move things around in the array, because that seemed to perform
badly. That seems to make sense, because it'd require moving entries in
the relatively common case of two pages being pinned.
It moves one array entry (chosen by [someint++ % NUM_ENTRIES]) into
the hashtable and puts the new item in the now-free slot. The same
happens if a lookup hits an entry from the hashtable. It moves one
entry from the array into the hashtable and puts the entry from the
hashtable in the free slot.
That seems to work nicely, but needs some cleanup. And benchmarks.
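
In code, the displacement step would look roughly like this
(illustrative only; hash_spill() again stands in for the dynahash
insertion):

#include <stdint.h>

typedef int32_t Buffer;
#define REFCOUNT_ARRAY_ENTRIES 8

typedef struct
{
	Buffer		buffer;
	int32_t		refcount;
} RefCountEntry;

static RefCountEntry array_part[REFCOUNT_ARRAY_ENTRIES];
static uint32_t clock_hand;		/* the "someint" above */

/* assumed helper: enter (buffer, refcount) into the overflow hash table */
extern void hash_spill(Buffer buffer, int32_t refcount);

/*
 * No free array slot: displace the entry at the clock hand into the
 * hash table and give its slot to the newly pinned buffer.  Nothing
 * else moves, so the common case of a handful of concurrently pinned
 * pages stays cheap.
 */
static RefCountEntry *
displace_one(Buffer newbuf)
{
	RefCountEntry *victim = &array_part[clock_hand++ % REFCOUNT_ARRAY_ENTRIES];

	hash_spill(victim->buffer, victim->refcount);

	victim->buffer = newbuf;
	victim->refcount = 0;
	return victim;
}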

We need to do something about this. We have complaints (via Heikki)
that we are using too much memory in idle backends and small configs,
plus we know we are using too much memory in larger servers. Reducing
the memory usage here will reduce CPU L2 cache churn as well as
increase available RAM.

Yea, the buffer pin array currently is one of the biggest sources of
cache misses... In contrast to things like the buffer descriptors it's
not even shared between concurrent processes, so it's more wasteful,
even if small.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#17 Simon Riggs
simon@2ndQuadrant.com
In reply to: Andres Freund (#16)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 22 June 2014 16:09, Andres Freund <andres@2ndquadrant.com> wrote:

So if we treat the fast array as a circular LRU, we get
* pinning a new buffer when array has an empty slot is O(1)
* pinning a new buffer when array is full causes us to move the LRU
into the hash table and then use that element
* unpinning a buffer will most often be O(1), which then leaves an
empty slot for next pin

Doing it that way means all usage is O(1) apart from when we use >8
pins concurrently and that usage does not follow the regular pattern.

Even that case is O(1) in the average case since insertion into a
hashtable is O(1) on average...

I've started working on a patch that pretty much works like that. It
doesn't move things around in the array, because that seemed to perform
badly. That seems to make sense, because it'd require moving entries in
the relatively common case of two pages being pinned.
It moves one array entry (chosen by [someint++ % NUM_ENTRIES] and moves
it to the hashtable and puts the new item in the now free slot. Same
happens if a lookup hits an entry from the hashtable. It moves one
entry from the array into the hashtable and puts the entry from the
hashtable in the free slot.

Yes, that's roughly how the SLRU code works also, so sounds good.

That seems to work nicely, but needs some cleanup. And benchmarks.

ISTM that microbenchmarks won't reveal the beneficial L2 and RAM
effects of the patch, so I suggest we just need to do a pgbench, a
2-way nested join and a 10-way nested join with an objective of no
significant difference or better. The RAM and L2 effects are enough to
justify this, since it will help with both very small and very large
configs.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#18 Andres Freund
andres@2ndquadrant.com
In reply to: Simon Riggs (#17)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-06-22 19:31:34 +0100, Simon Riggs wrote:

Yes, that's roughly how the SLRU code works also, so sounds good.

Heh. I rather see that as an argument for it sounding bad :)

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services


#19 Andres Freund
andres@2ndquadrant.com
In reply to: Andres Freund (#1)
1 attachment(s)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

Hi,

On 2014-03-21 19:22:31 +0100, Andres Freund wrote:

Hi,

I've been annoyed at the amount of memory used by the backend local
PrivateRefCount array for a couple of reasons:

a) The performance impact of AtEOXact_Buffers() on Assert() enabled
builds is really, really annoying.
b) On larger nodes, the L1/2/3 cache impact of randomly accessing a
several-megabyte array at a high frequency is noticeable. I've
seen access to it be the primary (yes, really) source of
pipeline stalls.
c) On nodes with significant shared memory the sum of the per-backend
arrays is a significant amount of memory that could very well be
used more beneficially.

So what I have done in the attached proof of concept is to have a small
(8 currently) array of (buffer, pincount) that's searched linearly when
the refcount of a buffer is needed. When more than 8 buffers are pinned,
a hashtable is used to look up the values.

That seems to work fairly well. On the few tests I could run on my
laptop - I've done this during a flight - it's a small performance win
in all cases I could test, while saving a fair amount of memory.

Here's the next version of this patch. The major change is that newly
pinned/looked up buffers always go into the array, even when we're
spilling into the hashtable. To get a free slot, a preexisting entry
(chosen via PrivateRefCountArray[PrivateRefCountClock++ %
REFCOUNT_ARRAY_ENTRIES]) is displaced into the hash table. That way the
concern that frequently used buffers get 'stuck' in the hashtable while
infrequently used ones occupy the array is ameliorated.

The biggest concern previously was benchmarks. I'm not entirely sure
where to get a good testcase for this that's not completely
artificial - most simple testcases don't pin many buffers. I've played
around a bit and it's a slight performance win in pgbench read-only and
mixed workloads, but not enough to get excited about alone.

When asserts are enabled, the story is different. The admittedly extreme
case of readonly pgbench scale 350, with 6GB shared_buffers and 128
clients goes from 3204.489825 to 39277.077448 TPS. So a) above is
definitely improved :)

The memory savings are clearly visible. During a pgbench scale 350, -cj
128 readonly run, the following shell snippet (VmData is reported in kB):

for pid in $(pgrep -U andres postgres); do
    grep VmData /proc/$pid/status;
done | \
awk 'BEGIN { sum = 0 } {sum += $2;} END { if (NR > 0) print sum/NR; else print 0;print sum;print NR}'

shows:

before:
AVG: 4626.06
TOT: 619892
NR: 134

after:
AVG: 1610.37
TOT: 217400
NR: 135

So, the patch is succeeding on c).

On its own, in pgbench scale 350 -cj 128 -S -T10 the numbers are:
before:
166171.039778, 165488.531353, 165045.182215, 161492.094693 (excluding connections establishing)
after
175812.388869, 171600.928377, 168317.370893, 169860.008865 (excluding connections establishing)

so, a bit of a performance win.

-j 16, -c 16 -S -T10:
before:
159757.637878 161287.658276 164003.676018 160687.951017 162941.627683
after:
160628.774342 163981.064787 151239.151102 164763.851903 165219.220209

I'm too tired to continue with write tests now, but I don't see a
reason why they should be more meaningful... We really need a test with
more complex queries, I'm afraid.

Anyway, I think at this stage this needs somebody to closely look at the
code. I don't think there are going to be any really surprising
performance revelations here.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

0001-Make-backend-local-tracking-of-buffer-pins-memory-ef.patch (text/x-patch; charset=us-ascii)
From 83a6a860517349e5a2cf4e5034019d7f16e896ae Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 25 Aug 2014 19:23:57 +0200
Subject: [PATCH] Make backend local tracking of buffer pins memory efficient.

---
 contrib/pg_buffercache/pg_buffercache_pages.c |   2 +-
 src/backend/storage/buffer/buf_init.c         |  31 +-
 src/backend/storage/buffer/bufmgr.c           | 396 ++++++++++++++++++++++++--
 src/include/storage/bufmgr.h                  |  19 --
 4 files changed, 370 insertions(+), 78 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index b1be98f..d00f985 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -37,7 +37,7 @@ typedef struct
 	/*
 	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
 	 * being pinned by too many backends and each backend will only pin once
-	 * because of bufmgr.c's PrivateRefCount array.
+	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
 } BufferCachePagesRec;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e03394c..361054c 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -20,7 +20,6 @@
 
 BufferDesc *BufferDescriptors;
 char	   *BufferBlocks;
-int32	   *PrivateRefCount;
 
 
 /*
@@ -59,7 +58,10 @@ int32	   *PrivateRefCount;
  *		once, thus only lock the shared state once; second, when a transaction
  *		aborts, it should only unpin the buffers exactly the number of times it
  *		has pinned them, so that it will not blow away buffers of another
- *		backend.
+ *		backend.  There used to be a per backend array covering all buffers,
+ *		but that obviously implies a noticeable memory overhead that's pretty
+ *		much never required.  So we keep a small array of reference counts
+ *		that is searched linearly and overflows into a hashtable.
  */
 
 
@@ -130,31 +132,6 @@ InitBufferPool(void)
 }
 
 /*
- * Initialize access to shared buffer pool
- *
- * This is called during backend startup (whether standalone or under the
- * postmaster).  It sets up for this backend's access to the already-existing
- * buffer pool.
- *
- * NB: this is called before InitProcess(), so we do not have a PGPROC and
- * cannot do LWLockAcquire; hence we can't actually access stuff in
- * shared memory yet.  We are only initializing local data here.
- * (See also InitBufferPoolBackend, over in bufmgr.c.)
- */
-void
-InitBufferPoolAccess(void)
-{
-	/*
-	 * Allocate and zero local arrays of per-buffer info.
-	 */
-	PrivateRefCount = (int32 *) calloc(NBuffers, sizeof(int32));
-	if (!PrivateRefCount)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-}
-
-/*
  * BufferShmemSize
  *
  * compute the size of shared memory for the buffer pool including
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 938c554..11a88cb 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -65,6 +65,14 @@
 
 #define DROP_RELS_BSEARCH_THRESHOLD		20
 
+typedef struct PrivateRefCountEntry
+{
+	Buffer buffer;
+	int32 refcount;
+} PrivateRefCountEntry;
+
+#define REFCOUNT_ARRAY_ENTRIES 8	/* one full cacheline */
+
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -85,6 +93,260 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
 
+/*
+ * Backend-Private refcount management:
+ *
+ * To avoid - as we used to - using an array with NBuffers entries to keep
+ * track of local buffers, we use a small sequentially searched array
+ * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
+ * keep track of backend local pins.
+ *
+ * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
+ * all refcounts are kept track of in the array; after that, new array entries
+ * displace old ones into the hash table. That way a frequently used entry
+ * can't get "stuck" in the hashtable while infrequent ones clog the array.
+ *
+ * Note that in most scenarios the number of pinned buffers will not exceed
+ * REFCOUNT_ARRAY_ENTRIES.
+ */
+static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+static HTAB *PrivateRefCountHash = NULL;
+static int32 PrivateRefCountOverflowed = 0;
+static uint32 PrivateRefCountClock = 0;
+
+static PrivateRefCountEntry* GetPrivateRefCountEntry(Buffer buffer, bool create, bool do_move);
+static inline int32 GetPrivateRefCount(Buffer buffer);
+static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
+
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if create = false is passed and the buffer doesn't have a
+ * PrivateRefCount entry; allocates a new PrivateRefCountEntry if currently
+ * none exists and create = true is passed.
+ *
+ * If do_move is true - as required whenever create = true - the entry is
+ * optimized for frequent access.
+ *
+ * When a returned refcount entry isn't used anymore it has to be forgotten,
+ * using ForgetPrivateRefCountEntry().
+ *
+ * Only works for shared buffers.
+ */
+static PrivateRefCountEntry*
+GetPrivateRefCountEntry(Buffer buffer, bool create, bool do_move)
+{
+	PrivateRefCountEntry *res;
+	PrivateRefCountEntry *free = NULL;
+	bool		found = false;
+	int			i;
+
+	Assert(!create || do_move);
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * First search for references in the array; that'll be sufficient in the
+	 * majority of cases.
+	 */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	{
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer == buffer)
+			return res;
+
+		/* Remember where to put a new refcount, should it become necessary. */
+		if (free == NULL && res->buffer == InvalidBuffer)
+			free = res;
+	}
+
+	/*
+	 * By here we know that the buffer, if already pinned, isn't residing in
+	 * the array.
+	 */
+	res = NULL;
+	found = false;
+
+	/*
+	 * Look up the buffer in the hashtable if we've previously overflowed into
+	 * it.
+	 */
+	if (PrivateRefCountOverflowed > 0)
+	{
+		res = hash_search(PrivateRefCountHash,
+						  (void *) &buffer,
+						  HASH_FIND,
+						  &found);
+	}
+
+	if (!found && !create)
+	{
+		/* Neither array nor hash have an entry and no new entry is needed */
+		/* Neither array nor hash has an entry and no new entry is needed */
+	}
+	else if (!found && free != NULL)
+	{
+		/* add entry into the free array slot */
+		free->buffer = buffer;
+		free->refcount = 0;
+
+		return free;
+	}
+	else if (!found)
+	{
+		/*
+		 * Move entry from the current clock position in the array into the
+		 * hashtable. Use that slot.
+		 */
+		PrivateRefCountEntry *arrayent;
+		PrivateRefCountEntry *hashent;
+
+		arrayent = &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+
+		/* enter array entry into hashtable */
+		hashent = hash_search(PrivateRefCountHash,
+						  (void *) &arrayent->buffer,
+						  HASH_ENTER,
+						  &found);
+		Assert(!found);
+		hashent->refcount = arrayent->refcount;
+
+		/* fill the now free array slot */
+		arrayent->buffer = buffer;
+		arrayent->refcount = 0;
+
+		PrivateRefCountOverflowed++;
+
+		return arrayent;
+	}
+	else if (found && !do_move)
+	{
+		return res;
+	}
+	else if (found && free != NULL)
+	{
+		/* move buffer from hashtable into the free array slot */
+		free->buffer = buffer;
+		free->refcount = res->refcount;
+
+		hash_search(PrivateRefCountHash,
+					(void *) &buffer,
+					HASH_REMOVE,
+					&found);
+		Assert(found);
+		Assert(PrivateRefCountOverflowed > 0);
+		PrivateRefCountOverflowed--;
+
+		return free;
+	}
+	else if (found)
+	{
+		/*
+		 * Swap the entry in the hash table with the one in the array at the
+		 * current clock position.
+		 */
+		PrivateRefCountEntry *arrayent;
+		PrivateRefCountEntry *hashent;
+
+		arrayent = &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+
+		Assert(arrayent->buffer != InvalidBuffer);
+
+		/* enter array entry */
+		hashent = hash_search(PrivateRefCountHash,
+						  (void *) &arrayent->buffer,
+						  HASH_ENTER,
+						  &found);
+		Assert(!found);
+		hashent->refcount = arrayent->refcount;
+
+		/* fill array entry with previously searched entry */
+		arrayent->buffer = res->buffer;
+		arrayent->refcount = res->refcount;
+
+		/* and remove the old entry */
+		hash_search(PrivateRefCountHash,
+					(void *) &res->buffer,
+					HASH_REMOVE,
+					&found);
+		Assert(found);
+
+		/* PrivateRefCountOverflowed stays the same: -1 + 1 = 0 */
+
+		return arrayent;
+	}
+
+	Assert(false); /* unreachable */
+	return res;
+}
+
+/*
+ * Returns how many times the passed buffer is pinned by this backend.
+ *
+ * Only works for shared memory buffers!
+ */
+static inline int32
+GetPrivateRefCount(Buffer buffer)
+{
+	PrivateRefCountEntry *ref;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	ref = GetPrivateRefCountEntry(buffer, false, false);
+
+	if (ref == NULL)
+		return 0;
+	return ref->refcount;
+}
+
+/*
+ * Stop tracking the refcount of the buffer ref is tracking the refcount
+ * for. Nono, there's no circularity here.
+ */
+static void
+ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
+{
+	Assert(ref->refcount == 0);
+
+	if (ref >= &PrivateRefCountArray[0] &&
+		ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+	{
+		ref->buffer = InvalidBuffer;
+	}
+	else
+	{
+		bool found;
+		Buffer buffer = ref->buffer;
+		hash_search(PrivateRefCountHash,
+					(void *) &buffer,
+					HASH_REMOVE,
+					&found);
+		Assert(found);
+		Assert(PrivateRefCountOverflowed > 0);
+		PrivateRefCountOverflowed--;
+	}
+}
+
+/*
+ * BufferIsPinned
+ *		True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ *		NOTE: what we check here is that *this* backend holds a pin on
+ *		the buffer.  We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+	!BufferIsValid(bufnum) ? \
+		false \
+	: \
+		BufferIsLocal(bufnum) ? \
+			(LocalRefCount[-(bufnum) - 1] > 0) \
+		: \
+			(GetPrivateRefCount(bufnum) > 0) \
+)
+
 
 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
 				  ForkNumber forkNum, BlockNumber blockNum,
@@ -940,7 +1202,7 @@ retry:
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
 		/* safety check: should definitely not be our *own* pin */
-		if (PrivateRefCount[buf->buf_id] != 0)
+		if (GetPrivateRefCount(buf->buf_id) > 0)
 			elog(ERROR, "buffer is pinned in InvalidateBuffer");
 		WaitIO(buf);
 		goto retry;
@@ -999,7 +1261,7 @@ MarkBufferDirty(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -1046,9 +1308,9 @@ ReleaseAndReadBuffer(Buffer buffer,
 
 	if (BufferIsValid(buffer))
 	{
+		Assert(BufferIsPinned(buffer));
 		if (BufferIsLocal(buffer))
 		{
-			Assert(LocalRefCount[-buffer - 1] > 0);
 			bufHdr = &LocalBufferDescriptors[-buffer - 1];
 			if (bufHdr->tag.blockNum == blockNum &&
 				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
@@ -1059,7 +1321,6 @@ ReleaseAndReadBuffer(Buffer buffer,
 		}
 		else
 		{
-			Assert(PrivateRefCount[buffer - 1] > 0);
 			bufHdr = &BufferDescriptors[buffer - 1];
 			/* we have pin, so it's ok to examine tag without spinlock */
 			if (bufHdr->tag.blockNum == blockNum &&
@@ -1096,8 +1357,11 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 {
 	int			b = buf->buf_id;
 	bool		result;
+	PrivateRefCountEntry *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, true, true);
 
-	if (PrivateRefCount[b] == 0)
+	if (ref->refcount == 0)
 	{
 		LockBufHdr(buf);
 		buf->refcount++;
@@ -1119,8 +1383,9 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 		/* If we previously pinned the buffer, it must surely be valid */
 		result = true;
 	}
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 	return result;
@@ -1143,12 +1408,15 @@ static void
 PinBuffer_Locked(volatile BufferDesc *buf)
 {
 	int			b = buf->buf_id;
+	PrivateRefCountEntry *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, true, true);
 
-	if (PrivateRefCount[b] == 0)
+	if (ref->refcount == 0)
 		buf->refcount++;
 	UnlockBufHdr(buf);
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 }
@@ -1164,15 +1432,19 @@ PinBuffer_Locked(volatile BufferDesc *buf)
 static void
 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 {
+	PrivateRefCountEntry *ref;
 	int			b = buf->buf_id;
 
+	ref = GetPrivateRefCountEntry(b + 1, false, false);
+	Assert(ref != NULL);
+
 	if (fixOwner)
 		ResourceOwnerForgetBuffer(CurrentResourceOwner,
 								  BufferDescriptorGetBuffer(buf));
 
-	Assert(PrivateRefCount[b] > 0);
-	PrivateRefCount[b]--;
-	if (PrivateRefCount[b] == 0)
+	Assert(ref->refcount > 0);
+	ref->refcount--;
+	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(buf->content_lock));
@@ -1197,6 +1469,8 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 		}
 		else
 			UnlockBufHdr(buf);
+
+		ForgetPrivateRefCountEntry(ref);
 	}
 }
 
@@ -1702,6 +1976,10 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 
 /*
  *		AtEOXact_Buffers - clean up at end of transaction.
+ *
+ *		As of PostgreSQL 8.0, buffer pins should get released by the
+ *		ResourceOwner mechanism.  This routine is just a debugging
+ *		cross-check that no pins remain.
  */
 void
 AtEOXact_Buffers(bool isCommit)
@@ -1709,6 +1987,36 @@ AtEOXact_Buffers(bool isCommit)
 	CheckForBufferLeaks();
 
 	AtEOXact_LocalBuffers(isCommit);
+
+	Assert(PrivateRefCountOverflowed == 0);
+}
+
+/*
+ * Initialize access to shared buffer pool
+ *
+ * This is called during backend startup (whether standalone or under the
+ * postmaster).  It sets up for this backend's access to the already-existing
+ * buffer pool.
+ *
+ * NB: this is called before InitProcess(), so we do not have a PGPROC and
+ * cannot do LWLockAcquire; hence we can't actually access stuff in
+ * shared memory yet.  We are only initializing local data here.
+ * (See also InitBufferPoolBackend)
+ */
+void
+InitBufferPoolAccess(void)
+{
+	HASHCTL		hash_ctl;
+
+	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+
+	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+	hash_ctl.keysize = sizeof(int32);
+	hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
+	hash_ctl.hash = oid_hash; /* a bit more efficient than tag_hash */
+
+	PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+									  HASH_ELEM | HASH_FUNCTION);
 }
 
 /*
@@ -1754,16 +2062,34 @@ CheckForBufferLeaks(void)
 {
 #ifdef USE_ASSERT_CHECKING
 	int			RefCountErrors = 0;
-	Buffer		b;
+	PrivateRefCountEntry *res;
+	int			i;
+
+	/* check the array */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	{
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer != InvalidBuffer)
+		{
+			PrintBufferLeakWarning(res->buffer);
+			RefCountErrors++;
+		}
+	}
 
-	for (b = 1; b <= NBuffers; b++)
+	/* if necessary, search the hash */
+	if (PrivateRefCountOverflowed)
 	{
-		if (PrivateRefCount[b - 1] != 0)
+		HASH_SEQ_STATUS hstat;
+		hash_seq_init(&hstat, PrivateRefCountHash);
+		while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
 		{
-			PrintBufferLeakWarning(b);
+			PrintBufferLeakWarning(res->buffer);
 			RefCountErrors++;
 		}
+
 	}
+
 	Assert(RefCountErrors == 0);
 #endif
 }
@@ -1789,7 +2115,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	else
 	{
 		buf = &BufferDescriptors[buffer - 1];
-		loccount = PrivateRefCount[buffer - 1];
+		loccount = GetPrivateRefCount(buffer);
 		backend = InvalidBackendId;
 	}
 
@@ -2329,7 +2655,7 @@ PrintBufferDescs(void)
 			 i, buf->freeNext,
 		  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
 			 buf->tag.blockNum, buf->flags,
-			 buf->refcount, PrivateRefCount[i]);
+			 buf->refcount, GetPrivateRefCount(i));
 	}
 }
 #endif
@@ -2343,7 +2669,7 @@ PrintPinnedBufs(void)
 
 	for (i = 0; i < NBuffers; ++i, ++buf)
 	{
-		if (PrivateRefCount[i] > 0)
+		if (GetPrivateRefCount(i + 1) > 0)
 		{
 			/* theoretically we should lock the bufhdr here */
 			elog(LOG,
@@ -2352,7 +2678,7 @@ PrintPinnedBufs(void)
 				 i, buf->freeNext,
 				 relpath(buf->tag.rnode, buf->tag.forkNum),
 				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, PrivateRefCount[i]);
+				 buf->refcount, GetPrivateRefCount(i + 1));
 		}
 	}
 }
@@ -2509,6 +2835,7 @@ void
 ReleaseBuffer(Buffer buffer)
 {
 	volatile BufferDesc *bufHdr;
+	PrivateRefCountEntry *ref;
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -2524,10 +2851,12 @@ ReleaseBuffer(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	ref = GetPrivateRefCountEntry(buffer, false, false);
+	Assert(ref != NULL);
+	Assert(ref->refcount > 0);
 
-	if (PrivateRefCount[buffer - 1] > 1)
-		PrivateRefCount[buffer - 1]--;
+	if (ref->refcount > 1)
+		ref->refcount--;
 	else
 		UnpinBuffer(bufHdr, false);
 }
@@ -2561,7 +2890,12 @@ IncrBufferRefCount(Buffer buffer)
 	if (BufferIsLocal(buffer))
 		LocalRefCount[-buffer - 1]++;
 	else
-		PrivateRefCount[buffer - 1]++;
+	{
+		PrivateRefCountEntry *ref;
+		ref = GetPrivateRefCountEntry(buffer, false, true);
+		Assert(ref != NULL);
+		ref->refcount++;
+	}
 }
 
 /*
@@ -2595,7 +2929,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -2813,9 +3147,9 @@ LockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	if (PrivateRefCount[buffer - 1] != 1)
+	if (GetPrivateRefCount(buffer) != 1)
 		elog(ERROR, "incorrect local pin count: %d",
-			 PrivateRefCount[buffer - 1]);
+			 GetPrivateRefCount(buffer));
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
@@ -2880,7 +3214,7 @@ HoldingBufferPinThatDelaysRecovery(void)
 	if (bufid < 0)
 		return false;
 
-	if (PrivateRefCount[bufid] > 0)
+	if (GetPrivateRefCount(bufid + 1) > 0)
 		return true;
 
 	return false;
@@ -2910,8 +3244,8 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	Assert(PrivateRefCount[buffer - 1] > 0);
-	if (PrivateRefCount[buffer - 1] != 1)
+	Assert(GetPrivateRefCount(buffer) > 0);
+	if (GetPrivateRefCount(buffer) != 1)
 		return false;
 
 	/* Try to acquire lock */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..42d9120 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -55,7 +55,6 @@ extern int	target_prefetch_pages;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
-extern PGDLLIMPORT int32 *PrivateRefCount;
 
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
@@ -102,24 +101,6 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 )
 
 /*
- * BufferIsPinned
- *		True iff the buffer is pinned (also checks for valid buffer number).
- *
- *		NOTE: what we check here is that *this* backend holds a pin on
- *		the buffer.  We do not care whether some other backend does.
- */
-#define BufferIsPinned(bufnum) \
-( \
-	!BufferIsValid(bufnum) ? \
-		false \
-	: \
-		BufferIsLocal(bufnum) ? \
-			(LocalRefCount[-(bufnum) - 1] > 0) \
-		: \
-			(PrivateRefCount[(bufnum) - 1] > 0) \
-)
-
-/*
  * BufferGetBlock
  *		Returns a reference to a disk page image associated with a buffer.
  *
-- 
2.0.0.rc2.4.g1dc51c6.dirty

#20Jim Nasby
jim@nasby.net
In reply to: Andres Freund (#19)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 8/26/14, 6:52 PM, Andres Freund wrote:

On 2014-03-21 19:22:31 +0100, Andres Freund wrote:

Hi,

I've been annoyed at the amount of memory used by the backend local
PrivateRefCount array for a couple of reasons:

a) The performance impact of AtEOXact_Buffers() on Assert() enabled
builds is really, really annoying.
b) On larger nodes, the L1/2/3 cache impact of randomly accessing
several megabyte big array at a high frequency is noticeable. I've
seen the access to that to be the primary (yes, really) source of
pipeline stalls.
c) On nodes with significant shared_memory the sum of the per-backend
arrays is a significant amount of memory, that could very well be
used more beneficially.

So what I have done in the attached proof of concept is to have a small
(8 currently) array of (buffer, pincount) that's searched linearly when
the refcount of a buffer is needed. When more than 8 buffers are pinned
a hashtable is used to lookup the values.

That seems to work fairly well. On the few tests I could run on my
laptop - I've done this during a flight - it's a small performance win
in all cases I could test. While saving a fair amount of memory.

Here's the next version of this patch. The major change is that newly

<snip>

The memory savings are clearly visible. During a pgbench scale 350, -c/-j
128 read-only run, the following awk pipeline
for pid in $(pgrep -U andres postgres); do
    grep VmData /proc/$pid/status;
done | \
awk 'BEGIN { sum = 0 } {sum += $2;} END { if (NR > 0) print sum/NR; else print 0;print sum;print NR}'

shows (VmData, in kB):

before:
AVG: 4626.06
TOT: 619892
NR: 134

after:
AVG: 1610.37
TOT: 217400
NR: 135

These results look very encouraging, especially thinking about the cache
impact. It occurs to me that it'd also be nice to have some stats
available on how this is performing; perhaps a dtrace probe for whenever
we overflow to the hash table, and one that shows maximum usage for a
statement? (Presumably that's not much extra code or overhead...)
--
Jim C. Nasby, Data Architect jim@nasby.net
512.569.9461 (cell) http://jim.nasby.net

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#21Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#19)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On Tue, Aug 26, 2014 at 7:52 PM, Andres Freund <andres@2ndquadrant.com> wrote:

Here's the next version of this patch.

+ * much never requried. So we keep a small array of reference counts

Typo. But I think you could just drop the whole sentence about how
things used to be, especially since it's recapitulated elsewhere.

+#define REFCOUNT_ARRAY_ENTRIES 8 /* one full cacheline */

Obviously that's not always going to be the case. You could say
"about", or just drop the comment. Shouldn't "cache line" be two
words?

+ * refcounts are kept track of in the array, after that new array entries

s/, after that/; after that,/

+    if (!found && !create)
+    else if (!found && free != NULL)
+    else if (!found)
+    else if (found && !do_move)
+    else if (found && free != NULL)
+    else if (found)
+    Assert(false); /* unreachable */
+    return res;

There's not much point in testing found when you've already handled
the not-found cases. But I'd reorganize this whole thing like this:

if (!found)
{
    if (!create) { return; }
    if (free != NULL) { stuff; return; }
    stuff; return;
}
if (!do_move) { return; }
if (free != NULL) { stuff; return; }
stuff; return;

+ * Stop tracking the refcount of the buffer ref is tracking the refcount
+ * for. Nono, there's no circularity here.

Incomprehensible babble. Perhaps: "Release resources used to track
the reference count of a buffer which we no longer have pinned."

That's all I see on a first read-through. There might be other
issues, and I haven't checked through it in great detail for mundane
bugs, but generally, I favor pressing on relatively rapidly toward a
commit. It seems highly likely that this idea is a big win, and if
there's some situation in which it's a loss, we're more likely to find
out with it in the tree (and thus likely to be tested by many more
people) than by analysis from first principles.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#22Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#19)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

Andres Freund <andres@2ndquadrant.com> writes:

The biggest concern previously were some benchmarks. I'm not entirely
sure where to get a good testcase for this that's not completely
artificial - most simpler testcases don't pin many buffers.

FWIW, I think that's by design; we don't ever want to pin more than one
buffer per relation+index in use in a given query. You could certainly
build complicated queries joining many tables in order to push up the
number of pinned buffers, but whether buffer pin manipulations would be
the bottleneck in such cases is pretty dubious.

I would say that the issue most deserving of performance testing is your
sizing of the linear-search array --- it's not obvious that 8 is a good
size.

Another thing to think about: a way to get to larger numbers of pinned
buffers without especially-complex queries is to have nested queries,
such as SQL queries inside plpgsql functions inside outer queries.
Does the patch logic take any advantage of the locality-of-reference
that will occur in such scenarios?

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#23Andres Freund
andres@2ndquadrant.com
In reply to: Tom Lane (#22)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-08-26 22:19:47 -0400, Tom Lane wrote:

Andres Freund <andres@2ndquadrant.com> writes:

The biggest concern previously were some benchmarks. I'm not entirely
sure where to get a good testcase for this that's not completely
artificial - most simpler testcases don't pin many buffers.

FWIW, I think that's by design; we don't ever want to pin more than one
buffer per relation+index in use in a given query.

Right.

You could certainly
build complicated queries joining many tables in order to push up the
number of pinned buffers, but whether buffer pin manipulations would be
the bottleneck in such cases is pretty dubious.

Yea, I actually tried that and I didn't see anything.

I would say that the issue most deserving of performance testing is your
sizing of the linear-search array --- it's not obvious that 8 is a good
size.

It's about the size of a cache line on all common architectures; that's
how I found it. I don't think it makes a very big difference whether we
make it 4 or 12, but outside of that range I think it'll be unlikely to
be beneficial. The regression tests never go above three or four pins or
so currently, so I think that's a number unlikely to be crossed regularly
in practice.
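
The arithmetic, spelled out as a standalone check (struct layout copied
from the patch; Buffer is an int32, and the 64-byte line size is what's
typical on current x86/ARM rather than universal):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef int32_t Buffer;

typedef struct PrivateRefCountEntry
{
	Buffer		buffer;
	int32_t		refcount;
} PrivateRefCountEntry;

#define REFCOUNT_ARRAY_ENTRIES 8

int
main(void)
{
	/* 8 entries x 8 bytes = 64 bytes, i.e. one typical cache line */
	assert(sizeof(PrivateRefCountEntry) * REFCOUNT_ARRAY_ENTRIES == 64);
	printf("%zu bytes\n",
		   sizeof(PrivateRefCountEntry) * REFCOUNT_ARRAY_ENTRIES);
	return 0;
}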

Another thing to think about: a way to get to larger numbers of pinned
buffers without especially-complex queries is to have nested queries,
such as SQL queries inside plpgsql functions inside outer queries.

What I did was hack together a pgbench script that does a lot of
DECLARE c_01 CURSOR FOR SELECT * FROM pg_attribute WHERE ctid = '(0, 1)';
FETCH NEXT FROM c_01;

I couldn't measure a bigger slowdown for the new code than for the old
one (and that script has to be executed for every xact).

Does the patch logic take any advantage of the locality-of-reference
that will occur in such scenarios?

Yes. Whenever a buffer is pinned/unpinned that's not in the array it'll
displace an entry from the array into the hashtable. Even though the
replacement is simplistic/linear I think that should nearly always end
up with the last used buffers in the array.
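
To make that concrete, here is a minimal standalone sketch of the
scheme: a linearly searched array holds the hot entries, a clock hand
displaces a victim into an overflow table when the array is full, and
overflowed entries are moved back into the array on access. A toy
chained hash table stands in for dynahash and the names are simplified,
so this illustrates the replacement behaviour only; it is not the
patch's code.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define ARRAY_ENTRIES 8
#define NBUCKETS 64

typedef struct Node
{
	int			buffer;
	int			refcount;
	struct Node *next;
} Node;

typedef struct
{
	int			buffer;			/* 0 means "slot free" */
	int			refcount;
} Entry;

static Entry array_[ARRAY_ENTRIES];
static Node *buckets[NBUCKETS];
static unsigned clock_;

static void
hash_insert(int buffer, int refcount)
{
	Node	   *n = malloc(sizeof(Node));

	if (n == NULL)
		abort();
	n->buffer = buffer;
	n->refcount = refcount;
	n->next = buckets[buffer % NBUCKETS];
	buckets[buffer % NBUCKETS] = n;
}

/* remove an entry, returning its refcount, or -1 if it wasn't overflowed */
static int
hash_remove(int buffer)
{
	Node	  **p;

	for (p = &buckets[buffer % NBUCKETS]; *p; p = &(*p)->next)
	{
		if ((*p)->buffer == buffer)
		{
			Node	   *n = *p;
			int			rc = n->refcount;

			*p = n->next;
			free(n);
			return rc;
		}
	}
	return -1;
}

/* find or create the private entry; the result always lives in the array */
static Entry *
get_entry(int buffer)
{
	Entry	   *slot = NULL;
	int			rc;
	int			i;

	/* fast path: linear search of one cache line's worth of entries */
	for (i = 0; i < ARRAY_ENTRIES; i++)
	{
		if (array_[i].buffer == buffer)
			return &array_[i];
		if (slot == NULL && array_[i].buffer == 0)
			slot = &array_[i];
	}

	if (slot == NULL)
	{
		/* array full: displace the clock victim into the overflow table */
		slot = &array_[clock_++ % ARRAY_ENTRIES];
		hash_insert(slot->buffer, slot->refcount);
	}

	/* pull a previously overflowed entry back in, or start from zero */
	rc = hash_remove(buffer);
	slot->buffer = buffer;
	slot->refcount = (rc < 0) ? 0 : rc;
	return slot;
}

int
main(void)
{
	int			b;

	for (b = 1; b <= 20; b++)	/* pin 20 distinct buffers once each */
		get_entry(b)->refcount++;
	assert(get_entry(20)->refcount == 1);	/* recently used: array hit */
	assert(get_entry(1)->refcount == 1);	/* long unused: fetched back */
	printf("ok\n");
	return 0;
}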

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#24Andres Freund
andres@2ndquadrant.com
In reply to: Robert Haas (#21)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-08-26 22:04:03 -0400, Robert Haas wrote:

On Tue, Aug 26, 2014 at 7:52 PM, Andres Freund <andres@2ndquadrant.com> wrote:

Here's the next version of this patch.

+ * much never requried. So we keep a small array of reference counts

Typo. But I think you could just drop the whole sentence about how
things used to be, especially since it's recapitulated elsewhere.

Ok. I actually wonder about chucking out the whole explanation in
buf_init.c. There's been something there historically, but it's not
really a better place than just keeping everything in bufmgr.c.

+#define REFCOUNT_ARRAY_ENTRIES 8 /* one full cacheline */

Obviously that's not always going to be the case. You could say
"about", or just drop the comment. Shouldn't "cache line" be two
words?

Ok, will make it /* one cache line in common architectures */ - I want
the reasoning for the current size somewhere...

+ * refcounts are kept track of in the array, after that new array entries

s/, after that/; after that,/

+    if (!found && !create)
+    else if (!found && free != NULL)
+    else if (!found)
+    else if (found && !do_move)
+    else if (found && free != NULL)
+    else if (found)
+    Assert(false); /* unreachable */
+    return res;

There's not much point in testing found when you've already handled
the not-found cases. But I'd reorganize this whole thing like this:

if (!found) { if (!create) { return; } if (free != NULL) { stuff;
return }; stuff; return; }
if (!do_move) { return; } if (free != NULL) { stuff; return; } stuff; return;

The current if () ... isn't particularly nice, I agree.

That's all I see on a first-read through. There might be other
issues, and I haven't checked through it in great detail for mundane
bugs, but generally, I favor pressing on relatively rapidly toward a
commit. It seems highly likely that this idea is a big win, and if
there's some situation in which it's a loss, we're more likely to find
out with it in the tree (and thus likely to be tested by many more
people) than by analysis from first principles.

I agree. As long as people are happy with the approach I think we can
iron out performance edge cases later.

I'll try to send a cleaned up version soon. I'm currently wondering
about adding some minimal regression test coverage for this. What I have
right now is stuff like
DECLARE c_01 CURSOR FOR SELECT * FROM pg_attribute WHERE ctid = '(0, 1)';
DECLARE c_02 CURSOR FOR SELECT * FROM pg_attribute WHERE ctid = '(1, 1)';
...
FETCH NEXT FROM c_01;
FETCH NEXT FROM c_02;
...
CLOSE c_01;
...

While that provides some coverage, I'm unconvinced that it's appropriate
for the regression tests?

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#25Andres Freund
andres@2ndquadrant.com
In reply to: Jim Nasby (#20)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-08-26 20:44:32 -0500, Jim Nasby wrote:

These results look very encouraging, especially thinking about the
cache impact.

Yep. I've seen PrivateRefCount array accesses show up prominently as a
source of cache misses on big servers.

It occurs to me that it'd also be nice to have some
stats available on how this is performing; perhaps a dtrace probe for
whenever we overflow to the hash table, and one that shows maximum
usage for a statement? (Presumably that's not much extra code or
overhead...)

I don't use dtrace, so *I* won't do that. Personally I just dynamically
add probes using "perf probe" when I need to track something like this.
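
For illustration, a minimal sketch of the kind of anchor that suits that
workflow: a deliberately non-inlined function called at the event of
interest, so "perf probe" has a stable symbol to attach to. The names
and the probe site below are invented; the patch itself adds nothing of
the sort.

#include <stdio.h>

/*
 * Empty, never-inlined function: it exists purely so that "perf probe"
 * (or a debugger) can hook the overflow event by symbol name.  The asm
 * barrier keeps the compiler from optimizing the call away.
 */
__attribute__((noinline))
static void
refcount_overflow_probe(int buffer)
{
	(void) buffer;
	__asm__ volatile("" ::: "memory");
}

static int	overflowed;			/* stand-in for PrivateRefCountOverflowed */

static void
overflow_one_pin(int buffer)
{
	overflowed++;
	refcount_overflow_probe(buffer);	/* one cheap call per overflow */
}

int
main(void)
{
	overflow_one_pin(42);
	printf("overflowed: %d\n", overflowed);
	return 0;
}

Something like "perf probe -x ./demo refcount_overflow_probe" then makes
the event recordable with perf record/stat, without recompiling.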

I don't see how you could track maximum usage without more
complications/slowdowns than warranted.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#26Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Andres Freund (#23)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

Andres Freund wrote:

On 2014-08-26 22:19:47 -0400, Tom Lane wrote:

Andres Freund <andres@2ndquadrant.com> writes:

I would say that the issue most deserving of performance testing is your
sizing of the linear-search array --- it's not obvious that 8 is a good
size.

It's about the size of a cache line on all common architectures; that's
how I found it. I don't think it makes a very big difference whether we
make it 4 or 12, but outside of that range I think it'll be unlikely to
be beneficial. The regression tests never go above three or four pins or
so currently, so I think that's a number unlikely to be crossed regularly
in practice.

FWIW scanning a minmax index will keep three pages pinned IIRC
(metapage, current revmap page, current regular page).

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#27Jim Nasby
jim@nasby.net
In reply to: Andres Freund (#25)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 8/27/14, 1:38 AM, Andres Freund wrote:

It occurs to me that it'd also be nice to have some stats available on
how this is performing; perhaps a dtrace probe for whenever we overflow
to the hash table, and one that shows maximum usage for a statement?
(Presumably that's not much extra code or overhead...)

I don't use dtrace, so *I* won't do that. Personally I just dynamically
add probes using "perf probe" when I need to track something like this.

Yeah, I didn't mean dtrace directly; don't we have some macro that
equates to dtrace or perf-probe depending on architecture?
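
For reference, the macros in question are presumably the
TRACE_POSTGRESQL_* probe points generated from src/backend/utils/probes.d;
with --enable-dtrace they become dtrace/systemtap markers, otherwise they
expand to nothing. A self-contained sketch of that compile-time pattern,
with an invented probe name rather than anything from probes.d:

#include <stdio.h>

/* the pattern: probe macros vanish entirely in non-instrumented builds */
#ifdef ENABLE_PROBES
#define TRACE_PIN_OVERFLOW(buf) \
	fprintf(stderr, "pin overflow: buffer %d\n", (buf))
#else
#define TRACE_PIN_OVERFLOW(buf) ((void) 0)
#endif

int
main(void)
{
	TRACE_PIN_OVERFLOW(42);		/* compiles to nothing by default */
	return 0;
}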

I don't see how you could track maximum usage without more
complications/slowdowns than warranted.

I was thinking we'd only show maximum if we overflowed, but maybe it's
still too much overhead in that case.

In any case, I was thinking this would be trivial to add now, but if
it's not then someone can do it when there's actual need.
--
Jim C. Nasby, Data Architect jim@nasby.net
512.569.9461 (cell) http://jim.nasby.net

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#28Andres Freund
andres@2ndquadrant.com
In reply to: Robert Haas (#21)
1 attachment(s)
Re: [RFC, POC] Don't require a NBuffer sized PrivateRefCount array of local buffer pins

On 2014-08-26 22:04:03 -0400, Robert Haas wrote:

That's all I see on a first read-through.

I think I fixed all of them. Thanks.

There might be other
issues, and I haven't checked through it in great detail for mundane
bugs, but generally, I favor pressing on relatively rapidly toward a
commit. It seems highly likely that this idea is a big win, and if
there's some situation in which it's a loss, we're more likely to find
out with it in the tree (and thus likely to be tested by many more
people) than by analysis from first principles.

Attached is the version I plan to commit after going over it again
tomorrow.

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

0001-Make-backend-local-tracking-of-buffer-pins-memory-ef.patchtext/x-patch; charset=us-asciiDownload
>From b48a28eee3518548740525243d1a165720a67231 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 29 Aug 2014 23:31:37 +0200
Subject: [PATCH] Make backend local tracking of buffer pins memory efficient.

Since the dawn of time (aka Postgres95) multiple pins of the same
buffer by one backend have been optimized not to modify the shared
refcount more than once. This optimization has always used an
NBuffers-sized array in each backend to keep track of a backend's pins.

That array (PrivateRefCount) was one of the biggest per-backend memory
allocations, depending on the shared_buffers setting. Besides the
waste of memory it has also proven to be a performance bottleneck when
assertions are enabled, as we make sure that there are no pins
remaining at the end of transactions. Also, on servers with lots of
memory and a correspondingly high shared_buffers setting, the number of
random memory accesses can lead to poor CPU cache efficiency.

For these reasons a backend's buffer pins are now kept track of in a
small, statically sized array that overflows into a hash table
when necessary. Benchmarks have shown neutral to positive performance
results with considerably lower memory usage.

Patch by me, review by Robert Haas.
---
 contrib/pg_buffercache/pg_buffercache_pages.c |   2 +-
 src/backend/storage/buffer/buf_init.c         |  39 +--
 src/backend/storage/buffer/bufmgr.c           | 418 ++++++++++++++++++++++++--
 src/include/storage/bufmgr.h                  |  19 --
 4 files changed, 391 insertions(+), 87 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index b1be98f..d00f985 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -37,7 +37,7 @@ typedef struct
 	/*
 	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
 	 * being pinned by too many backends and each backend will only pin once
-	 * because of bufmgr.c's PrivateRefCount array.
+	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
 } BufferCachePagesRec;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e03394c..ff6c713 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -20,7 +20,6 @@
 
 BufferDesc *BufferDescriptors;
 char	   *BufferBlocks;
-int32	   *PrivateRefCount;
 
 
 /*
@@ -50,16 +49,9 @@ int32	   *PrivateRefCount;
  *
  * refcount --	Counts the number of processes holding pins on a buffer.
  *		A buffer is pinned during IO and immediately after a BufferAlloc().
- *		Pins must be released before end of transaction.
- *
- * PrivateRefCount -- Each buffer also has a private refcount that keeps
- *		track of the number of times the buffer is pinned in the current
- *		process.    This is used for two purposes: first, if we pin a
- *		a buffer more than once, we only need to change the shared refcount
- *		once, thus only lock the shared state once; second, when a transaction
- *		aborts, it should only unpin the buffers exactly the number of times it
- *		has pinned them, so that it will not blow away buffers of another
- *		backend.
+ *		Pins must be released before end of transaction.  For efficiency the
+ *		shared refcount isn't increased if an individual backend pins a buffer
+ *		multiple times. Check the PrivateRefCount infrastructure in bufmgr.c.
  */
 
 
@@ -130,31 +122,6 @@ InitBufferPool(void)
 }
 
 /*
- * Initialize access to shared buffer pool
- *
- * This is called during backend startup (whether standalone or under the
- * postmaster).  It sets up for this backend's access to the already-existing
- * buffer pool.
- *
- * NB: this is called before InitProcess(), so we do not have a PGPROC and
- * cannot do LWLockAcquire; hence we can't actually access stuff in
- * shared memory yet.  We are only initializing local data here.
- * (See also InitBufferPoolBackend, over in bufmgr.c.)
- */
-void
-InitBufferPoolAccess(void)
-{
-	/*
-	 * Allocate and zero local arrays of per-buffer info.
-	 */
-	PrivateRefCount = (int32 *) calloc(NBuffers, sizeof(int32));
-	if (!PrivateRefCount)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-}
-
-/*
  * BufferShmemSize
  *
  * compute the size of shared memory for the buffer pool including
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 938c554..3240432 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -65,6 +65,15 @@
 
 #define DROP_RELS_BSEARCH_THRESHOLD		20
 
+typedef struct PrivateRefCountEntry
+{
+	Buffer buffer;
+	int32 refcount;
+} PrivateRefCountEntry;
+
+/* 64 bytes, about the size of a cache line on common systems */
+#define REFCOUNT_ARRAY_ENTRIES 8
+
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -85,6 +94,281 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
 
+/*
+ * Backend-Private refcount management:
+ *
+ * Each buffer also has a private refcount that keeps track of the number of
+ * times the buffer is pinned in the current process.  This is so that the
+ * shared refcount needs to be modified only once if a buffer is pinned more
+ * than once by a individual backend.  It's also used to check that no buffers
+ * are still pinned at the end of transactions and when exiting.
+ *
+ *
+ * To avoid - as we used to - requiring an array with NBuffers entries to keep
+ * track of local buffers, we use a small sequentially searched array
+ * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
+ * keep track of backend local pins.
+ *
+ * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
+ * all
+ * refcounts are kept track of in the array; after that, new array entries
+ * displace old ones into the hash table. That way a frequently used entry
+ * can't get "stuck" in the hashtable while infrequent ones clog the array.
+ *
+ * Note that in most scenarios the number of pinned buffers will not exceed
+ * REFCOUNT_ARRAY_ENTRIES.
+ */
+static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+static HTAB *PrivateRefCountHash = NULL;
+static int32 PrivateRefCountOverflowed = 0;
+static uint32 PrivateRefCountClock = 0;
+
+static PrivateRefCountEntry* GetPrivateRefCountEntry(Buffer buffer, bool create, bool do_move);
+static inline int32 GetPrivateRefCount(Buffer buffer);
+static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
+
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if create = false is passed and the buffer doesn't have a
+ * PrivateRefCount entry; allocates a new PrivateRefCountEntry if currently
+ * none exists and create = true is passed.
+ *
+ * If do_move is true, the entry is optimized for frequent access, i.e.
+ * moved into the array if it currently resides in the hashtable.  Passing
+ * do_move = false is only allowed together with create = false.
+ *
+ * When a returned refcount entry isn't used anymore it has to be forgotten,
+ * using ForgetPrivateRefCountEntry().
+ *
+ * Only works for shared buffers.
+ */
+static PrivateRefCountEntry*
+GetPrivateRefCountEntry(Buffer buffer, bool create, bool do_move)
+{
+	PrivateRefCountEntry *res;
+	PrivateRefCountEntry *free = NULL;
+	bool		found = false;
+	int			i;
+
+	Assert(!create || do_move);
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * First search for references in the array; that'll be sufficient in the
+	 * majority of cases.
+	 */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	{
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer == buffer)
+			return res;
+
+		/* Remember where to put a new refcount, should it become necessary. */
+		if (free == NULL && res->buffer == InvalidBuffer)
+			free = res;
+	}
+
+	/*
+	 * By here we know that the buffer, if already pinned, isn't residing in
+	 * the array.
+	 */
+	res = NULL;
+	found = false;
+
+	/*
+	 * Look up the buffer in the hashtable if we've previously overflowed into
+	 * it.
+	 */
+	if (PrivateRefCountOverflowed > 0)
+	{
+		res = hash_search(PrivateRefCountHash,
+						  (void *) &buffer,
+						  HASH_FIND,
+						  &found);
+	}
+
+	if (!found)
+	{
+		if (!create)
+		{
+			/* Neither the array nor the hash has an entry, and none is needed */
+			return NULL;
+		}
+		else if (free != NULL)
+		{
+			/* add entry into the free array slot */
+			free->buffer = buffer;
+			free->refcount = 0;
+
+			return free;
+		}
+		else
+		{
+			/*
+			 * Move entry from the current clock position in the array into the
+			 * hashtable. Use that slot.
+			 */
+			PrivateRefCountEntry *arrayent;
+			PrivateRefCountEntry *hashent;
+
+			/* select victim slot */
+			arrayent = &PrivateRefCountArray[
+				PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+			Assert(arrayent->buffer != InvalidBuffer);
+
+			/* enter victim array entry into hashtable */
+			hashent = hash_search(PrivateRefCountHash,
+								  (void *) &arrayent->buffer,
+								  HASH_ENTER,
+								  &found);
+			Assert(!found);
+			hashent->refcount = arrayent->refcount;
+
+			/* fill the now free array slot */
+			arrayent->buffer = buffer;
+			arrayent->refcount = 0;
+
+			PrivateRefCountOverflowed++;
+
+			return arrayent;
+		}
+	}
+	else
+	{
+		if (!do_move)
+		{
+			return res;
+		}
+		else if (free != NULL)
+		{
+			/* move buffer from hashtable into the free array slot */
+
+			/* fill array slot */
+			free->buffer = buffer;
+			free->refcount = res->refcount;
+
+			/* delete from hashtable */
+			hash_search(PrivateRefCountHash,
+						(void *) &buffer,
+						HASH_REMOVE,
+						&found);
+			Assert(found);
+			Assert(PrivateRefCountOverflowed > 0);
+			PrivateRefCountOverflowed--;
+
+			return free;
+		}
+		else
+		{
+			/*
+			 * Swap the entry in the hash table with the one in the array at the
+			 * current clock position.
+			 */
+			PrivateRefCountEntry *arrayent;
+			PrivateRefCountEntry *hashent;
+
+			/* select victim slot */
+			arrayent = &PrivateRefCountArray[
+				PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+			Assert(arrayent->buffer != InvalidBuffer);
+
+			/* enter victim entry into the hashtable */
+			hashent = hash_search(PrivateRefCountHash,
+								  (void *) &arrayent->buffer,
+								  HASH_ENTER,
+								  &found);
+			Assert(!found);
+			hashent->refcount = arrayent->refcount;
+
+			/* fill now free array entry with previously searched entry */
+			arrayent->buffer = res->buffer;
+			arrayent->refcount = res->refcount;
+
+			/* and remove the old entry */
+			hash_search(PrivateRefCountHash,
+						(void *) &arrayent->buffer,
+						HASH_REMOVE,
+						&found);
+			Assert(found);
+
+			/* PrivateRefCountOverflowed stays the same: -1 + 1 = 0 */
+
+			return arrayent;
+		}
+	}
+
+	Assert(false); /* unreachable */
+	return NULL;
+}
+
+/*
+ * Returns how many times the passed buffer is pinned by this backend.
+ *
+ * Only works for shared memory buffers!
+ */
+static inline int32
+GetPrivateRefCount(Buffer buffer)
+{
+	PrivateRefCountEntry *ref;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	ref = GetPrivateRefCountEntry(buffer, false, false);
+
+	if (ref == NULL)
+		return 0;
+	return ref->refcount;
+}
+
+/*
+ * Release resources used to track the reference count of a buffer which we no
+ * longer have pinned and don't want to pin again immediately.
+ */
+static void
+ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
+{
+	Assert(ref->refcount == 0);
+
+	if (ref >= &PrivateRefCountArray[0] &&
+		ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+	{
+		ref->buffer = InvalidBuffer;
+	}
+	else
+	{
+		bool found;
+		Buffer buffer = ref->buffer;
+		hash_search(PrivateRefCountHash,
+					(void *) &buffer,
+					HASH_REMOVE,
+					&found);
+		Assert(found);
+		Assert(PrivateRefCountOverflowed > 0);
+		PrivateRefCountOverflowed--;
+	}
+}
+
+/*
+ * BufferIsPinned
+ *		True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ *		NOTE: what we check here is that *this* backend holds a pin on
+ *		the buffer.  We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+	!BufferIsValid(bufnum) ? \
+		false \
+	: \
+		BufferIsLocal(bufnum) ? \
+			(LocalRefCount[-(bufnum) - 1] > 0) \
+		: \
+			(GetPrivateRefCount(bufnum) > 0) \
+)
+
 
 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
 				  ForkNumber forkNum, BlockNumber blockNum,
@@ -940,7 +1224,7 @@ retry:
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
 		/* safety check: should definitely not be our *own* pin */
-		if (PrivateRefCount[buf->buf_id] != 0)
+		if (GetPrivateRefCount(buf->buf_id) > 0)
 			elog(ERROR, "buffer is pinned in InvalidateBuffer");
 		WaitIO(buf);
 		goto retry;
@@ -999,7 +1283,7 @@ MarkBufferDirty(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -1046,9 +1330,9 @@ ReleaseAndReadBuffer(Buffer buffer,
 
 	if (BufferIsValid(buffer))
 	{
+		Assert(BufferIsPinned(buffer));
 		if (BufferIsLocal(buffer))
 		{
-			Assert(LocalRefCount[-buffer - 1] > 0);
 			bufHdr = &LocalBufferDescriptors[-buffer - 1];
 			if (bufHdr->tag.blockNum == blockNum &&
 				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
@@ -1059,7 +1343,6 @@ ReleaseAndReadBuffer(Buffer buffer,
 		}
 		else
 		{
-			Assert(PrivateRefCount[buffer - 1] > 0);
 			bufHdr = &BufferDescriptors[buffer - 1];
 			/* we have pin, so it's ok to examine tag without spinlock */
 			if (bufHdr->tag.blockNum == blockNum &&
@@ -1096,8 +1379,11 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 {
 	int			b = buf->buf_id;
 	bool		result;
+	PrivateRefCountEntry *ref;
 
-	if (PrivateRefCount[b] == 0)
+	ref = GetPrivateRefCountEntry(b + 1, true, true);
+
+	if (ref->refcount == 0)
 	{
 		LockBufHdr(buf);
 		buf->refcount++;
@@ -1119,8 +1405,9 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 		/* If we previously pinned the buffer, it must surely be valid */
 		result = true;
 	}
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 	return result;
@@ -1143,12 +1430,15 @@ static void
 PinBuffer_Locked(volatile BufferDesc *buf)
 {
 	int			b = buf->buf_id;
+	PrivateRefCountEntry *ref;
+
+	ref = GetPrivateRefCountEntry(b + 1, true, true);
 
-	if (PrivateRefCount[b] == 0)
+	if (ref->refcount == 0)
 		buf->refcount++;
 	UnlockBufHdr(buf);
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 }
@@ -1164,15 +1454,19 @@ PinBuffer_Locked(volatile BufferDesc *buf)
 static void
 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 {
+	PrivateRefCountEntry *ref;
 	int			b = buf->buf_id;
 
+	ref = GetPrivateRefCountEntry(b + 1, false, false);
+	Assert(ref != NULL);
+
 	if (fixOwner)
 		ResourceOwnerForgetBuffer(CurrentResourceOwner,
 								  BufferDescriptorGetBuffer(buf));
 
-	Assert(PrivateRefCount[b] > 0);
-	PrivateRefCount[b]--;
-	if (PrivateRefCount[b] == 0)
+	Assert(ref->refcount > 0);
+	ref->refcount--;
+	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(buf->content_lock));
@@ -1197,6 +1491,8 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 		}
 		else
 			UnlockBufHdr(buf);
+
+		ForgetPrivateRefCountEntry(ref);
 	}
 }
 
@@ -1702,6 +1998,10 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 
 /*
  *		AtEOXact_Buffers - clean up at end of transaction.
+ *
+ *		As of PostgreSQL 8.0, buffer pins should get released by the
+ *		ResourceOwner mechanism.  This routine is just a debugging
+ *		cross-check that no pins remain.
  */
 void
 AtEOXact_Buffers(bool isCommit)
@@ -1709,6 +2009,36 @@ AtEOXact_Buffers(bool isCommit)
 	CheckForBufferLeaks();
 
 	AtEOXact_LocalBuffers(isCommit);
+
+	Assert(PrivateRefCountOverflowed == 0);
+}
+
+/*
+ * Initialize access to shared buffer pool
+ *
+ * This is called during backend startup (whether standalone or under the
+ * postmaster).  It sets up for this backend's access to the already-existing
+ * buffer pool.
+ *
+ * NB: this is called before InitProcess(), so we do not have a PGPROC and
+ * cannot do LWLockAcquire; hence we can't actually access stuff in
+ * shared memory yet.  We are only initializing local data here.
+ * (See also InitBufferPoolBackend)
+ */
+void
+InitBufferPoolAccess(void)
+{
+	HASHCTL		hash_ctl;
+
+	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+
+	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+	hash_ctl.keysize = sizeof(int32);
+	hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
+	hash_ctl.hash = oid_hash; /* a bit more efficient than tag_hash */
+
+	PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+									  HASH_ELEM | HASH_FUNCTION);
 }
 
 /*
@@ -1754,16 +2084,34 @@ CheckForBufferLeaks(void)
 {
 #ifdef USE_ASSERT_CHECKING
 	int			RefCountErrors = 0;
-	Buffer		b;
+	PrivateRefCountEntry *res;
+	int			i;
+
+	/* check the array */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	{
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer != InvalidBuffer)
+		{
+			PrintBufferLeakWarning(res->buffer);
+			RefCountErrors++;
+		}
+	}
 
-	for (b = 1; b <= NBuffers; b++)
+	/* if necessary, search the hash */
+	if (PrivateRefCountOverflowed)
 	{
-		if (PrivateRefCount[b - 1] != 0)
+		HASH_SEQ_STATUS hstat;
+		hash_seq_init(&hstat, PrivateRefCountHash);
+		while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
 		{
-			PrintBufferLeakWarning(b);
+			PrintBufferLeakWarning(res->buffer);
 			RefCountErrors++;
 		}
+
 	}
+
 	Assert(RefCountErrors == 0);
 #endif
 }
@@ -1789,7 +2137,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	else
 	{
 		buf = &BufferDescriptors[buffer - 1];
-		loccount = PrivateRefCount[buffer - 1];
+		loccount = GetPrivateRefCount(buffer);
 		backend = InvalidBackendId;
 	}
 
@@ -2329,7 +2677,7 @@ PrintBufferDescs(void)
 			 i, buf->freeNext,
 		  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
 			 buf->tag.blockNum, buf->flags,
-			 buf->refcount, PrivateRefCount[i]);
+			 buf->refcount, GetPrivateRefCount(i));
 	}
 }
 #endif
@@ -2343,7 +2691,7 @@ PrintPinnedBufs(void)
 
 	for (i = 0; i < NBuffers; ++i, ++buf)
 	{
-		if (PrivateRefCount[i] > 0)
+		if (GetPrivateRefCount(i + 1) > 0)
 		{
 			/* theoretically we should lock the bufhdr here */
 			elog(LOG,
@@ -2352,7 +2700,7 @@ PrintPinnedBufs(void)
 				 i, buf->freeNext,
 				 relpath(buf->tag.rnode, buf->tag.forkNum),
 				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, PrivateRefCount[i]);
+				 buf->refcount, GetPrivateRefCount(i + 1));
 		}
 	}
 }
@@ -2509,6 +2857,7 @@ void
 ReleaseBuffer(Buffer buffer)
 {
 	volatile BufferDesc *bufHdr;
+	PrivateRefCountEntry *ref;
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -2524,10 +2873,12 @@ ReleaseBuffer(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	ref = GetPrivateRefCountEntry(buffer, false, false);
+	Assert(ref != NULL);
+	Assert(ref->refcount > 0);
 
-	if (PrivateRefCount[buffer - 1] > 1)
-		PrivateRefCount[buffer - 1]--;
+	if (ref->refcount > 1)
+		ref->refcount--;
 	else
 		UnpinBuffer(bufHdr, false);
 }
@@ -2561,7 +2912,12 @@ IncrBufferRefCount(Buffer buffer)
 	if (BufferIsLocal(buffer))
 		LocalRefCount[-buffer - 1]++;
 	else
-		PrivateRefCount[buffer - 1]++;
+	{
+		PrivateRefCountEntry *ref;
+		ref = GetPrivateRefCountEntry(buffer, false, true);
+		Assert(ref != NULL);
+		ref->refcount++;
+	}
 }
 
 /*
@@ -2595,7 +2951,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
@@ -2813,9 +3169,9 @@ LockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	if (PrivateRefCount[buffer - 1] != 1)
+	if (GetPrivateRefCount(buffer) != 1)
 		elog(ERROR, "incorrect local pin count: %d",
-			 PrivateRefCount[buffer - 1]);
+			 GetPrivateRefCount(buffer));
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
@@ -2880,7 +3236,7 @@ HoldingBufferPinThatDelaysRecovery(void)
 	if (bufid < 0)
 		return false;
 
-	if (PrivateRefCount[bufid] > 0)
+	if (GetPrivateRefCount(bufid + 1) > 0)
 		return true;
 
 	return false;
@@ -2910,8 +3266,8 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	Assert(PrivateRefCount[buffer - 1] > 0);
-	if (PrivateRefCount[buffer - 1] != 1)
+	Assert(GetPrivateRefCount(buffer) > 0);
+	if (GetPrivateRefCount(buffer) != 1)
 		return false;
 
 	/* Try to acquire lock */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..42d9120 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -55,7 +55,6 @@ extern int	target_prefetch_pages;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
-extern PGDLLIMPORT int32 *PrivateRefCount;
 
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
@@ -102,24 +101,6 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 )
 
 /*
- * BufferIsPinned
- *		True iff the buffer is pinned (also checks for valid buffer number).
- *
- *		NOTE: what we check here is that *this* backend holds a pin on
- *		the buffer.  We do not care whether some other backend does.
- */
-#define BufferIsPinned(bufnum) \
-( \
-	!BufferIsValid(bufnum) ? \
-		false \
-	: \
-		BufferIsLocal(bufnum) ? \
-			(LocalRefCount[-(bufnum) - 1] > 0) \
-		: \
-			(PrivateRefCount[(bufnum) - 1] > 0) \
-)
-
-/*
  * BufferGetBlock
  *		Returns a reference to a disk page image associated with a buffer.
  *
-- 
2.0.0.rc2.4.g1dc51c6.dirty