Move PinBuffer and UnpinBuffer to atomics

Started by YUriy Zhuravlev over 10 years ago · 171 messages
#1 YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
1 attachment(s)

Hello hackers!

Continuing the theme: /messages/by-id/3368228.mTSz6V0Jsq@dinodell

This time, we rewrote 'refcount' and 'usage_count' to atomics in
PinBuffer and UnpinBuffer (but kept the lock for the buffer flags in Unpin).

At the same time this doesn't affect the correctness of the buffer manager,
because those variables already have an LWLock on top of them (for the
hashtable partition). If someone pinned the buffer after the call to
StrategyGetBuffer, we just try again (in BufferAlloc). There is also one more
check in the code before deleting the old buffer, where the changes can be
rolled back. The other functions that check 'refcount' and 'usage_count' take
exclusive locks.
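
The core of the change, condensed from the attached patch (a sketch of the
new PinBuffer logic, not a drop-in function):

    /* refcount no longer needs the buffer-header spinlock */
    pg_atomic_add_fetch_u32(&buf->refcount, 1);

    /* usage_count is capped, so "if (< MAX) ++" becomes a CAS loop */
    uint32 expect = pg_atomic_read_u32(&buf->usage_count);
    while (expect < BM_MAX_USAGE_COUNT)
    {
        if (pg_atomic_compare_exchange_u32(&buf->usage_count,
                                           &expect, expect + 1))
            break;      /* on failure, the CAS reloads 'expect' for us */
    }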

Also, a stress test with 256 KB of shared memory completed successfully.

Without the patch we get 417523 TPS and with the patch 965821 TPS on a big x86 server.
All details here: https://gist.github.com/stalkerg/773a81b79a27b4d5d63f

Thank you.
--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

atomic_bufmgr_v5.patch (text/x-patch; charset=utf-8)
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 6622d22..50ca2a5 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -33,14 +33,14 @@ typedef struct
 	BlockNumber blocknum;
 	bool		isvalid;
 	bool		isdirty;
-	uint16		usagecount;
+	uint32		usagecount;
 
 	/*
 	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
 	 * being pinned by too many backends and each backend will only pin once
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
-	int32		pinning_backends;
+	uint32		pinning_backends;
 } BufferCachePagesRec;
 
 
@@ -160,8 +160,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
 			fctx->record[i].forknum = bufHdr->tag.forkNum;
 			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = bufHdr->usage_count;
-			fctx->record[i].pinning_backends = bufHdr->refcount;
+			fctx->record[i].usagecount = pg_atomic_read_u32(&bufHdr->usage_count);
+			fctx->record[i].pinning_backends = pg_atomic_read_u32(&bufHdr->refcount);
 
 			if (bufHdr->flags & BM_DIRTY)
 				fctx->record[i].isdirty = true;
@@ -236,7 +236,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			values[7] = Int16GetDatum(fctx->record[i].usagecount);
 			nulls[7] = false;
 			/* unused for v1.0 callers, but the array is always long enough */
-			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
+			values[8] = UInt32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
 		}
 
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..e139a7c 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -96,8 +96,8 @@ InitBufferPool(void)
 
 			CLEAR_BUFFERTAG(buf->tag);
 			buf->flags = 0;
-			buf->usage_count = 0;
-			buf->refcount = 0;
+			pg_atomic_init_u32(&buf->usage_count, 0);
+			pg_atomic_init_u32(&buf->refcount, 0);
 			buf->wait_backend_pid = 0;
 
 			SpinLockInit(&buf->buf_hdr_lock);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8c0358e..afba360 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -962,7 +962,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * into the buffer.
 		 */
 		buf = GetBufferDescriptor(buf_id);
-
 		valid = PinBuffer(buf, strategy);
 
 		/* Can release the mapping lock as soon as we've pinned it */
@@ -1013,7 +1012,15 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		buf = StrategyGetBuffer(strategy);
 
-		Assert(buf->refcount == 0);
+		/*
+		 * Ok, we can skip this but then we have to remove new buffer from
+		 * hash table. Better to just try again.
+		 */
+		if (pg_atomic_read_u32(&buf->refcount) != 0)
+		{
+			UnlockBufHdr(buf);
+			continue;
+		}
 
 		/* Must copy buffer flags while we still hold the spinlock */
 		oldFlags = buf->flags;
@@ -1211,7 +1218,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * over with a new victim buffer.
 		 */
 		oldFlags = buf->flags;
-		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
+		if (pg_atomic_read_u32(&buf->refcount) == 1 && !(oldFlags & BM_DIRTY))
 			break;
 
 		UnlockBufHdr(buf);
@@ -1234,10 +1241,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	buf->tag = newTag;
 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
 	if (relpersistence == RELPERSISTENCE_PERMANENT)
-		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
+		buf->flags|= BM_TAG_VALID | BM_PERMANENT;
 	else
 		buf->flags |= BM_TAG_VALID;
-	buf->usage_count = 1;
+	pg_atomic_write_u32(&buf->usage_count, 1);
 
 	UnlockBufHdr(buf);
 
@@ -1329,7 +1336,7 @@ retry:
 	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
 	 * be busy-looping here.)
 	 */
-	if (buf->refcount != 0)
+	if (pg_atomic_read_u32(&buf->refcount) != 0)
 	{
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
@@ -1347,7 +1354,7 @@ retry:
 	oldFlags = buf->flags;
 	CLEAR_BUFFERTAG(buf->tag);
 	buf->flags = 0;
-	buf->usage_count = 0;
+	pg_atomic_write_u32(&buf->usage_count, 0);
 
 	UnlockBufHdr(buf);
 
@@ -1399,7 +1406,7 @@ MarkBufferDirty(Buffer buffer)
 
 	LockBufHdr(bufHdr);
 
-	Assert(bufHdr->refcount > 0);
+	Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
 
 	/*
 	 * If the buffer was not dirty already, do vacuum accounting.
@@ -1498,20 +1505,23 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 		ReservePrivateRefCountEntry();
 		ref = NewPrivateRefCountEntry(b);
 
-		LockBufHdr(buf);
-		buf->refcount++;
+		pg_atomic_add_fetch_u32(&buf->refcount, 1);
+
 		if (strategy == NULL)
 		{
-			if (buf->usage_count < BM_MAX_USAGE_COUNT)
-				buf->usage_count++;
+			uint32 expect = pg_atomic_read_u32(&buf->usage_count);
+			while (expect < BM_MAX_USAGE_COUNT)
+			{
+				if (pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, expect+1))
+					break;
+			}
 		}
 		else
 		{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
+			uint32 expect = 0;
+			pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, 1);
 		}
 		result = (buf->flags & BM_VALID) != 0;
-		UnlockBufHdr(buf);
 	}
 	else
 	{
@@ -1558,7 +1568,7 @@ PinBuffer_Locked(volatile BufferDesc *buf)
 	 */
 	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
 
-	buf->refcount++;
+	pg_atomic_add_fetch_u32(&buf->refcount, 1);
 	UnlockBufHdr(buf);
 
 	b = BufferDescriptorGetBuffer(buf);
@@ -1598,15 +1608,14 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 		Assert(!LWLockHeldByMe(buf->content_lock));
 		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
 
-		LockBufHdr(buf);
-
 		/* Decrement the shared reference count */
-		Assert(buf->refcount > 0);
-		buf->refcount--;
+		Assert(pg_atomic_read_u32(&buf->refcount) > 0);
+		pg_atomic_sub_fetch_u32(&buf->refcount, 1);
 
+		LockBufHdr(buf);
 		/* Support LockBufferForCleanup() */
 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
-			buf->refcount == 1)
+			pg_atomic_read_u32(&buf->refcount) == 1)
 		{
 			/* we just released the last pin other than the waiter's */
 			int			wait_backend_pid = buf->wait_backend_pid;
@@ -2095,7 +2104,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	 */
 	LockBufHdr(bufHdr);
 
-	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+	if (pg_atomic_read_u32(&bufHdr->refcount) == 0 && pg_atomic_read_u32(&bufHdr->usage_count) == 0)
 		result |= BUF_REUSABLE;
 	else if (skip_recently_used)
 	{
@@ -2278,7 +2287,7 @@ PrintBufferLeakWarning(Buffer buffer)
 		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
 		 buffer, path,
 		 buf->tag.blockNum, buf->flags,
-		 buf->refcount, loccount);
+		 pg_atomic_read_u32(&buf->refcount), loccount);
 	pfree(path);
 }
 
@@ -2809,7 +2818,7 @@ PrintBufferDescs(void)
 			 i, buf->freeNext,
 		  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
 			 buf->tag.blockNum, buf->flags,
-			 buf->refcount, GetPrivateRefCount(b));
+			 pg_atomic_read_u32(&buf->refcount), GetPrivateRefCount(b));
 	}
 }
 #endif
@@ -2834,7 +2843,7 @@ PrintPinnedBufs(void)
 				 i, buf->freeNext,
 				 relpathperm(buf->tag.rnode, buf->tag.forkNum),
 				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, GetPrivateRefCount(b));
+				 pg_atomic_read_u32(&buf->refcount), GetPrivateRefCount(b));
 		}
 	}
 }
@@ -3149,7 +3158,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		}
 
 		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
+		Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
 		if (!(bufHdr->flags & BM_DIRTY))
 		{
 			dirtied = true;		/* Means "will be dirtied by this action" */
@@ -3307,8 +3316,8 @@ LockBufferForCleanup(Buffer buffer)
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (bufHdr->refcount == 1)
+		Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
+		if (pg_atomic_read_u32(&bufHdr->refcount) == 1)
 		{
 			/* Successfully acquired exclusive lock with pincount 1 */
 			UnlockBufHdr(bufHdr);
@@ -3417,8 +3426,8 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
 	LockBufHdr(bufHdr);
-	Assert(bufHdr->refcount > 0);
-	if (bufHdr->refcount == 1)
+	Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
+	if (pg_atomic_read_u32(&bufHdr->refcount) == 1)
 	{
 		/* Successfully acquired exclusive lock with pincount 1 */
 		UnlockBufHdr(bufHdr);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index bc2c773..4461271 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -280,14 +280,13 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * of 8.3, but we'd better check anyway.)
 			 */
 			LockBufHdr(buf);
-			if (buf->refcount == 0 && buf->usage_count == 0)
+			if (pg_atomic_read_u32(&buf->refcount) == 0 && pg_atomic_read_u32(&buf->usage_count) == 0)
 			{
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
 				return buf;
 			}
 			UnlockBufHdr(buf);
-
 		}
 	}
 
@@ -303,11 +302,11 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
 		LockBufHdr(buf);
-		if (buf->refcount == 0)
+		if (pg_atomic_read_u32(&buf->refcount) == 0)
 		{
-			if (buf->usage_count > 0)
+			if (buf->usage_count.value > 0)
 			{
-				buf->usage_count--;
+				buf->usage_count.value--;
 				trycounter = NBuffers;
 			}
 			else
@@ -617,7 +616,7 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
 	LockBufHdr(buf);
-	if (buf->refcount == 0 && buf->usage_count <= 1)
+	if (pg_atomic_read_u32(&buf->refcount) == 0 && pg_atomic_read_u32(&buf->usage_count) <= 1)
 	{
 		strategy->current_was_in_ring = true;
 		return buf;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 3144afe..e1932f5 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -131,8 +131,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		/* this part is equivalent to PinBuffer for a shared buffer */
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-				bufHdr->usage_count++;
+			if (pg_atomic_read_u32(&bufHdr->usage_count) < BM_MAX_USAGE_COUNT)
+				pg_atomic_add_fetch_u32(&bufHdr->usage_count, 1);
 		}
 		LocalRefCount[b]++;
 		ResourceOwnerRememberBuffer(CurrentResourceOwner,
@@ -169,9 +169,9 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count > 0)
+			if (pg_atomic_read_u32(&bufHdr->usage_count) > 0)
 			{
-				bufHdr->usage_count--;
+				pg_atomic_fetch_sub_u32(&bufHdr->usage_count, 1);
 				trycounter = NLocBuffer;
 			}
 			else
@@ -252,7 +252,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	bufHdr->tag = newTag;
 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
 	bufHdr->flags |= BM_TAG_VALID;
-	bufHdr->usage_count = 1;
+	pg_atomic_write_u32(&bufHdr->usage_count, 1);
 
 	*foundPtr = FALSE;
 	return bufHdr;
@@ -328,7 +328,7 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
 			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			pg_atomic_write_u32(&bufHdr->usage_count, 0);
 		}
 	}
 }
@@ -368,7 +368,7 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
 			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			pg_atomic_write_u32(&bufHdr->usage_count, 0);
 		}
 	}
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..68cbbf4 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -137,9 +137,9 @@ typedef struct buftag
 typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
-	BufFlags	flags;			/* see bit definitions above */
-	uint16		usage_count;	/* usage counter for clock sweep code */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
+	BufFlags		flags;			/* see bit definitions above */
+	pg_atomic_uint32		usage_count;	/* usage counter for clock sweep code */
+	pg_atomic_uint32		refcount;		/* # of backends holding pins on buffer */
 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
 	slock_t		buf_hdr_lock;	/* protects the above fields */
#2 Andres Freund
andres@anarazel.de
In reply to: YUriy Zhuravlev (#1)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 2015-09-11 13:23:24 +0300, YUriy Zhuravlev wrote:

> Continuing the theme: /messages/by-id/3368228.mTSz6V0Jsq@dinodell

Please don't just start new threads for a new version of the patch.

> This time, we rewrote 'refcount' and 'usage_count' to atomics in
> PinBuffer and UnpinBuffer (but kept the lock for the buffer flags in Unpin).

Hm.

> At the same time this doesn't affect the correctness of the buffer manager,
> because those variables already have an LWLock on top of them (for the
> hashtable partition).

Note that there's a pending patch that removes the buffer mapping locks
entirely.

> If someone pinned the buffer after the call to StrategyGetBuffer, we just
> try again (in BufferAlloc). There is also one more check in the code
> before deleting the old buffer, where the changes can be rolled back. The
> other functions that check 'refcount' and 'usage_count' take exclusive
> locks.

I don't think this is correct. This way we can leave the for (;;) loop
in BufferAlloc() thinking that the buffer is unused (and can't be further
pinned because of the held spinlock!) while it actually has been pinned
since by PinBuffer(). Additionally oldFlags can get out of sync there.
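
To sketch the problematic interleaving (backend A runs BufferAlloc(), backend
B runs the patched, lock-free PinBuffer(); time flows top to bottom):

    A: LockBufHdr(buf);                      /* takes buf_hdr_lock */
    A: reads refcount == 1, !BM_DIRTY        /* buffer looks unpinned */
                 B: pg_atomic_add_fetch_u32(&buf->refcount, 1);
                    /* succeeds: the patched PinBuffer() never takes the lock */
    A: break;                                /* recycles a buffer that B has
                                                just pinned */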

I don't think the approach of making some of the fields atomics but not
really caring about the rest is going to work. My suggestion is to add a
single 'state' 32bit atomic. This 32bit state is subdivided into:

10bit for flags,
3bit for usage_count,
16bit for refcount

then turn each operation that currently uses one of these fields into
corresponding accesses (just different values for flags, bit-shiftery &
mask for reading usage count, bit mask for reading refcount). The trick
then is to add a *new* flag value BM_LOCKED. This can then act as a sort
of a 'one bit' spinlock.

That should roughly look like (more or less pseudocode):

void
LockBufHdr(BufferDesc *desc)
{
    int state = pg_atomic_read_u32(&desc->state);

    for (;;)
    {
        /* wait till lock is free */
        while (unlikely(state & BM_LOCKED))
        {
            pg_spin_delay();
            state = pg_atomic_read_u32(&desc->state);

            /* add exponential backoff? Should seldomly be contended tho. */
        }

        /* and try to get lock */
        if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
            break;
    }
}

static bool
PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
{
    ...
    if (ref == NULL)
    {
        ReservePrivateRefCountEntry();
        ref = NewPrivateRefCountEntry(b);
        ...

        int state = pg_atomic_read_u32(&desc->state);
        int oldstate = state;

        while (true)
        {
            /* spin-wait till lock is free */
            while (unlikely(state & BM_LOCKED))
            {
                pg_spin_delay();
                state = pg_atomic_read_u32(&desc->state);
            }

            /* increase refcount */
            state += 1;

            /* increase usagecount unless already max */
            if ((state & USAGE_COUNT_MASK) != BM_MAX_USAGE_COUNT)
                state += BM_USAGE_COUNT_ONE;

            result = (state & BM_VALID) != 0;

            if (pg_atomic_compare_exchange_u32(&desc->state, &oldstate, state))
                break;

            /* get ready for next loop, oldstate has been updated by cas */
            state = oldstate;
        }
        ...
}

Other callsites can either just plainly continue to use
LockBufHdr/UnlockBufHdr or be converted similarly to PinBuffer().
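
A callsite that only reads would then decode the fields out of the single
word, roughly like this (mask/shift names as proposed above, illustrative
only):

    uint32 state      = pg_atomic_read_u32(&buf->state);
    uint32 refcount   = state & BUF_REFCOUNT_MASK;
    uint32 usagecount = (state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT;

    if (refcount == 0 && usagecount == 0)
        ;                        /* reusable, as in SyncOneBuffer() */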

Greetings,

Andres Freund

#3 David Rowley
david.rowley@2ndquadrant.com
In reply to: YUriy Zhuravlev (#1)
Re: Move PinBuffer and UnpinBuffer to atomics

On 11 September 2015 at 22:23, YUriy Zhuravlev <u.zhuravlev@postgrespro.ru>
wrote:

> Without the patch we get 417523 TPS and with the patch 965821 TPS on a big
> x86 server.
> All details here: https://gist.github.com/stalkerg/773a81b79a27b4d5d63f

Impressive!

I've run this on a single CPU server and don't see any speedup, so I assume
I'm not getting enough contention.
As soon as our 4 socket machine is free I'll try a pgbench run with that.

Just for fun, what are the results if you use -M prepared?

Regards

David Rowley

--
David Rowley http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#4 YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
In reply to: Andres Freund (#2)
Re: Move PinBuffer and UnpinBuffer to atomics

On Friday 11 September 2015 18:14:21 Andres Freund wrote:

> This way we can leave the for (;;) loop
> in BufferAlloc() thinking that the buffer is unused (and can't be further
> pinned because of the held spinlock!)

We lose the lock after PinBuffer_Locked in BufferAlloc. Therefore, in
essence, nothing has changed.
--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#5 Andres Freund
andres@anarazel.de
In reply to: YUriy Zhuravlev (#4)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-11 19:33:26 +0300, YUriy Zhuravlev wrote:

> On Friday 11 September 2015 18:14:21 Andres Freund wrote:
>
> > This way we can leave the for (;;) loop
> > in BufferAlloc() thinking that the buffer is unused (and can't be further
> > pinned because of the held spinlock!)
>
> We lose the lock after PinBuffer_Locked in BufferAlloc. Therefore, in
> essence, nothing has changed.

The relevant piece of code is:
        /*
         * Need to lock the buffer header too in order to change its tag.
         */
        LockBufHdr(buf);

        /*
         * Somebody could have pinned or re-dirtied the buffer while we were
         * doing the I/O and making the new hashtable entry. If so, we can't
         * recycle this buffer; we must undo everything we've done and start
         * over with a new victim buffer.
         */
        oldFlags = buf->flags;
        if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
            break;

        UnlockBufHdr(buf);
        BufTableDelete(&newTag, newHash);
        if ((oldFlags & BM_TAG_VALID) &&
            oldPartitionLock != newPartitionLock)
            LWLockRelease(oldPartitionLock);
        LWLockRelease(newPartitionLock);
        UnpinBuffer(buf, true);
    }

    /*
     * Okay, it's finally safe to rename the buffer.
     *
     * Clearing BM_VALID here is necessary, clearing the dirtybits is just
     * paranoia. We also reset the usage_count since any recency of use of
     * the old content is no longer relevant. (The usage_count starts out at
     * 1 so that the buffer can survive one clock-sweep pass.)
     */
    buf->tag = newTag;
    buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
    if (relpersistence == RELPERSISTENCE_PERMANENT)
        buf->flags |= BM_TAG_VALID | BM_PERMANENT;
    else
        buf->flags |= BM_TAG_VALID;
    buf->usage_count = 1;

    UnlockBufHdr(buf);

so unless I'm missing something, no, we haven't lost the lock.

Greetings,

Andres Freund

#6 YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
In reply to: Andres Freund (#5)
Re: Move PinBuffer and UnpinBuffer to atomics

On Friday 11 September 2015 18:37:00 you wrote:

> so unless I'm missing something, no, we haven't lost the lock.

This section is protected by LWLockAcquire(newPartitionLock,
LW_EXCLUSIVE) before it (and we can't get this buffer from the hash table).

--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#7 Andres Freund
andres@anarazel.de
In reply to: YUriy Zhuravlev (#6)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-11 19:46:02 +0300, YUriy Zhuravlev wrote:

> On Friday 11 September 2015 18:37:00 you wrote:
>
> > so unless I'm missing something, no, we haven't lost the lock.
>
> This section is protected by LWLockAcquire(newPartitionLock,
> LW_EXCLUSIVE) before it (and we can't get this buffer from the hash table).

a) As I said upthread there's a patch to remove these locks entirely
b) It doesn't matter anyway. Not every pin goes through the buffer
mapping table. StrategyGetBuffer(), SyncOneBuffer(), ...

Greetings,

Andres Freund

#8 YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
In reply to: Andres Freund (#7)
Re: Move PinBuffer and UnpinBuffer to atomics

On Friday 11 September 2015 18:50:35 you wrote:

> a) As I said upthread there's a patch to remove these locks entirely

It is very interesting. Could you provide a link? And it's not very good,
since the bottleneck then becomes PinBuffer/UnpinBuffer instead of LWLocks.

> b) It doesn't matter anyway. Not every pin goes through the buffer
> mapping table. StrategyGetBuffer(), SyncOneBuffer(), ...

StrategyGetBuffer is called only from BufferAlloc.
SyncOneBuffer is not a problem either, because:
    PinBuffer_Locked(bufHdr);
    LWLockAcquire(bufHdr->content_lock, LW_SHARED);
And please read the comment before LockBufHdr(bufHdr) in SyncOneBuffer.

We checked all functions that use refcount and usage_count.

Thanks! ^_^
--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#9 Andres Freund
andres@anarazel.de
In reply to: YUriy Zhuravlev (#8)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-14 13:16:46 +0300, YUriy Zhuravlev wrote:

> On Friday 11 September 2015 18:50:35 you wrote:
>
> > a) As I said upthread there's a patch to remove these locks entirely
>
> It is very interesting. Could you provide a link?

http://archives.postgresql.org/message-id/CA%2BTgmoYE4t-Pt%2Bv08kMO5u_XN-HNKBWtfMgcUXEGBrQiVgdV9Q%40mail.gmail.com

> And it's not very good,
> since the bottleneck then becomes PinBuffer/UnpinBuffer instead of
> LWLocks.

Where the bottleneck is entirely depends on your workload. If you have a
high cache replacement ratio the mapping partition locks are frequently
going to be held exclusively.

> > b) It doesn't matter anyway. Not every pin goes through the buffer
> > mapping table. StrategyGetBuffer(), SyncOneBuffer(), ...
>
> StrategyGetBuffer is called only from BufferAlloc.

It gets called without buffer mapping locks held. And it can (and
frequently will!) access all the buffers in the buffer pool.

> SyncOneBuffer is not a problem either, because:
>
>     PinBuffer_Locked(bufHdr);

Which you made ineffective because PinBuffer() doesn't take a lock
anymore. Mutual exclusion through locks only works if all participants
take the locks.

> We checked all functions that use refcount and usage_count.

Adding lockless behaviour by just taking out locks without analyzing the
whole picture isn't going to fly. You either need to provide backward
compatibility (a LockBuffer that provides actual exclusion) or you
actually need to go carefully through all relevant code and make it
lock-free.

I pointed out how you can actually make this safely lock-free, giving you
the interesting code.

Greetings,

Andres Freund

#10 Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#9)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-14 17:41:42 +0200, Andres Freund wrote:

> I pointed out how you can actually make this safely lock-free, giving you
> the interesting code.

And here's an actual implementation of that approach. It's definitely
work-in-progress and could easily be optimized further. Don't have any
big machines to play around with right now tho.

Andres

Attachments:

bufferpin.diff (text/x-diff; charset=us-ascii)
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..3e70792 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -95,12 +95,9 @@ InitBufferPool(void)
 			BufferDesc *buf = GetBufferDescriptor(i);
 
 			CLEAR_BUFFERTAG(buf->tag);
-			buf->flags = 0;
-			buf->usage_count = 0;
-			buf->refcount = 0;
-			buf->wait_backend_pid = 0;
 
-			SpinLockInit(&buf->buf_hdr_lock);
+			pg_atomic_init_u32(&buf->state, 0);
+			buf->wait_backend_pid = 0;
 
 			buf->buf_id = i;
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8c0358e..345322a 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -51,6 +51,8 @@
 #include "utils/resowner_private.h"
 #include "utils/timestamp.h"
 
+#define likely(x)       __builtin_expect((x),1)
+#define unlikely(x)     __builtin_expect((x),0)
 
 /* Note: these two macros only work on shared buffers, not local ones! */
 #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
@@ -774,9 +776,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		if (isLocalBuf)
 		{
+			uint32		state;
+
+			state = pg_atomic_read_u32(&bufHdr->state);
 			/* Only need to adjust flags */
-			Assert(bufHdr->flags & BM_VALID);
-			bufHdr->flags &= ~BM_VALID;
+			Assert(state & BM_VALID);
+			state &= ~BM_VALID;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 		else
 		{
@@ -788,8 +794,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			do
 			{
 				LockBufHdr(bufHdr);
-				Assert(bufHdr->flags & BM_VALID);
-				bufHdr->flags &= ~BM_VALID;
+				Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
+				pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
 				UnlockBufHdr(bufHdr);
 			} while (!StartBufferIO(bufHdr, true));
 		}
@@ -807,7 +813,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * it's not been recycled) but come right back here to try smgrextend
 	 * again.
 	 */
-	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
+	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
 
 	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
@@ -885,7 +891,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	if (isLocalBuf)
 	{
 		/* Only need to adjust flags */
-		bufHdr->flags |= BM_VALID;
+		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
 	}
 	else
 	{
@@ -939,7 +945,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	BufferTag	oldTag;			/* previous identity of selected buffer */
 	uint32		oldHash;		/* hash value for oldTag */
 	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
-	BufFlags	oldFlags;
+	uint32		oldFlags;
 	int			buf_id;
 	volatile BufferDesc *buf;
 	bool		valid;
@@ -1013,10 +1019,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		buf = StrategyGetBuffer(strategy);
 
-		Assert(buf->refcount == 0);
+		Assert((pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) == 0);
 
 		/* Must copy buffer flags while we still hold the spinlock */
-		oldFlags = buf->flags;
+		oldFlags = pg_atomic_read_u32(&buf->state) & BUF_FLAG_MASK;
 
 		/* Pin the buffer and then release the buffer spinlock */
 		PinBuffer_Locked(buf);
@@ -1210,8 +1216,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * recycle this buffer; we must undo everything we've done and start
 		 * over with a new victim buffer.
 		 */
-		oldFlags = buf->flags;
-		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
+		oldFlags = pg_atomic_read_u32(&buf->state) & BUF_FLAG_MASK;
+		if ((pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) == 1 &&
+			!(oldFlags & BM_DIRTY))
 			break;
 
 		UnlockBufHdr(buf);
@@ -1232,12 +1239,19 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * 1 so that the buffer can survive one clock-sweep pass.)
 	 */
 	buf->tag = newTag;
-	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
+	pg_atomic_fetch_and_u32(&buf->state,
+							~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
+							  BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
+							  BM_PERMANENT |
+							  BUF_USAGECOUNT_MASK));
 	if (relpersistence == RELPERSISTENCE_PERMANENT)
-		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
+		pg_atomic_fetch_or_u32(&buf->state,
+							   BM_TAG_VALID | BM_PERMANENT |
+							   BUF_USAGECOUNT_ONE);
 	else
-		buf->flags |= BM_TAG_VALID;
-	buf->usage_count = 1;
+		pg_atomic_fetch_or_u32(&buf->state,
+							   BM_TAG_VALID |
+							   BUF_USAGECOUNT_ONE);
 
 	UnlockBufHdr(buf);
 
@@ -1286,7 +1300,7 @@ InvalidateBuffer(volatile BufferDesc *buf)
 	BufferTag	oldTag;
 	uint32		oldHash;		/* hash value for oldTag */
 	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
-	BufFlags	oldFlags;
+	uint32		oldFlags;
 
 	/* Save the original buffer tag before dropping the spinlock */
 	oldTag = buf->tag;
@@ -1329,7 +1343,7 @@ retry:
 	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
 	 * be busy-looping here.)
 	 */
-	if (buf->refcount != 0)
+	if ((pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) != 0)
 	{
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
@@ -1344,10 +1358,9 @@ retry:
 	 * Clear out the buffer's tag and flags.  We must do this to ensure that
 	 * linear scans of the buffer array don't think the buffer is valid.
 	 */
-	oldFlags = buf->flags;
+	oldFlags = pg_atomic_read_u32(&buf->state) & BUF_FLAG_MASK;
 	CLEAR_BUFFERTAG(buf->tag);
-	buf->flags = 0;
-	buf->usage_count = 0;
+	pg_atomic_fetch_and_u32(&buf->state, BM_LOCKED | ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK));
 
 	UnlockBufHdr(buf);
 
@@ -1399,12 +1412,12 @@ MarkBufferDirty(Buffer buffer)
 
 	LockBufHdr(bufHdr);
 
-	Assert(bufHdr->refcount > 0);
+	Assert((pg_atomic_read_u32(&bufHdr->state) & BUF_REFCOUNT_MASK) > 0);
 
 	/*
 	 * If the buffer was not dirty already, do vacuum accounting.
 	 */
-	if (!(bufHdr->flags & BM_DIRTY))
+	if (!(pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY))
 	{
 		VacuumPageDirty++;
 		pgBufferUsage.shared_blks_dirtied++;
@@ -1412,7 +1425,8 @@ MarkBufferDirty(Buffer buffer)
 			VacuumCostBalance += VacuumCostPageDirty;
 	}
 
-	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+	pg_atomic_fetch_or_u32(&bufHdr->state,
+						   BM_DIRTY | BM_JUST_DIRTIED);
 
 	UnlockBufHdr(bufHdr);
 }
@@ -1495,23 +1509,39 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 
 	if (ref == NULL)
 	{
+		uint32 state;
+		uint32 oldstate;
+
 		ReservePrivateRefCountEntry();
 		ref = NewPrivateRefCountEntry(b);
 
-		LockBufHdr(buf);
-		buf->refcount++;
-		if (strategy == NULL)
-		{
-			if (buf->usage_count < BM_MAX_USAGE_COUNT)
-				buf->usage_count++;
-		}
-		else
+		state = pg_atomic_read_u32(&buf->state);
+		oldstate = state;
+
+		while (true)
 		{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
+			/* spin-wait till lock is free */
+			while (unlikely(state & BM_LOCKED))
+			{
+				pg_spin_delay();
+				state = pg_atomic_read_u32(&buf->state);
+			}
+
+			/* increase refcount */
+			state += 1;
+
+			/* increase usagecount unless already max */
+			if (((state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT) != BM_MAX_USAGE_COUNT)
+				state += BUF_USAGECOUNT_ONE;
+
+			result = (state & BM_VALID) != 0;
+
+			if (likely(pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state)))
+				break;
+
+			/* get ready for next loop, oldstate has been updated by cas */
+			state = oldstate;
 		}
-		result = (buf->flags & BM_VALID) != 0;
-		UnlockBufHdr(buf);
 	}
 	else
 	{
@@ -1558,7 +1588,7 @@ PinBuffer_Locked(volatile BufferDesc *buf)
 	 */
 	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
 
-	buf->refcount++;
+	pg_atomic_fetch_add_u32(&buf->state, 1);
 	UnlockBufHdr(buf);
 
 	b = BufferDescriptorGetBuffer(buf);
@@ -1594,30 +1624,41 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 	ref->refcount--;
 	if (ref->refcount == 0)
 	{
+		uint32 state;
+
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(buf->content_lock));
 		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
 
-		LockBufHdr(buf);
-
-		/* Decrement the shared reference count */
-		Assert(buf->refcount > 0);
-		buf->refcount--;
+		/*
+		 * Decrement the shared reference count.
+		 *
+		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
+		 * currently all manipulation of ->state for shared buffers is through
+		 * atomics.
+		 */
+		state = pg_atomic_fetch_sub_u32(&buf->state, 1);
+		Assert((state & BUF_REFCOUNT_MASK) > 0);
 
 		/* Support LockBufferForCleanup() */
-		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
-			buf->refcount == 1)
+		if (state & BM_PIN_COUNT_WAITER)
 		{
-			/* we just released the last pin other than the waiter's */
-			int			wait_backend_pid = buf->wait_backend_pid;
+			LockBufHdr(buf);
 
-			buf->flags &= ~BM_PIN_COUNT_WAITER;
-			UnlockBufHdr(buf);
-			ProcSendSignal(wait_backend_pid);
-		}
-		else
-			UnlockBufHdr(buf);
+			if (pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER &&
+				(pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) == 1)
+			{
+				/* we just released the last pin other than the waiter's */
+				int			wait_backend_pid = buf->wait_backend_pid;
 
+				pg_atomic_fetch_and_u32(&buf->state,
+										~BM_PIN_COUNT_WAITER);
+				UnlockBufHdr(buf);
+				ProcSendSignal(wait_backend_pid);
+			}
+			else
+				UnlockBufHdr(buf);
+		}
 		ForgetPrivateRefCountEntry(ref);
 	}
 }
@@ -1680,9 +1721,10 @@ BufferSync(int flags)
 		 */
 		LockBufHdr(bufHdr);
 
-		if ((bufHdr->flags & mask) == mask)
+		if ((pg_atomic_read_u32(&bufHdr->state) & mask) == mask)
 		{
-			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+			pg_atomic_fetch_or_u32(&bufHdr->state,
+								   BM_CHECKPOINT_NEEDED);
 			num_to_write++;
 		}
 
@@ -1721,7 +1763,7 @@ BufferSync(int flags)
 		 * write the buffer though we didn't need to.  It doesn't seem worth
 		 * guarding against this, though.
 		 */
-		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
+		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
 		{
 			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
 			{
@@ -2081,6 +2123,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 {
 	volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 	int			result = 0;
+	uint32		state;
 
 	ReservePrivateRefCountEntry();
 
@@ -2095,7 +2138,10 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	 */
 	LockBufHdr(bufHdr);
 
-	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+	state = pg_atomic_read_u32(&bufHdr->state);
+
+	if ((state & BUF_REFCOUNT_MASK) == 0 &&
+		(state & BUF_USAGECOUNT_MASK) == 0)
 		result |= BUF_REUSABLE;
 	else if (skip_recently_used)
 	{
@@ -2104,7 +2150,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 		return result;
 	}
 
-	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
+	if (!(state & BM_VALID) || !(state & BM_DIRTY))
 	{
 		/* It's clean, so nothing to do */
 		UnlockBufHdr(bufHdr);
@@ -2256,6 +2302,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	int32		loccount;
 	char	   *path;
 	BackendId	backend;
+	uint32		state;
 
 	Assert(BufferIsValid(buffer));
 	if (BufferIsLocal(buffer))
@@ -2273,12 +2320,13 @@ PrintBufferLeakWarning(Buffer buffer)
 
 	/* theoretically we should lock the bufhdr here */
 	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+	state = pg_atomic_read_u32(&buf->state);
 	elog(WARNING,
 		 "buffer refcount leak: [%03d] "
 		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
 		 buffer, path,
-		 buf->tag.blockNum, buf->flags,
-		 buf->refcount, loccount);
+		 buf->tag.blockNum, state & BUF_FLAG_MASK,
+		 state & BUF_REFCOUNT_MASK, loccount);
 	pfree(path);
 }
 
@@ -2424,7 +2472,7 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	recptr = BufferGetLSN(buf);
 
 	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	buf->flags &= ~BM_JUST_DIRTIED;
+	pg_atomic_fetch_and_u32(&buf->state, ~BM_JUST_DIRTIED);
 	UnlockBufHdr(buf);
 
 	/*
@@ -2444,7 +2492,7 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
 	 */
-	if (buf->flags & BM_PERMANENT)
+	if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT)
 		XLogFlush(recptr);
 
 	/*
@@ -2538,7 +2586,7 @@ BufferIsPermanent(Buffer buffer)
 	 * old value or the new value, but not random garbage.
 	 */
 	bufHdr = GetBufferDescriptor(buffer - 1);
-	return (bufHdr->flags & BM_PERMANENT) != 0;
+	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
 }
 
 /*
@@ -2874,7 +2922,8 @@ FlushRelationBuffers(Relation rel)
 		{
 			bufHdr = GetLocalBufferDescriptor(i);
 			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
+				== (BM_VALID | BM_DIRTY))
 			{
 				ErrorContextCallback errcallback;
 				Page		localpage;
@@ -2895,7 +2944,7 @@ FlushRelationBuffers(Relation rel)
 						  localpage,
 						  false);
 
-				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
 
 				/* Pop the error context stack */
 				error_context_stack = errcallback.previous;
@@ -2923,7 +2972,8 @@ FlushRelationBuffers(Relation rel)
 
 		LockBufHdr(bufHdr);
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
+			== (BM_VALID | BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
@@ -2975,7 +3025,8 @@ FlushDatabaseBuffers(Oid dbid)
 
 		LockBufHdr(bufHdr);
 		if (bufHdr->tag.rnode.dbNode == dbid &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
+			== (BM_VALID | BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
@@ -3093,12 +3144,13 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 	 * is only intended to be used in cases where failing to write out the
 	 * data would be harmless anyway, it doesn't really matter.
 	 */
-	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
+	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
 		(BM_DIRTY | BM_JUST_DIRTIED))
 	{
 		XLogRecPtr	lsn = InvalidXLogRecPtr;
 		bool		dirtied = false;
 		bool		delayChkpt = false;
+		uint32		state;
 
 		/*
 		 * If we need to protect hint bit updates from torn writes, WAL-log a
@@ -3109,7 +3161,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		 * We don't check full_page_writes here because that logic is included
 		 * when we call XLogInsert() since the value changes dynamically.
 		 */
-		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
+		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
 		{
 			/*
 			 * If we're in recovery we cannot dirty a page because of a hint.
@@ -3149,8 +3201,12 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		}
 
 		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (!(bufHdr->flags & BM_DIRTY))
+
+		state = pg_atomic_read_u32(&bufHdr->state);
+
+		Assert((state & BUF_REFCOUNT_MASK) > 0);
+
+		if (!(state & BM_DIRTY))
 		{
 			dirtied = true;		/* Means "will be dirtied by this action" */
 
@@ -3170,7 +3226,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 			if (!XLogRecPtrIsInvalid(lsn))
 				PageSetLSN(page, lsn);
 		}
-		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+
+		pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
+
 		UnlockBufHdr(bufHdr);
 
 		if (delayChkpt)
@@ -3208,9 +3266,9 @@ UnlockBuffers(void)
 		 * Don't complain if flag bit not set; it could have been reset but we
 		 * got a cancel/die interrupt before getting the signal.
 		 */
-		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
+		if ((pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER) != 0 &&
 			buf->wait_backend_pid == MyProcPid)
-			buf->flags &= ~BM_PIN_COUNT_WAITER;
+			pg_atomic_fetch_and_u32(&buf->state, ~BM_PIN_COUNT_WAITER);
 
 		UnlockBufHdr(buf);
 
@@ -3304,25 +3362,30 @@ LockBufferForCleanup(Buffer buffer)
 
 	for (;;)
 	{
+		int		state;
+
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (bufHdr->refcount == 1)
+
+		state = pg_atomic_read_u32(&bufHdr->state);
+
+		Assert((state & BUF_REFCOUNT_MASK) > 0);
+		if ((state & BUF_REFCOUNT_MASK) == 1)
 		{
 			/* Successfully acquired exclusive lock with pincount 1 */
 			UnlockBufHdr(bufHdr);
 			return;
 		}
 		/* Failed, so mark myself as waiting for pincount 1 */
-		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
+		if (state & BM_PIN_COUNT_WAITER)
 		{
 			UnlockBufHdr(bufHdr);
 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 			elog(ERROR, "multiple backends attempting to wait for pincount 1");
 		}
 		bufHdr->wait_backend_pid = MyProcPid;
-		bufHdr->flags |= BM_PIN_COUNT_WAITER;
+		pg_atomic_fetch_or_u32(&bufHdr->state, BM_PIN_COUNT_WAITER);
 		PinCountWaitBuf = bufHdr;
 		UnlockBufHdr(bufHdr);
 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -3349,9 +3412,9 @@ LockBufferForCleanup(Buffer buffer)
 		 * better be safe.
 		 */
 		LockBufHdr(bufHdr);
-		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
+		if ((pg_atomic_read_u32(&bufHdr->state) & BM_PIN_COUNT_WAITER) != 0 &&
 			bufHdr->wait_backend_pid == MyProcPid)
-			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
+			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_PIN_COUNT_WAITER);
 		UnlockBufHdr(bufHdr);
 
 		PinCountWaitBuf = NULL;
@@ -3393,22 +3456,25 @@ bool
 ConditionalLockBufferForCleanup(Buffer buffer)
 {
 	volatile BufferDesc *bufHdr;
+	uint32		refcount;
 
 	Assert(BufferIsValid(buffer));
 
 	if (BufferIsLocal(buffer))
 	{
+		refcount = LocalRefCount[-buffer - 1];
 		/* There should be exactly one pin */
-		Assert(LocalRefCount[-buffer - 1] > 0);
-		if (LocalRefCount[-buffer - 1] != 1)
+		Assert(refcount > 0);
+		if (refcount != 1)
 			return false;
 		/* Nobody else to wait for */
 		return true;
 	}
 
 	/* There should be exactly one local pin */
-	Assert(GetPrivateRefCount(buffer) > 0);
-	if (GetPrivateRefCount(buffer) != 1)
+	refcount = GetPrivateRefCount(buffer);
+	Assert(refcount);
+	if (refcount != 1)
 		return false;
 
 	/* Try to acquire lock */
@@ -3417,8 +3483,10 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
 	LockBufHdr(bufHdr);
-	Assert(bufHdr->refcount > 0);
-	if (bufHdr->refcount == 1)
+
+	refcount = pg_atomic_read_u32(&bufHdr->state) & BUF_REFCOUNT_MASK;
+	Assert(refcount > 0);
+	if (refcount == 1)
 	{
 		/* Successfully acquired exclusive lock with pincount 1 */
 		UnlockBufHdr(bufHdr);
@@ -3456,7 +3524,7 @@ WaitIO(volatile BufferDesc *buf)
 	 */
 	for (;;)
 	{
-		BufFlags	sv_flags;
+		uint32		state;
 
 		/*
 		 * It may not be necessary to acquire the spinlock to check the flag
@@ -3464,9 +3532,10 @@ WaitIO(volatile BufferDesc *buf)
 		 * play it safe.
 		 */
 		LockBufHdr(buf);
-		sv_flags = buf->flags;
+		state = pg_atomic_read_u32(&buf->state);
 		UnlockBufHdr(buf);
-		if (!(sv_flags & BM_IO_IN_PROGRESS))
+
+		if (!(state & BM_IO_IN_PROGRESS))
 			break;
 		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
 		LWLockRelease(buf->io_in_progress_lock);
@@ -3494,6 +3563,8 @@ WaitIO(volatile BufferDesc *buf)
 static bool
 StartBufferIO(volatile BufferDesc *buf, bool forInput)
 {
+	uint32		state;
+
 	Assert(!InProgressBuf);
 
 	for (;;)
@@ -3506,7 +3577,9 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
 
 		LockBufHdr(buf);
 
-		if (!(buf->flags & BM_IO_IN_PROGRESS))
+		state = pg_atomic_read_u32(&buf->state);
+
+		if (!(state & BM_IO_IN_PROGRESS))
 			break;
 
 		/*
@@ -3522,7 +3595,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
 
 	/* Once we get here, there is definitely no I/O active on this buffer */
 
-	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
+	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
 	{
 		/* someone else already did the I/O */
 		UnlockBufHdr(buf);
@@ -3530,7 +3603,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
 		return false;
 	}
 
-	buf->flags |= BM_IO_IN_PROGRESS;
+	pg_atomic_fetch_or_u32(&buf->state, BM_IO_IN_PROGRESS);
 
 	UnlockBufHdr(buf);
 
@@ -3565,11 +3638,13 @@ TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
 
 	LockBufHdr(buf);
 
-	Assert(buf->flags & BM_IO_IN_PROGRESS);
-	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
-	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
-		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
-	buf->flags |= set_flag_bits;
+	Assert(pg_atomic_read_u32(&buf->state) & BM_IO_IN_PROGRESS);
+
+	pg_atomic_fetch_and_u32(&buf->state, ~(BM_IO_IN_PROGRESS | BM_IO_ERROR));
+	if (clear_dirty && !(pg_atomic_read_u32(&buf->state) & BM_JUST_DIRTIED))
+		pg_atomic_fetch_and_u32(&buf->state, ~(BM_DIRTY | BM_CHECKPOINT_NEEDED));
+
+	pg_atomic_fetch_or_u32(&buf->state, set_flag_bits);
 
 	UnlockBufHdr(buf);
 
@@ -3603,23 +3678,24 @@ AbortBufferIO(void)
 		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
 
 		LockBufHdr(buf);
-		Assert(buf->flags & BM_IO_IN_PROGRESS);
+		Assert(pg_atomic_read_u32(&buf->state) & BM_IO_IN_PROGRESS);
 		if (IsForInput)
 		{
-			Assert(!(buf->flags & BM_DIRTY));
+			Assert(!(pg_atomic_read_u32(&buf->state) & BM_DIRTY));
+
 			/* We'd better not think buffer is valid yet */
-			Assert(!(buf->flags & BM_VALID));
+			Assert(!(pg_atomic_read_u32(&buf->state) & BM_VALID));
 			UnlockBufHdr(buf);
 		}
 		else
 		{
-			BufFlags	sv_flags;
+			uint32		state;
 
-			sv_flags = buf->flags;
-			Assert(sv_flags & BM_DIRTY);
+			state = pg_atomic_read_u32(&buf->state);
+			Assert(state & BM_DIRTY);
 			UnlockBufHdr(buf);
 			/* Issue notice if this is not the first failure... */
-			if (sv_flags & BM_IO_ERROR)
+			if (state & BM_IO_ERROR)
 			{
 				/* Buffer is pinned, so we can read tag without spinlock */
 				char	   *path;
@@ -3701,3 +3777,33 @@ rnode_comparator(const void *p1, const void *p2)
 	else
 		return 0;
 }
+
+void
+LockBufHdr(volatile BufferDesc *desc)
+{
+	uint32 state = pg_atomic_read_u32(&desc->state);
+
+	for (;;)
+	{
+		/* wait till lock is free */
+		while (unlikely(state & BM_LOCKED))
+		{
+			pg_spin_delay();
+			state = pg_atomic_read_u32(&desc->state);
+
+			/* Add exponential backoff? Should seldomly be contended tho. */
+		}
+
+		/* and try to get lock */
+		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+			break;
+	}
+}
+
+void
+UnlockBufHdr(volatile BufferDesc *desc)
+{
+	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+
+	pg_atomic_fetch_sub_u32(&desc->state, BM_LOCKED);
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index bc2c773..3f2227b 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -250,6 +250,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	{
 		while (true)
 		{
+			uint32	state;
+
 			/* Acquire the spinlock to remove element from the freelist */
 			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
@@ -280,7 +282,9 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * of 8.3, but we'd better check anyway.)
 			 */
 			LockBufHdr(buf);
-			if (buf->refcount == 0 && buf->usage_count == 0)
+			state = pg_atomic_read_u32(&buf->state);
+			if ((state & BUF_REFCOUNT_MASK) == 0
+				&& (state & BUF_USAGECOUNT_MASK) == 0)
 			{
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
@@ -295,6 +299,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	trycounter = NBuffers;
 	for (;;)
 	{
+		uint32	state;
 
 		buf = GetBufferDescriptor(ClockSweepTick());
 
@@ -303,11 +308,15 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
 		LockBufHdr(buf);
-		if (buf->refcount == 0)
+
+		state = pg_atomic_read_u32(&buf->state);
+
+		if ((state & BUF_REFCOUNT_MASK) == 0)
 		{
-			if (buf->usage_count > 0)
+			if ((state & BUF_USAGECOUNT_MASK) != 0)
 			{
-				buf->usage_count--;
+				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
+
 				trycounter = NBuffers;
 			}
 			else
@@ -589,6 +598,8 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 {
 	volatile BufferDesc *buf;
 	Buffer		bufnum;
+	uint32		state;
+	uint32		usagecount;
 
 	/* Advance to next ring slot */
 	if (++strategy->current >= strategy->ring_size)
@@ -617,7 +628,10 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
 	LockBufHdr(buf);
-	if (buf->refcount == 0 && buf->usage_count <= 1)
+	state = pg_atomic_read_u32(&buf->state);
+	usagecount = (state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT;
+	if ((state & BUF_REFCOUNT_MASK) == 0
+		&& usagecount <= 1)
 	{
 		strategy->current_was_in_ring = true;
 		return buf;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 3144afe..1e11d71 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -108,6 +108,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	int			b;
 	int			trycounter;
 	bool		found;
+	uint32		state;
 
 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
@@ -128,16 +129,25 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
 				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
 #endif
+		state = pg_atomic_read_u32(&bufHdr->state);
+
 		/* this part is equivalent to PinBuffer for a shared buffer */
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-				bufHdr->usage_count++;
+			int	usagecount;
+
+			usagecount = (state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT;
+
+			if (usagecount < BM_MAX_USAGE_COUNT)
+			{
+				state += BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, state);
+			}
 		}
 		LocalRefCount[b]++;
 		ResourceOwnerRememberBuffer(CurrentResourceOwner,
 									BufferDescriptorGetBuffer(bufHdr));
-		if (bufHdr->flags & BM_VALID)
+		if (state & BM_VALID)
 			*foundPtr = TRUE;
 		else
 		{
@@ -169,9 +179,15 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count > 0)
+			int		usagecount;
+
+			state = pg_atomic_read_u32(&bufHdr->state);
+			usagecount = (state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT;
+
+			if (usagecount > 0)
 			{
-				bufHdr->usage_count--;
+				state -= BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, state);
 				trycounter = NLocBuffer;
 			}
 			else
@@ -193,7 +209,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * this buffer is not referenced but it might still be dirty. if that's
 	 * the case, write it out before reusing it!
 	 */
-	if (bufHdr->flags & BM_DIRTY)
+	if (state & BM_DIRTY)
 	{
 		SMgrRelation oreln;
 		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
@@ -211,7 +227,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 				  false);
 
 		/* Mark not-dirty now in case we error out below */
-		bufHdr->flags &= ~BM_DIRTY;
+		state &= ~BM_DIRTY;
+		pg_atomic_write_u32(&bufHdr->state, state);
 
 		pgBufferUsage.local_blks_written++;
 	}
@@ -228,7 +245,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	/*
 	 * Update the hash table: remove old entry, if any, and make new one.
 	 */
-	if (bufHdr->flags & BM_TAG_VALID)
+	if (state & BM_TAG_VALID)
 	{
 		hresult = (LocalBufferLookupEnt *)
 			hash_search(LocalBufHash, (void *) &bufHdr->tag,
@@ -237,7 +254,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			elog(ERROR, "local buffer hash table corrupted");
 		/* mark buffer invalid just in case hash insert fails */
 		CLEAR_BUFFERTAG(bufHdr->tag);
-		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
+		state &= ~(BM_VALID | BM_TAG_VALID);
+		pg_atomic_write_u32(&bufHdr->state, state);
 	}
 
 	hresult = (LocalBufferLookupEnt *)
@@ -250,9 +268,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * it's all ours now.
 	 */
 	bufHdr->tag = newTag;
-	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
-	bufHdr->flags |= BM_TAG_VALID;
-	bufHdr->usage_count = 1;
+	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+	state |= BM_TAG_VALID;
+	state &= ~BUF_USAGECOUNT_MASK;
+	state += BUF_USAGECOUNT_ONE;
+	pg_atomic_write_u32(&bufHdr->state, state);
 
 	*foundPtr = FALSE;
 	return bufHdr;
@@ -267,6 +287,7 @@ MarkLocalBufferDirty(Buffer buffer)
 {
 	int			bufid;
 	BufferDesc *bufHdr;
+	uint32		state;
 
 	Assert(BufferIsLocal(buffer));
 
@@ -280,10 +301,13 @@ MarkLocalBufferDirty(Buffer buffer)
 
 	bufHdr = GetLocalBufferDescriptor(bufid);
 
-	if (!(bufHdr->flags & BM_DIRTY))
+	state = pg_atomic_read_u32(&bufHdr->state);
+
+	if (!(state & BM_DIRTY))
 		pgBufferUsage.local_blks_dirtied++;
 
-	bufHdr->flags |= BM_DIRTY;
+	state |= BM_DIRTY;
+	pg_atomic_write_u32(&bufHdr->state, state);
 }
 
 /*
@@ -307,8 +331,11 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		state;
 
-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		state = pg_atomic_read_u32(&bufHdr->state);
+
+		if ((state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
 			bufHdr->tag.forkNum == forkNum &&
 			bufHdr->tag.blockNum >= firstDelBlock)
@@ -327,8 +354,9 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			state &= ~BUF_FLAG_MASK;
+			state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 	}
 }
@@ -349,8 +377,11 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		state;
+
+		state = pg_atomic_read_u32(&bufHdr->state);
 
-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		if ((state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
 		{
 			if (LocalRefCount[i] != 0)
@@ -367,8 +398,9 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			state &= ~BUF_FLAG_MASK;
+			state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 	}
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..92889e6 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -20,29 +20,40 @@
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
 #include "storage/smgr.h"
+#include "port/atomics.h"
 #include "storage/spin.h"
 #include "utils/relcache.h"
 
 
 /*
+ * State is:
+ * 10 bit flags
+ * 4 bit usage count
+ * 18 bit refcount
+ */
+#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+#define BUF_FLAG_MASK 0xFFC00000U
+#define BUF_USAGECOUNT_MASK 0x003C0000U
+#define BUF_USAGECOUNT_ONE (1U << 18)
+#define BUF_USAGECOUNT_SHIFT 18
+
+/*
  * Flags for buffer descriptors
  *
  * Note: TAG_VALID essentially means that there is a buffer hashtable
  * entry associated with the buffer's tag.
  */
-#define BM_DIRTY				(1 << 0)		/* data needs writing */
-#define BM_VALID				(1 << 1)		/* data is valid */
-#define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
-#define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
-#define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
-#define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
-#define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
-#define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
-#define BM_PERMANENT			(1 << 8)		/* permanent relation (not
+#define BM_LOCKED				(1U << 22)		/* buffer header is locked */
+#define BM_DIRTY				(1U << 23)		/* data needs writing */
+#define BM_VALID				(1U << 24)		/* data is valid */
+#define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
+#define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
+#define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
+#define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
+#define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
+#define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
+#define BM_PERMANENT			(1U << 31)		/* permanent relation (not
 												 * unlogged) */
-
-typedef bits16 BufFlags;
-
 /*
  * The maximum allowed value of usage_count represents a tradeoff between
  * accuracy and speed of the clock-sweep buffer management algorithm.  A
@@ -137,12 +148,11 @@ typedef struct buftag
 typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
-	BufFlags	flags;			/* see bit definitions above */
-	uint16		usage_count;	/* usage counter for clock sweep code */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
-	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
-	slock_t		buf_hdr_lock;	/* protects the above fields */
+	/* state of the tag, containing flags, refcount and usagecount */
+	pg_atomic_uint32 state;
+
+	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
 	int			buf_id;			/* buffer's index number (from 0) */
 	int			freeNext;		/* link in freelist chain */
@@ -192,16 +202,11 @@ typedef union BufferDescPadded
 #define FREENEXT_NOT_IN_LIST	(-2)
 
 /*
- * Macros for acquiring/releasing a shared buffer header's spinlock.
- * Do not apply these to local buffers!
- *
- * Note: as a general coding rule, if you are using these then you probably
- * need to be using a volatile-qualified pointer to the buffer header, to
- * ensure that the compiler doesn't rearrange accesses to the header to
- * occur before or after the spinlock is acquired/released.
+ * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
+ * not apply these to local buffers! FIXUP!
  */
-#define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
-#define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
+extern void LockBufHdr(volatile BufferDesc *desc);
+extern void UnlockBufHdr(volatile BufferDesc *desc);
 
 
 /* in buf_init.c */
#11YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
In reply to: Andres Freund (#10)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tuesday 15 September 2015 04:06:25 Andres Freund wrote:

And here's an actual implementation of that approach. It's definitely
work-in-progress and could easily be optimized further. Don't have any
big machines to play around with right now tho.

Thanks. Interesting.
We had a version like your patch, but this is only half the work. Example:
state = pg_atomic_read_u32(&buf->state);
if ((state & BUF_REFCOUNT_MASK) == 0
&& (state & BUF_USAGECOUNT_MASK) == 0)
After the first statement somebody can change buf->state, so the local copy
of state is stale.
In that respect there is no significant difference between the two patches.
To do this correctly, the CAS would have to cover the whole if condition.
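
Roughly what we mean (a hypothetical sketch, not code from either patch):
the whole condition has to sit inside the CAS retry loop, so a concurrent
change of buf->state forces a re-check:

	uint32	state = pg_atomic_read_u32(&buf->state);

	for (;;)
	{
		/* this check only holds for the exact value the CAS will compare */
		if ((state & BUF_REFCOUNT_MASK) != 0 ||
			(state & BUF_USAGECOUNT_MASK) != 0)
			break;				/* buffer is in use */
		if (pg_atomic_compare_exchange_u32(&buf->state, &state,
										   state | BM_LOCKED))
			break;				/* succeeded with the value we just checked */
		/* CAS failed: state was refreshed by the CAS, loop and re-check */
	}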

Thanks! I hope for understanding. ^_^
--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company


#12YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
In reply to: David Rowley (#3)
Re: Move PinBuffer and UnpinBuffer to atomics

On Saturday 12 September 2015 04:15:43 David Rowley wrote:

I've run this on a single CPU server and don't see any speedup, so I assume
I'm not getting enough contention.
As soon as our 4 socket machine is free I'll try a pgbench run with that.

Excellent! Will wait.

Just for fun, what are the results if you use -M prepared?

Unfortunately we cannot check that right now. :(

--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company


#13Merlin Moncure
mmoncure@gmail.com
In reply to: Andres Freund (#10)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Sep 14, 2015 at 9:06 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-09-14 17:41:42 +0200, Andres Freund wrote:

I pointed out how you can actually make this safely lock-free, giving you
the interesting code.

And here's an actual implementation of that approach. It's definitely
work-in-progress and could easily be optimized further. Don't have any
big machines to play around with right now tho.

Are you confident this is faster across all workloads? Pin/Unpin are
probably faster, but this comes at the cost of extra atomic ops during
the clock sweep loop. I wonder if this will degrade results under
heavy contention.

Also, I'm curious about your introduction of __builtin_expect()
macros. Did you measure any gain from them? I bet there are other
places they could be used -- for example the mvcc hint bit checks on
xmin.

merlin


#14Andres Freund
andres@anarazel.de
In reply to: YUriy Zhuravlev (#11)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-15 12:51:24 +0300, YUriy Zhuravlev wrote:

We had a version like your patch, but this is only half the work. Example:
state = pg_atomic_read_u32(&buf->state);
if ((state & BUF_REFCOUNT_MASK) == 0
&& (state & BUF_USAGECOUNT_MASK) == 0)

After the first statement somebody can change buf->state, so the local copy
of state is stale.

No, they can't in a relevant manner. We hold the buffer header lock.

In that respect there is no significant difference between the two
patches. To do this correctly, the CAS would have to cover the whole if condition.

What?

Thanks! I hope for understanding. ^_^

There's pretty little understanding left at this point. You're posting
things for review and you seem completely unwilling to actually respond
to points raised.

Greetings,

Andres Freund


#15Andres Freund
andres@anarazel.de
In reply to: Merlin Moncure (#13)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-15 08:07:57 -0500, Merlin Moncure wrote:

Are you confident this is faster across all workloads?

No. This is a proof of concept I just wrote & posted because I didn't
see the patch moving in the right direction. But I do think it can be
made faster in all relevant workloads.

Pin/Unpin are probably faster but this comes at a cost of extra atomic
ops during the clock sweep loop. I wonder if this will degrade
results under heavy contention.

I think it's actually going to be faster under contention, and the
situation where it's slower is uncontended workloads where you have a
very low cache hit ratio.

Also, I'm curious about your introduction of __builtin_expect()
macros. Did you measure any gain from them?

I introduced them because I was bothered by the generated assembler ;)

But a bit more seriously, I do think there's some benefit in influencing
the code like that. I personally also find they *increase* readability
in cases like this where the likely() branch should be taken just about
all the time.
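
For reference, the usual wrappers look something like this (just a sketch;
the exact definitions in the PoC may differ):

	/* classic GCC idiom built on __builtin_expect */
	#define likely(x)	__builtin_expect((x) != 0, 1)
	#define unlikely(x)	__builtin_expect((x) != 0, 0)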

I bet there are other places they could be used -- for example the
mvcc hint bit checks on xmin.

I don't think those are good candidates; there are too many places where
the majority of checks go the other way.

Greetings,

Andres Freund


#16Merlin Moncure
mmoncure@gmail.com
In reply to: Andres Freund (#15)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Sep 15, 2015 at 9:56 AM, Andres Freund <andres@anarazel.de> wrote:

On 2015-09-15 08:07:57 -0500, Merlin Moncure wrote:

Also, I'm curious about your introduction of __builtin_expect()
macros. Did you measure any gain from them?

I introduced them because I was bothered by the generated assembler ;)

But a bit more seriously, I do think there's some benefit in influencing
the code like that. I personally also find they *increase* readability
in cases like this where the likely() branch should be taken just about
all the time.

right. For posterity, I agree with this.

I bet there are other places they could be used -- for example the
mvcc hint bit checks on xmin.

I don't think those are good candidates, there's too many cases where
it's common to have the majority of cases go the other way.

Maybe, but consider that the penalty vs. win is asymmetric. If the hint
bit isn't set, you're doing a lot of other work anyway, so the branch
penalty falls away to noise, while if you win the benefits are
significant against the tight tuple scan loop.

Anyway, as it pertains to *this* patch, +1 for adding that feature.

merlin


#17YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
In reply to: Andres Freund (#14)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tuesday 15 September 2015 16:50:44 Andres Freund wrote:

No, they can't in a relevant manner. We hold the buffer header lock.

I'm sorry, I did not notice the LockBufHdr.

In this form, your approach seems very similar to s_lock: the loop in
PinBuffer behaves like s_lock.
In LockBufHdr:
if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))

conflicts with:
while (unlikely(state & BM_LOCKED))
from PinBuffer.
Thus your patch does not remove the contention on PinBuffer.
We will try to check your patch this week.
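
For context, here is how we read the lock path -- our reconstruction of
the patch's LockBufHdr, which may differ from the real code in detail:

	static void
	LockBufHdr(volatile BufferDesc *desc)
	{
		uint32	state = pg_atomic_read_u32(&desc->state);

		for (;;)
		{
			/* spin-wait until the lock bit looks clear */
			while (state & BM_LOCKED)
			{
				pg_spin_delay();
				state = pg_atomic_read_u32(&desc->state);
			}
			/* try to set BM_LOCKED; on failure the CAS refreshes state */
			if (pg_atomic_compare_exchange_u32(&desc->state, &state,
											   state | BM_LOCKED))
				break;
		}
	}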

You're posting
things for review and you seem completely unwilling to actually respond
to points raised.

I think we're just talking about different things.
--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company


#18Andres Freund
andres@anarazel.de
In reply to: YUriy Zhuravlev (#17)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-15 19:43:28 +0300, YUriy Zhuravlev wrote:

On Tuesday 15 September 2015 16:50:44 Andres Freund wrote:

No, they can't in a relevant manner. We hold the buffer header lock.

I'm sorry, I did not notice the LockBufHdr.

In this form, your approach seems very similar to s_lock: the loop in
PinBuffer behaves like s_lock.

In LockBufHdr:
if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))

conflicts with:
while (unlikely(state & BM_LOCKED))
from PinBuffer.
Thus your patch does not remove the contention on PinBuffer.
We will try to check your patch this week.

That path is only taken if somebody else has already locked the buffer
(e.g. BufferAlloc()). If you have contention in PinBuffer() your
workload will be mostly cache resident, and neither PinBuffer() nor
UnpinBuffer() sets BM_LOCKED.
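
Concretely, an uncontended pin is just one read plus one CAS and never
sets BM_LOCKED -- a simplified sketch, not the exact PoC code:

	/* fast path: claim a pin without touching the header lock */
	uint32	state = pg_atomic_read_u32(&buf->state);

	if (!(state & BM_LOCKED) &&
		pg_atomic_compare_exchange_u32(&buf->state, &state,
									   state + BUF_REFCOUNT_ONE))
	{
		/* pinned; BM_LOCKED was neither taken nor waited on */
	}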

Greetings,

Andres Freund


#19YUriy Zhuravlev
u.zhuravlev@postgrespro.ru
In reply to: Andres Freund (#18)
Re: Move PinBuffer and UnpinBuffer to atomics

That path is only taken if somebody else has already locked the buffer
(e.g. BufferAlloc()). If you have contention in PinBuffer() your
workload will be mostly cache resident, and neither PinBuffer() nor
UnpinBuffer() sets BM_LOCKED.

Thanks. Now I understand everything. It might work.
We will test it.

your workload

Simple pgbench -S on NUMA hardware.

--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company


#20Andres Freund
andres@anarazel.de
In reply to: YUriy Zhuravlev (#19)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-09-15 20:16:10 +0300, YUriy Zhuravlev wrote:

We will test it.

Did you have a chance to run some benchmarks?

Andres


#21Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#20)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Sep 24, 2015 at 6:32 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-09-15 20:16:10 +0300, YUriy Zhuravlev wrote:

We will test it.

Did you have a chance to run some benchmarks?

Yes, we now have a 60-physical-core Intel server and we're running
benchmarks on it.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#22Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#21)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Sep 24, 2015 at 6:36 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Thu, Sep 24, 2015 at 6:32 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-09-15 20:16:10 +0300, YUriy Zhuravlev wrote:

We will test it.

Did you have a chance to run some benchmarks?

Yes, we now have a 60-physical-core Intel server and we're running
benchmarks on it.

We reached a consensus with Andres that we should commit the CAS version
first and then look at other optimizations.
A refactored version of the atomic state patch is attached. The changes
are as follows:
1) Macros are used to access refcount and usagecount (sketched below).
2) likely/unlikely were removed. I think introducing likely/unlikely
should be a separate patch since it touches portability. Also, I didn't
see any performance effect from it.
3) LockBufHdr returns the state after taking the lock. Even without
atomic increments, this can save some loops by skipping an extra atomic
read of the value.
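
For (1), the accessors are roughly the following, given the bit layout
from the earlier patch (refcount in the low 18 bits, usage count above
it) -- a sketch rather than necessarily the exact final form:

	#define BUF_STATE_GET_REFCOUNT(state)	((state) & BUF_REFCOUNT_MASK)
	#define BUF_STATE_GET_USAGECOUNT(state) \
		(((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)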

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas.patchapplication/octet-stream; name=pinunpin-cas.patchDownload
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index 3ae2848..3e70792
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 95,106 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 95,103 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 8c0358e..8992438
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 51,57 ****
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
- 
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
--- 51,56 ----
*************** static volatile BufferDesc *PinCountWait
*** 126,132 ****
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a spinlock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
--- 125,131 ----
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a header lock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 774,782 ****
  		 */
  		if (isLocalBuf)
  		{
! 			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 773,780 ----
  		 */
  		if (isLocalBuf)
  		{
! 			Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 788,795 ****
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
--- 786,793 ----
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 807,813 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 805,811 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 885,891 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 883,889 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 939,945 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	volatile BufferDesc *buf;
  	bool		valid;
--- 937,943 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	volatile BufferDesc *buf;
  	bool		valid;
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1001,1024 ****
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the spinlock's not yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
! 		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
! 		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
  
  		/*
--- 999,1024 ----
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/*
! 		 * Ensure, while the header lock isn't yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * lock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
! 		/* Must copy buffer flags while we still hold the header lock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
! 		/* Pin the buffer and then release the buffer header lock */
  		PinBuffer_Locked(buf);
  
  		/*
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1202,1208 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1202,1208 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1210,1217 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1210,1217 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1232,1243 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
  	UnlockBufHdr(buf);
  
--- 1232,1250 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	pg_atomic_fetch_and_u32(&buf->state,
! 							~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 							  BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
! 							  BM_PERMANENT |
! 							  BUF_USAGECOUNT_MASK));
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID | BM_PERMANENT |
! 							   BUF_USAGECOUNT_ONE);
  	else
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID |
! 							   BUF_USAGECOUNT_ONE);
  
  	UnlockBufHdr(buf);
  
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1267,1273 ****
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header spinlock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
--- 1274,1280 ----
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header lock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
*************** InvalidateBuffer(volatile BufferDesc *bu
*** 1286,1294 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
! 	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
--- 1293,1302 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
! 	/* Save the original buffer tag before dropping the header lock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
*************** retry:
*** 1310,1316 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
--- 1318,1324 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
*************** retry:
*** 1329,1335 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1337,1343 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1344,1353 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
  
  	UnlockBufHdr(buf);
  
--- 1352,1360 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	pg_atomic_fetch_and_u32(&buf->state, BM_LOCKED | ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK));
  
  	UnlockBufHdr(buf);
  
*************** void
*** 1381,1386 ****
--- 1388,1394 ----
  MarkBufferDirty(Buffer buffer)
  {
  	volatile BufferDesc *bufHdr;
+ 	uint32		state;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1397,1410 ****
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(bufHdr->content_lock));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1405,1418 ----
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(bufHdr->content_lock));
  
! 	state = LockBufHdr(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(state & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1412,1418 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
--- 1420,1426 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1454,1460 ****
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without spinlock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
--- 1462,1468 ----
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without header lock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1482,1488 ****
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra spinlock cycle.
   */
  static bool
  PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
--- 1490,1496 ----
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra header lock cycle.
   */
  static bool
  PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
*************** PinBuffer(volatile BufferDesc *buf, Buff
*** 1495,1517 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1503,1542 ----
  
  	if (ref == NULL)
  	{
+ 		uint32 state;
+ 		uint32 oldstate;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				pg_spin_delay();
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer(volatile BufferDesc *buf, Buff
*** 1527,1535 ****
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The spinlock is released before return.
   *
!  * As this function is called with the spinlock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
--- 1552,1560 ----
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The header lock is released before return.
   *
!  * As this function is called with the header lock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
*************** PinBuffer(volatile BufferDesc *buf, Buff
*** 1540,1546 ****
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * spinlock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
--- 1565,1571 ----
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * header lock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
*************** PinBuffer_Locked(volatile BufferDesc *bu
*** 1554,1564 ****
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the spinlock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
--- 1579,1589 ----
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the header lock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	pg_atomic_fetch_add_u32(&buf->state, 1);
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
*************** UnpinBuffer(volatile BufferDesc *buf, bo
*** 1594,1623 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
  
! 		LockBufHdr(buf);
! 
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1619,1658 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32 state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
! 		 * currently all manipulation of ->state for shared buffers is through
! 		 * atomics.
! 		 */
! 		state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				pg_atomic_fetch_and_u32(&buf->state,
+ 										~BM_PIN_COUNT_WAITER);
+ 				UnlockBufHdr(buf);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(volatile BufferDesc *buf, bo
*** 1635,1640 ****
--- 1670,1676 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_to_write;
*************** BufferSync(int flags)
*** 1675,1688 ****
  		volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  			num_to_write++;
  		}
  
--- 1711,1725 ----
  		volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header lock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
! 			pg_atomic_fetch_or_u32(&bufHdr->state,
! 								   BM_CHECKPOINT_NEEDED);
  			num_to_write++;
  		}
  
*************** BufferSync(int flags)
*** 1721,1727 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
--- 1758,1764 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2081,2086 ****
--- 2118,2124 ----
  {
  	volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  
  	ReservePrivateRefCountEntry();
  
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2093,2102 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
--- 2131,2143 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2104,2110 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2145,2151 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2256,2261 ****
--- 2297,2303 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2273,2284 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2315,2327 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** BufferGetBlockNumber(Buffer buffer)
*** 2333,2339 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	return bufHdr->tag.blockNum;
  }
  
--- 2376,2382 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	return bufHdr->tag.blockNum;
  }
  
*************** BufferGetTag(Buffer buffer, RelFileNode 
*** 2356,2362 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
--- 2399,2405 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
*************** FlushBuffer(volatile BufferDesc *buf, SM
*** 2424,2430 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
  	UnlockBufHdr(buf);
  
  	/*
--- 2467,2473 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	pg_atomic_fetch_and_u32(&buf->state, ~BM_JUST_DIRTIED);
  	UnlockBufHdr(buf);
  
  	/*
*************** FlushBuffer(volatile BufferDesc *buf, SM
*** 2444,2450 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2487,2493 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2532,2544 ****
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header spinlock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2575,2587 ----
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header lock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2638,2644 ****
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2681,2687 ----
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2736,2742 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2779,2785 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropDatabaseBuffers(Oid dbid)
*** 2778,2784 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2821,2827 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** FlushRelationBuffers(Relation rel)
*** 2874,2880 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 2917,2924 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 2895,2901 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 2939,2945 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 2923,2929 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
--- 2967,2974 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 2975,2981 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
--- 3020,3027 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3086,3104 ****
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3132,3151 ----
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring header lock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3109,3115 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3156,3162 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3149,3156 ****
  		}
  
  		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3196,3207 ----
  		}
  
  		LockBufHdr(bufHdr);
! 
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3170,3176 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
--- 3221,3229 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
! 
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
*************** UnlockBuffers(void)
*** 3208,3216 ****
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
  		UnlockBufHdr(buf);
  
--- 3261,3269 ----
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&buf->state, ~BM_PIN_COUNT_WAITER);
  
  		UnlockBufHdr(buf);
  
*************** LockBufferForCleanup(Buffer buffer)
*** 3304,3328 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
--- 3357,3384 ----
  
  	for (;;)
  	{
+ 		int		state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_PIN_COUNT_WAITER);
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
*************** LockBufferForCleanup(Buffer buffer)
*** 3349,3357 ****
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
--- 3405,3413 ----
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((pg_atomic_read_u32(&bufHdr->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_PIN_COUNT_WAITER);
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
*************** bool
*** 3393,3414 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	volatile BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3449,3474 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	volatile BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3416,3424 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3476,3486 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(volatile BufferDesc *buf)
*** 3456,3472 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
! 		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
  		LWLockRelease(buf->io_in_progress_lock);
--- 3518,3534 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
! 		 * It may not be necessary to acquire the header lock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
  		LWLockRelease(buf->io_in_progress_lock);
*************** WaitIO(volatile BufferDesc *buf)
*** 3494,3499 ****
--- 3556,3563 ----
  static bool
  StartBufferIO(volatile BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(volatile BufferDesc *buf, 
*** 3504,3512 ****
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3568,3576 ----
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(volatile BufferDesc *buf, 
*** 3522,3528 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3586,3592 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(volatile BufferDesc *buf, 
*** 3530,3536 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
  
  	UnlockBufHdr(buf);
  
--- 3594,3600 ----
  		return false;
  	}
  
! 	pg_atomic_fetch_or_u32(&buf->state, BM_IO_IN_PROGRESS);
  
  	UnlockBufHdr(buf);
  
*************** static void
*** 3561,3575 ****
  TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
  				  int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
  
--- 3625,3643 ----
  TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
  				  int set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
! 
! 	pg_atomic_fetch_and_u32(&buf->state, ~(BM_IO_IN_PROGRESS | BM_IO_ERROR));
! 	if (clear_dirty && !(pg_atomic_read_u32(&buf->state) & BM_JUST_DIRTIED))
! 		pg_atomic_fetch_and_u32(&buf->state, ~(BM_DIRTY | BM_CHECKPOINT_NEEDED));
! 
! 	pg_atomic_fetch_or_u32(&buf->state, set_flag_bits);
  
  	UnlockBufHdr(buf);
  
*************** AbortBufferIO(void)
*** 3594,3599 ****
--- 3662,3668 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3602,3627 ****
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
--- 3671,3694 ----
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without header lock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
*************** shared_buffer_write_error_callback(void 
*** 3645,3651 ****
  {
  	volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the spinlock */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
--- 3712,3718 ----
  {
  	volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the header */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
*************** rnode_comparator(const void *p1, const v
*** 3701,3703 ****
--- 3768,3801 ----
  	else
  		return 0;
  }
+ 
+ uint32
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	uint32 state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (state & BM_LOCKED)
+ 		{
+ 			pg_spin_delay();
+ 			state = pg_atomic_read_u32(&desc->state);
+ 
+ 			/* Add exponential backoff? Should seldom be contended, though. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ 	return state | BM_LOCKED;
+ }
+ 
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+ }
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index bc2c773..ba5f493
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				volatile BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				volatile BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  volatile BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	volatile BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  volatile BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	volatile BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,289 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
  			UnlockBufHdr(buf);
--- 282,294 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  			UnlockBufHdr(buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static volatile BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	volatile BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 594,604 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static volatile BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	volatile BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,625 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
  	UnlockBufHdr(buf);
--- 626,637 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
  	UnlockBufHdr(buf);
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 3144afe..c62a6f2
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index 521ee1c..5745bfc
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 20,48 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 20,62 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * State is:
+  * 10 bit flags
+  * 4 bit usage count
+  * 18 bit refcount
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 137,148 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint16		usage_count;	/* usage counter for clock sweep code */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
! 	slock_t		buf_hdr_lock;	/* protects the above fields */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
--- 151,161 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
! 	/* state of the buffer, containing flags, refcount and usagecount */
! 	pg_atomic_uint32 state;
! 
! 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
*************** typedef union BufferDescPadded
*** 192,207 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
!  *
!  * Note: as a general coding rule, if you are using these then you probably
!  * need to be using a volatile-qualified pointer to the buffer header, to
!  * ensure that the compiler doesn't rearrange accesses to the header to
!  * occur before or after the spinlock is acquired/released.
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /* in buf_init.c */
--- 205,215 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers!
   */
! extern uint32 LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /* in buf_init.c */
*************** extern BufferDesc *LocalBufferDescriptor
*** 216,222 ****
   */
  
  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 volatile BufferDesc *buf);
--- 224,231 ----
   */
  
  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 volatile BufferDesc *buf);
#23Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#22)
3 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Oct 29, 2015 at 8:18 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

On Thu, Sep 24, 2015 at 6:36 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

On Thu, Sep 24, 2015 at 6:32 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-09-15 20:16:10 +0300, YUriy Zhuravlev wrote:

We will run tests.

Did you have a chance to run some benchmarks?

Yes, we now have a 60-physical-core Intel server and we're running
benchmarks on it.

We reached a consensus with Andres that we should commit the CAS version first
and look into other optimizations.
A refactored version of the atomic state patch is attached. The changes are
as follows:
1) Macros are used to access refcount and usagecount.
2) likely/unlikely were removed. I think introducing likely/unlikely should
be a separate patch since it touches portability. Also, I didn't see any
performance effect from it.
3) LockBufHdr returns the state after taking the lock. Even without atomic
increments, this can still save some loops by skipping an extra atomic read
of the state; see the sketch below.
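To illustrate points 1 and 3 together, here is a minimal standalone sketch
(not part of either patch; it only reuses the BUF_STATE_GET_* masks that the
patch adds to buf_internals.h). The idea is that a caller can decode refcount
and usagecount straight from the state word LockBufHdr returned, with no
second pg_atomic_read_u32():

#include <stdint.h>
#include <stdio.h>

/* mirrors the definitions the patch adds to buf_internals.h */
#define BUF_REFCOUNT_MASK		((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK		0x003C0000U
#define BUF_USAGECOUNT_SHIFT	18
#define BUF_STATE_GET_REFCOUNT(state)	((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) \
	(((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)

int
main(void)
{
	/* pretend this is the state word just returned by LockBufHdr() */
	uint32_t	state = (3U << BUF_USAGECOUNT_SHIFT) | 7U;

	/* decode the returned word instead of re-reading the atomic */
	printf("refcount=%u usagecount=%u\n",
		   (unsigned) BUF_STATE_GET_REFCOUNT(state),
		   (unsigned) BUF_STATE_GET_USAGECOUNT(state));
	return 0;
}

Compiled standalone this prints "refcount=7 usagecount=3"; in the patch the
same macros are applied to the value returned by LockBufHdr in
StrategyGetBuffer and GetBufferFromRing.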

pinunpin-cas-original-fix.patch is just the original patch by Andres Freund
with a fix for the bug that caused a hang.
A performance comparison on a 72-core Intel server is attached. On this
machine we see no regression with the version of the patch from the previous
letter.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-comparison.csvtext/csv; charset=US-ASCII; name=pinunpin-comparison.csvDownload
pinunpin-comparison.pngimage/png; name=pinunpin-comparison.pngDownload
��$0GoG�-l�i!�Is�v$���� H���N,�n���L�n��Y�6�lw���I����� �v�����d?i� �h ��pO��"K���(�������� �)as�!$�������6'B��,+IA1#�$[X����$g�,�kg�<y����~��Y�2�l
��I����m ������d�H� �� ���O����!�U��Z�����d_0Z��l}	}II��� �N��$ ��@�m+����;z�Z��:��!���7�w$��JD�-�zG�M��@H�������=�����P�vV���z����UBI��A]#�#�P����5�lXA�@
���E��������2d�M��[���2����$�~x�d��@�	H2c����d�j��ki��E��B�`��r�Y�6E�\�H�s%M)!$9%l�=�$;W��B�SF����!�$S��y{jg���o���vH�:��~�;#�$�����d��X�Hr`h�kI��d@�7�d��mC�������O����(r7CE���k�Y!��s�~$�������
5�� `$��z�M�ti������]|Fz���-�;'��li�JIN
��7#���6������� � ������fL���[�d��UeL��Xjm@��d�`@H�E0 $��"hE IDAT�lH!�@��4������6��jT������k��5}�@Ifh($�q��d�A���X� `?$������*!�F�%�������!�ldY"	
I�;�B����+�`���vr�������2���,�s��"�I#s�$���&���42g@��--�A1"�$[P�������*K��seb��-�8!"���sYY"�e�����x��<Y"����=��&�$�]���i��!^��-]Zgx��	I�O�K�If($�q� �$3 �OI6��/�=S�,�}6��;���j��8>�!���5�L��"�$�E(>_G��Sk2��%�$�\�������df��rtA������hc����03��f�E	 ��@�@�
�a���7�a�\vB���_hHr�j^\�H2�If ��@n@�
���1g���������r_c��a�B�
+HD� ��7�[�[V��a&9B�t
@�'H�O }o��R��2������{4�$9=~�<�$�R���@������H�K�$@ ���������A�$//Oj��-}���f�����3g����{N��^{�%�]w�u�Q�}]�v�;v��Q�v{n�����cGi����v�m������J��Ox�8�E��Q�'$GIN���w#��V6�����x�|7��ru�
��$���z����W^)s��������J�*��S���O����O���+���'JFF�'�+V�=zH��M�{���d��i��A�P$y��62d�J���~2���q�V��$[U���E�CkU�H�U�
4X$9P�4@ �H��]��]�v2a��B)���;��3��O<�T�\p��7Nj���I�i��&�V���={>��woi���,X�`7I�Yg����K����������W��{����@`��-R�re�C����}����=?��/�xf^PP ��I����	��	����U�J�
�s�*qlg���[L�I^�z����������3�HVV��L��k���r�]w���#�[T�o��v����<������)+V��{���k�����M�7m��{�W4D�[������ g7=��>h0};w��YW p�����^�B�1���%:��?3��M��
����u�����3��l�2������:q�;V�X]~]��?�n��f���)�mVI8p�<�������'���w�Z��N�:2k����[�|��<Tu��qy��+v��D�r��)����bp��cP�r��r�r��6@pF���Y#��w�����6l�'�^x�oJ�������,_|q������{�w�}�t��E�"_|�E��������K��� c/�5x��;4$9��Od�$3���8H@���'��$���o�^����[���B}��gK���w���;x�`O���������d]�}���zx��1C��f]j�Lr�����p�T)���O�)'�������dG�dZHr���Iv��)��$��G F�IV�=��'��:u�v����z�S�j������/������n��B������u�z��qQz���v��Xj������k�6\�($����d��@�	H2c�����d�DK�I�3g�7������7��877W���~oS���;��;�����E%Y�\�l���.�E�Z��y�k_����;72@���c�Y ��t�y$��:����E�� -�$9Z������M���Uo������i������ �����a$��r,�Z�F��+C�
$��Aq��#d|�a��AE)%$��:%�4a;�G���SQ"�aP�@�@���[��W|-�/w���La�u��Ey'�%}s�F���E�� �Q�7�o$��z
 �T ��P�����$g�,�\PA�f�b�u��nI�����!�v�����d�����lo���@��l�X���#dB��w����uc��0J#�$3>���8PH2� AIf,@���lB
�,�����r����1�AIf� ���$���$3 �CI6�����Y;+��vH�:����"�$�E(_g&9u.+K$�,B��:3���5�B�@�
�mb�u�-�����2 "B($�<���Iv����I.�x��$���d	�MI���,���)��$�����d���r2Hr���{Iv��$���q�Yjq��IN�C�"�3�T��4�9�(��XAI�%$9��'�Z7�r����g#����!�$'C��{�dwk�LfHr2���Iv��d���e��� �S�%?3S��:�B��7��7��B�]�hj� ��qs�)$���� 7Hr��{�n�Y<I���)����]�#�E*]#��Ps�$�������
57�A���+YA�"�$GX�{�<N���Cm=X^���#��T ��Ps�$�������
57�A���+YA�"�$GU�u?H����W���_��ro��QEB�)@�S��cH�cM1$9Ep>�$;XTR�bGI�����J��qR� CFg�/M�gE	��JIN��[�!�n�3�l��T�����^M��$9��?1��24s�4�|�L�������t ���s�Y$��Z��	��=��E���'�@�$�$GQ���%glo����w��r��"
�L���&@GG�)d�i �it�q$��b�
 [Hr���x�d�K�#`�g�H��4�mI��v~F�$�I����d��G�����8xfTy$c��s}����"�K? �~P��
$������E7�@���#Y@�&�$GP��O7�y�+��U.��/�5���H��oI���~d�$�A��6�d7�H�@�	 �!���������^G���@��#�;� �~���$����=��I��A���!@@�C��] �����e����C����$�$�I����d{k�g�H��4�nI��~D@@	 �!���O4�w�Wf�u����I��}m"���,���� ���&�lg���@QHr��a������^��}Q�r��!�NW~@��&jg{H��u�;j$�o����$�[;"� � �$�8��t����k9��Z�B���3]AI��}m"���,���� ���&�lg���@QHr������dV�
rr�2��{B���� �$A��6�d�jD�HrT�lI��nD
@I�`l\���z�*��~G���;:�(��OH��4�mI��v~F�$�I����d��G���3�!���R�z�+������+�II��=m#���*�H�� ���6�lW���@q����E�'������?���:?R�t$$9H����$�S� #E���kW�H�]�"Z@HrTc`�����dc�
,�����$��&�d�@�HrP-mI��p�
@�f�C/��[��z���:�!v�$�����d��bhHr��
�
I6�@�@���r@J���R���[�������BI6���$G\C�G�
)�a �� �IIN`��o]/�����Z�x�?���O)�n���lG���I���#�v�)�(��0(� �`	 �������]�.}Q�o��I�},Y�*�#��EI���� �f�'�����H���l~���@Y��������[���
r���dt�)i���&@�M�Ft� ���7�g$��jD�-z� �$��%��u�d?{��gf�u���N=+��h;dHr��
�I6�0!��$�����d��Ch��II.'�Tn����r������
2��OXj�
D��A�
.N��!�!�6�+$�����2p�� $9��&��<]�V�QN��X�w�`O4$9
����$�W�("B���nf�H��u!*@�@������m�7����W�;�K�I>���@�M�P8�!��p6�$��
��kz� $9 �_|4L.�����SdL���a�j�D�Q@��"oV�H�Y��*$9*����$�W"� �,$9Yb��?�������k���)n���lS���I��M-#�6U+�X��`��: �0 �QN,���v'���>�B�Q@���oN�H�9��2$9J�f��$�U�� �
$9je<��h�d��C���)�f�u�MhI6�
���$G_"@�M��1 �f��( �CIN�^	�>:���t�9vs]s���@�&@�M�B�1 ������d�`FH�u 
@�@���W����9B�W��Z��&�d��a(Hr��
�I6���$G\�� �$��E�H,��QP �/����>�5�9$��jD�{�zF�M�F�� ����w@~@���X��'v�Q[?�6g��k?��u�3��lR5��I���I=#�&U#�X��h��; ? �~P,���%�U�%�U:Gn���[�9� �&U#�X������3�lR5��I��?�C�����_����D�gt��o�i������u�2��lZE��I���i�"��U$�x�����3 � �~��S�&C6} M�T������i�DH��U	?&$9|�&��$�X�hbB���N���$�$�H�����GU�K�]���N�|l��L$�$�X��cB��gnb�H��U�&&$9��
@�OH�O47nX&�^8�kmt��r�!M|j�fL%�$�Z�p�B���mjoH���	?.$9|��@�oH�ODGO�#���!Gm�"�^��O�����d��^lHrx�M�	I6�:���$���� AI��j��-dv��,����
� �6T)�������lC���I�3�@���]�Z���&�d�@�HrP-lI��h��$�f!�HI����n��VN�F;D^��kZ�	 �6T)�������lC���I�3�@����nO�"�*���w!��C�4a$��*#�<cz@�m�R81"��p�@A@�}��r���B�8d���u�-��
�d�|�Hr��m�I��J���$���^ IIN������~K�K��"/ue�u�8�zI��\��$�����d��h�Hr�xi�@(��41�q�����t�z��q��4[�q� �6U+�X������2�lS���I�/�C���&��R���K��S�4[�q� �6U+�X������2�lS���I�/�C���&�#G)m/��]�����FI��b���$���V�d�*\�Hrpli�@X��4I�$����<p�ki����@�m�X0�"��p��U$���/�[Z� $9M�*�w����k{M�-��m�d�*L�Hr0\mkI��b���$���!�EIN��������L����lc���I����-"�6V-����`��* �0	 �a��/� �N�3�d����9� ��T9�JINC0��lD�FH��U�?f$��6��$�X�`bF���J���$�$�I���"�$;U���A�SF���H�S�L+$9-|<@�H�e 	 �6V����d����"�lc���I�+�B��\��s��������~��:Hn��V�[�n�5�/C	 ��&��������lha"I�:]B���\
����������C�?�xy�����>�{����2����d���H��LmlI��j���$��V!�II.�v^^�<����#�xw���K:v�(�F��5jx��o�0�E_�_�+T� EE(a������_��A�'dff��{v��)+V�g�A���@���\J��z�-o���_����{wi�����_|ai�	;]�7o�*U�x�q��������t,p���~X��
�W�_d���
U�V��3��s�1P�,%�$�R�W^yE,X�-�N\7�x�\v�e|��t��6����io[,���v~F�rk?i�������CPHr)�������?�Xn��������Z����~�����@�c>~MIf($�q� �$3 �OI.������!C����C�����:��1c�V�Z�W��"�$������dgJ�V"HrZ��zIv��$���\J��=�.]�H�n�
w������F1���g�H2A	 ��f�E	 ��@�@������e��A���?��-��W����'��	 �i#t�$��2��3�i#t�$��R� cHr��O��@������H�+�L/$9=~.=�$�TMr��JI�k��;mHr��hIv��i'�$�����dgJI"�@�	 �1.>��GIN��+O#��T2�<������4��R5��+$9��'��	 �i#t�$��2����6Bg@��)%�@1&�$�����$9=~�<�$�R���@������H�K�$@ ����V���&�$�����d'��vHr��iIv��$������zz������4��J%��IN��KO#�.U�\ ��@��Zy�N���6B'@��(c�I �i#t�$��R� cHr��O��@������H�+�L/$9=~.=�$�TMr��JI�k��;mHr��hIv��i'�$�����dgJI"�@�	 �1.>�C� @����dF @� @�WH2C� @� �$3 @� @�`&�@� @`&�1����k��}�����Y��m��&'�x����x�	y����R�Jr�%��y��W����
[�l��z�����
c.������A�I^^���][���#��5�"_�,��G}$��w�<��Cr��{7���A���o��Q�d���r�I'I�=�B�
`�����3������z�������N�:�(/��s��������~��:Hn��V�[�n�_�E���?�<����h�"�Z����������1�3g��=_'H\&L�5j?b=lH�2�$[V0�
���t�
7�3�<���_}�U�_��������>����C
?Pz����U������iSY�z�n�\Z�x��W��\y���/�����K�*U|���!0y�d�����=���w�$���a���r�M7��!Cd����~�t��!����w*���~����~��W_y�'N��}X��S'�C���?^^x�������
%~��i0��W�
�?�C�������F�����/���\������
���N �$��4�2�%K����>*���?~���(�k�N�7o�}M1�����k�qIlr��y�7S�c���Eg�K���W_��	�9HH��w�)g�y�����N_|�����7���B��I.������?�,���^�.�f�j'��
�.�@��'����*z��G�{T�;v���"�1R��tv��~w�q��m�VZ�n-��O�u���e�]�[b:&��`�� >�����LS$0o�<o6H�����/��E��^]b�3���Vg��={�����r�������H@g��N���$�T{]M��W/���������_���&�����$���A�eq��/�z��k�%Y����������	#G����z��9���oab:N�w��	tI_k���� b����;��~�a�g�}d��I2e��\���{�����-���H�l��`!}� �V��`�"�3���W�M�6y������&]t�E2l�0��4�t����/�\�(N�K�����K�uyu�;v���R������������A�_�j�JN=����U�_�u����D��W1�U�.��y��Wd���r��u��7z�����%~��c��9I�����:+��{�~ �����Yf����qc~6�_n2�$9F�&U���.�=z����n�S�~}�q}gQg��I���)�'�%�^�W�����������SG.��BSR"�	'�E�*��A�E�w��:�,��i]M�Lr��
yL����x]Mp��{Q������r�-�Q^{����gOO�J���.{	�
#���w�-+V,6��_~���BE��
�����GI�_��8M���'�O>���~��r�9��k��������IDAT�{K��u��f/<n��$����/�����1c�H��5�4�C�!]�W���@Y�\���n��f���&�EYW���.;	� �{����UW]U���v��$�7�Wst�6�>�b���V�V-;A��"��7��6o+mS�i��yK�������]�d��E��_~t9��k�;��^�lJ���{h��2�����r�#�<2�H�2(�Iri�W�_�u�Y�H�_�2|]��e7�=%���+W��v�����[������)�7��{�[F��k�� �<w�����4������o��Ei_�/I�3�S-t�}q��;[�����^����}C�
�g��u'�x@��Uo�M��n����S��\��X�/�eS�s�uS��;�:s��L)@6���$YC.�����n���e�l�������T	�{JrY���i=Ou��m��?��7���:��s����������d��\���	p���Z�������C�H��=+����=^?0��
tO�t�����M�������=�d{jE�� @� 0$9`�4@� @�@����B� @�@�����< @� �CI��VD
@� @@�L�� @� `$��Z) @� LI0�C� @��=�d{jE�� @� 0$9`�4@� @�@����B� @�@�����< @� �CI��VD
@� @@�L�� @� `$��Z) @� LI0�C� @��=�d{jE�� @� 0$9`�4@� @�@����B� @�@�����< @� �CI��VD
@� @@�L�� @� `$��Z) @� LI0�C� @��=�d{jE�� @� 0$9`�4@� @�@����B� @�@�����< <����u�V���g����;Wz�!Y�t��z��r�M7�+�?�P�z�)1bD���&@� �
$9j<@ I���?D%��g��:u���t��������>8�V��I�|���K�V��C�����U�\��)�*���_�e���z���f��!�7o����/e���!@p���pqI
0��J�{��'������H��m��G��CQ	2d�t�AI���$����r��GJ����j������^9��#�d_h� @�^H���#r@�"*�������_��o�QN9������$�\�R.��2y����b�����q�2z�hOuV�R�J�l�2�7o����~�����L��
�J�*��{w9��c�6��M�6��;��O>�_~�E��k�������W�#�<"������^r�9��E]$�e����K�����8P���/�?��oDw���2|�py�����?�p/�����>(yyyR�fM�P���5�73��V���������[�h!����������u�&999r�i�y9��3�{�����8u��EN<�D�k��K/�$M�6�1Vz�
7����������L�\���m��c��+����EcR&g�y�\q��<B� @ YHr����@
T�9�OH����dVV��R*����/���3�*��^{�����2x�`O��|�MO������J�����O�>��?���������>]�����|���K���e��5r�wx������3�*�z���*{���'�E/���/���3�e�*�&L��z����g�}�:T5j���v�]���aCOF�}���C���~�zO�5���;�f�O�_�QI�|������c�1�
:�������Iub���%K��w����_���=�o��Y
#�G @���lK�����
m���=�T����~����*�,�$9q����N_z���������+�?��'�*�������+|��w���u����T4����5m�4y��w=ITI>����w��r�Yg[�EV���u9q�3'�|���pY����_{2�b���E��f��{�9Y�b�_����{���d[�Ui����v�~����S�uFZ%Yg���o?���C� P~Hr�Yq' ��	������d����W_-��v���T*3����o�������y���-N\{.�^�n��|���_O�@���W_}�[r\tfXguUuItB�K����U���zK���3l�0o��^�z�)�/���������Wi����������+g}V���7n��u���[�$���L�.�>����]�vI�G����A@�"#�$G���!�8(*����^N�4�[v��u���n]�;���o��<}'y���T�/������$y���r�-�~}���r�u�y��r��A�Hw%$Y��n���������m�;��K%Y����*m��������*�{^�I�����]�v-6�����x��dmL����/�������_��K�� ��@�cWr�� ��$�,�������Z��$��.��:uj�;�*s*�J�W_}�ml��ts�g�}�;wX�9��o�9�M����#���[c����\n��uk�keI�_|�-�V�
��^�I����=��hm�<��3��NvbY��t�2}o[?�w�� @p���f]�
0��������P����H}���s���b���w�uYw������d�
��^�m���=���u�^*�*��~s�����~��mn��y�r��f�������q��8����/��'��������;L���Jo�3���c�)s�.�_���e���Z�9�
����H����3����~���O?y}��h����������� w	 ����� �'��nP�K�uI��$����:��;)�l��&���k%��Je����n�5k�,O>�%��U)�8>��c���z��I�N��M��3�����R*�������I�&��f�
;q�&�z�����e��K�����*�e�R��l����h=vJ�x��]��d��.C�uFY?���|��>���R�@����dwkKf� @� �$$9I`�@� @�@���-�A� @�@���$�q; @� �KIv��d@� @I@����� @� �.$���� @� $IIN�C� @����dwkKf� @� �$$9I`�@� @�@���-�A� @�@���$�q; @� �KIv��d@� @I@����� @� �.$���� @� $IIN�C� @����dwkKf� @� �$$9I`�@� @�@���-�A� @�@���$�q; @� �KIv��d@� @I@����� @� �.$���� @� $IIN�C� @����dwkKf� @� �$$9I`�@� @�@���-�A� @�@���$�q; @� �K��i���rPIEND�B`�
pinunpin-cas-original-fix.patchapplication/octet-stream; name=pinunpin-cas-original-fix.patchDownload
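For orientation before the diff: this version of the patch collapses the separate flags, usage_count and refcount fields into a single pg_atomic_uint32 "state" (18-bit refcount, 4-bit usage count, 10-bit flags), so pinning becomes one compare-and-swap instead of a spinlock round-trip. Below is a minimal standalone sketch of that pin fast path — an illustration, not code from the patch — using GCC/Clang atomic builtins in place of pg_atomic_*; the values of BM_LOCKED and BM_VALID are assumptions, since the hunk that redefines the relocated flag bits is truncated at the end of the diff.

#include <stdint.h>
#include <stdbool.h>

/* Layout as defined in the patch's buf_internals.h hunk. */
#define BUF_REFCOUNT_MASK    ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK  0x003C0000U
#define BUF_USAGECOUNT_ONE   (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BM_LOCKED            (1U << 22)   /* header lock bit (assumed value) */
#define BM_VALID             (1U << 24)   /* data is valid (assumed value) */
#define BM_MAX_USAGE_COUNT   5

/* Pin one buffer; returns true if it held valid data when pinned. */
static bool
pin_buffer(uint32_t *state_ptr)
{
    uint32_t oldstate = __atomic_load_n(state_ptr, __ATOMIC_RELAXED);

    for (;;)
    {
        uint32_t state = oldstate;

        /* spin until the header lock bit is clear */
        while (state & BM_LOCKED)
        {
            /* pg_spin_delay() in the patch; a plain reload here */
            state = __atomic_load_n(state_ptr, __ATOMIC_RELAXED);
        }
        oldstate = state;

        state += 1;                              /* refcount++ */
        if (((state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
            != BM_MAX_USAGE_COUNT)
            state += BUF_USAGECOUNT_ONE;         /* usage_count++ */

        /* one CAS publishes both counters; retry if someone raced us */
        if (__atomic_compare_exchange_n(state_ptr, &oldstate, state,
                                        false, __ATOMIC_SEQ_CST,
                                        __ATOMIC_SEQ_CST))
            return (state & BM_VALID) != 0;
        /* on failure oldstate now holds the current value; loop again */
    }
}

The same layout is what the new LockBufHdr/UnlockBufHdr at the end of the bufmgr.c hunks rely on: the header lock is just one bit of the same word, taken by CAS and released with a single pg_atomic_fetch_sub_u32 of BM_LOCKED.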
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index 3ae2848..3e70792
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 95,106 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 95,103 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 8c0358e..ec96bc3
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 51,56 ****
--- 51,58 ----
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
+ #define likely(x)       __builtin_expect((x),1)
+ #define unlikely(x)     __builtin_expect((x),0)
  
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 774,782 ****
  		 */
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 776,788 ----
  		 */
  		if (isLocalBuf)
  		{
+ 			uint32		state;
+ 
+ 			state = pg_atomic_read_u32(&bufHdr->state);
  			/* Only need to adjust flags */
! 			Assert(state & BM_VALID);
! 			state &= ~BM_VALID;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 788,795 ****
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
--- 794,801 ----
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 807,813 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 813,819 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 885,891 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 891,897 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 939,945 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	volatile BufferDesc *buf;
  	bool		valid;
--- 945,951 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	volatile BufferDesc *buf;
  	bool		valid;
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1013,1022 ****
  		 */
  		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
--- 1019,1028 ----
  		 */
  		buf = StrategyGetBuffer(strategy);
  
! 		Assert((pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = pg_atomic_read_u32(&buf->state) & BUF_FLAG_MASK;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1210,1217 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1216,1224 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = pg_atomic_read_u32(&buf->state) & BUF_FLAG_MASK;
! 		if ((pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) == 1 &&
! 			!(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1232,1243 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
  	UnlockBufHdr(buf);
  
--- 1239,1257 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	pg_atomic_fetch_and_u32(&buf->state,
! 							~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 							  BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
! 							  BM_PERMANENT |
! 							  BUF_USAGECOUNT_MASK));
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID | BM_PERMANENT |
! 							   BUF_USAGECOUNT_ONE);
  	else
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID |
! 							   BUF_USAGECOUNT_ONE);
  
  	UnlockBufHdr(buf);
  
*************** InvalidateBuffer(volatile BufferDesc *bu
*** 1286,1292 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
--- 1300,1306 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
*************** retry:
*** 1329,1335 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1343,1349 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if ((pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1344,1353 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
  
  	UnlockBufHdr(buf);
  
--- 1358,1366 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = pg_atomic_read_u32(&buf->state) & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	pg_atomic_fetch_and_u32(&buf->state, BM_LOCKED | ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK));
  
  	UnlockBufHdr(buf);
  
*************** MarkBufferDirty(Buffer buffer)
*** 1399,1410 ****
  
  	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1412,1423 ----
  
  	LockBufHdr(bufHdr);
  
! 	Assert((pg_atomic_read_u32(&bufHdr->state) & BUF_REFCOUNT_MASK) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1412,1418 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
--- 1425,1432 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	pg_atomic_fetch_or_u32(&bufHdr->state,
! 						   BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
*************** PinBuffer(volatile BufferDesc *buf, Buff
*** 1495,1517 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
- 		result = (buf->flags & BM_VALID) != 0;
- 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1509,1548 ----
  
  	if (ref == NULL)
  	{
+ 		uint32 state;
+ 		uint32 oldstate;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (unlikely(state & BM_LOCKED))
! 			{
! 				pg_spin_delay();
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += 1;
! 
! 			/* increase usagecount unless already max */
! 			if (((state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			result = (state & BM_VALID) != 0;
! 
! 			if (likely(pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state)))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
  	}
  	else
  	{
*************** PinBuffer_Locked(volatile BufferDesc *bu
*** 1558,1564 ****
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
--- 1589,1595 ----
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	pg_atomic_fetch_add_u32(&buf->state, 1);
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
*************** UnpinBuffer(volatile BufferDesc *buf, bo
*** 1594,1623 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
  
! 		LockBufHdr(buf);
! 
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1625,1665 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32 state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
! 		 * currently all manipulation of ->state for shared buffers is through
! 		 * atomics.
! 		 */
! 		state = pg_atomic_fetch_sub_u32(&buf->state, 1);
! 		Assert((state & BUF_REFCOUNT_MASK) > 0);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			LockBufHdr(buf);
  
! 			if (pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER &&
! 				(pg_atomic_read_u32(&buf->state) & BUF_REFCOUNT_MASK) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				pg_atomic_fetch_and_u32(&buf->state,
+ 										~BM_PIN_COUNT_WAITER);
+ 				UnlockBufHdr(buf);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** BufferSync(int flags)
*** 1680,1688 ****
  		 */
  		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  			num_to_write++;
  		}
  
--- 1722,1731 ----
  		 */
  		LockBufHdr(bufHdr);
  
! 		if ((pg_atomic_read_u32(&bufHdr->state) & mask) == mask)
  		{
! 			pg_atomic_fetch_or_u32(&bufHdr->state,
! 								   BM_CHECKPOINT_NEEDED);
  			num_to_write++;
  		}
  
*************** BufferSync(int flags)
*** 1721,1727 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
--- 1764,1770 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2081,2086 ****
--- 2124,2130 ----
  {
  	volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  
  	ReservePrivateRefCountEntry();
  
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2095,2101 ****
  	 */
  	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
--- 2139,2148 ----
  	 */
  	LockBufHdr(bufHdr);
  
! 	state = pg_atomic_read_u32(&bufHdr->state);
! 
! 	if ((state & BUF_REFCOUNT_MASK) == 0 &&
! 		(state & BUF_USAGECOUNT_MASK) == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2104,2110 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2151,2157 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2256,2261 ****
--- 2303,2309 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2273,2284 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2321,2333 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 state & BUF_REFCOUNT_MASK, loccount);
  	pfree(path);
  }
  
*************** FlushBuffer(volatile BufferDesc *buf, SM
*** 2424,2430 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
  	UnlockBufHdr(buf);
  
  	/*
--- 2473,2479 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	pg_atomic_fetch_and_u32(&buf->state, ~BM_JUST_DIRTIED);
  	UnlockBufHdr(buf);
  
  	/*
*************** FlushBuffer(volatile BufferDesc *buf, SM
*** 2444,2450 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2493,2499 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2538,2544 ****
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2587,2593 ----
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** FlushRelationBuffers(Relation rel)
*** 2874,2880 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 2923,2930 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 2895,2901 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 2945,2951 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 2923,2929 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
--- 2973,2980 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 2975,2981 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
--- 3026,3033 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3093,3104 ****
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3145,3157 ----
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3109,3115 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3162,3168 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3149,3156 ****
  		}
  
  		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3202,3213 ----
  		}
  
  		LockBufHdr(bufHdr);
! 
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		Assert((state & BUF_REFCOUNT_MASK) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3170,3176 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
--- 3227,3235 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
! 
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
*************** UnlockBuffers(void)
*** 3208,3216 ****
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
  		UnlockBufHdr(buf);
  
--- 3267,3275 ----
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&buf->state, ~BM_PIN_COUNT_WAITER);
  
  		UnlockBufHdr(buf);
  
*************** LockBufferForCleanup(Buffer buffer)
*** 3304,3328 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
  		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
--- 3363,3392 ----
  
  	for (;;)
  	{
+ 		int		state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
  		LockBufHdr(bufHdr);
! 
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		Assert((state & BUF_REFCOUNT_MASK) > 0);
! 		if ((state & BUF_REFCOUNT_MASK) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_PIN_COUNT_WAITER);
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
*************** LockBufferForCleanup(Buffer buffer)
*** 3349,3357 ****
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
--- 3413,3421 ----
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((pg_atomic_read_u32(&bufHdr->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_PIN_COUNT_WAITER);
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
*************** bool
*** 3393,3414 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	volatile BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3457,3481 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	volatile BufferDesc *bufHdr;
+ 	uint32		refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3417,3424 ****
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
  	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3484,3493 ----
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
  	LockBufHdr(bufHdr);
! 
! 	refcount = pg_atomic_read_u32(&bufHdr->state) & BUF_REFCOUNT_MASK;
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(volatile BufferDesc *buf)
*** 3456,3462 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
--- 3525,3531 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
*************** WaitIO(volatile BufferDesc *buf)
*** 3464,3472 ****
  		 * play it safe.
  		 */
  		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
  		LWLockRelease(buf->io_in_progress_lock);
--- 3533,3542 ----
  		 * play it safe.
  		 */
  		LockBufHdr(buf);
! 		state = pg_atomic_read_u32(&buf->state);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
  		LWLockRelease(buf->io_in_progress_lock);
*************** WaitIO(volatile BufferDesc *buf)
*** 3494,3499 ****
--- 3564,3571 ----
  static bool
  StartBufferIO(volatile BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(volatile BufferDesc *buf, 
*** 3506,3512 ****
  
  		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3578,3586 ----
  
  		LockBufHdr(buf);
  
! 		state = pg_atomic_read_u32(&buf->state);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(volatile BufferDesc *buf, 
*** 3522,3528 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3596,3602 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(volatile BufferDesc *buf, 
*** 3530,3536 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
  
  	UnlockBufHdr(buf);
  
--- 3604,3610 ----
  		return false;
  	}
  
! 	pg_atomic_fetch_or_u32(&buf->state, BM_IO_IN_PROGRESS);
  
  	UnlockBufHdr(buf);
  
*************** TerminateBufferIO(volatile BufferDesc *b
*** 3565,3575 ****
  
  	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
  
--- 3639,3651 ----
  
  	LockBufHdr(buf);
  
! 	Assert(pg_atomic_read_u32(&buf->state) & BM_IO_IN_PROGRESS);
! 
! 	pg_atomic_fetch_and_u32(&buf->state, ~(BM_IO_IN_PROGRESS | BM_IO_ERROR));
! 	if (clear_dirty && !(pg_atomic_read_u32(&buf->state) & BM_JUST_DIRTIED))
! 		pg_atomic_fetch_and_u32(&buf->state, ~(BM_DIRTY | BM_CHECKPOINT_NEEDED));
! 
! 	pg_atomic_fetch_or_u32(&buf->state, set_flag_bits);
  
  	UnlockBufHdr(buf);
  
*************** AbortBufferIO(void)
*** 3603,3625 ****
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
  		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
  
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
--- 3679,3702 ----
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
  		LockBufHdr(buf);
! 		Assert(pg_atomic_read_u32(&buf->state) & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(pg_atomic_read_u32(&buf->state) & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(pg_atomic_read_u32(&buf->state) & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			uint32		state;
  
! 			state = pg_atomic_read_u32(&buf->state);
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
*************** rnode_comparator(const void *p1, const v
*** 3701,3703 ****
--- 3778,3810 ----
  	else
  		return 0;
  }
+ 
+ void
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	uint32 state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (unlikely(state & BM_LOCKED))
+ 		{
+ 			pg_spin_delay();
+ 			state = pg_atomic_read_u32(&desc->state);
+ 
+ 			/* Add exponential backoff? Should seldomly be contended tho. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ }
+ 
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_fetch_sub_u32(&desc->state, BM_LOCKED);
+ }
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index bc2c773..3f2227b
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 250,257 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 280,286 ****
  			 * of 8.3, but we'd better check anyway.)
  			 */
  			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
--- 282,290 ----
  			 * of 8.3, but we'd better check anyway.)
  			 */
  			LockBufHdr(buf);
! 			state = pg_atomic_read_u32(&buf->state);
! 			if ((state & BUF_REFCOUNT_MASK) == 0
! 				&& (state & BUF_USAGECOUNT_MASK) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 299,305 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 303,313 ****
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
  		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,322 ----
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
  		LockBufHdr(buf);
! 
! 		state = pg_atomic_read_u32(&buf->state);
! 
! 		if ((state & BUF_REFCOUNT_MASK) == 0)
  		{
! 			if ((state & BUF_USAGECOUNT_MASK) != 0)
  			{
! 				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
! 
  				trycounter = NBuffers;
  			}
  			else
*************** GetBufferFromRing(BufferAccessStrategy s
*** 589,594 ****
--- 598,605 ----
  {
  	volatile BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
+ 	uint32		usagecount;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 617,623 ****
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
  	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
--- 628,637 ----
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
  	LockBufHdr(buf);
! 	state = pg_atomic_read_u32(&buf->state);
! 	usagecount = (state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT;
! 	if ((state & BUF_REFCOUNT_MASK) == 0
! 		&& usagecount <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 3144afe..1e11d71
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,153 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			int	usagecount;
! 
! 			usagecount = (state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT;
! 
! 			if (usagecount < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 179,193 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			int		usagecount;
! 
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 			usagecount = (state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT;
! 
! 			if (usagecount > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 209,215 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 227,234 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 245,251 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 254,261 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 268,278 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 287,293 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
  		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 301,313 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_read_u32(&bufHdr->state);
! 
! 	if (!(state & BM_DIRTY))
  		pgBufferUsage.local_blks_dirtied++;
  
! 	state |= BM_DIRTY;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 331,341 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 354,362 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 377,387 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 398,406 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index 521ee1c..92889e6
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 20,48 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 20,59 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * State is:
+  * 10 bit flags
+  * 4 bit usage count
+  * 18 bit refcount
+  */
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_FLAG_MASK 0xFFC00000U
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 137,148 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint16		usage_count;	/* usage counter for clock sweep code */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
! 	slock_t		buf_hdr_lock;	/* protects the above fields */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
--- 148,158 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
! 	/* state of the tag, containing flags, refcount and usagecount */
! 	pg_atomic_uint32 state;
! 
! 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
*************** typedef union BufferDescPadded
*** 192,207 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
!  *
!  * Note: as a general coding rule, if you are using these then you probably
!  * need to be using a volatile-qualified pointer to the buffer header, to
!  * ensure that the compiler doesn't rearrange accesses to the header to
!  * occur before or after the spinlock is acquired/released.
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /* in buf_init.c */
--- 202,212 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers! FIXUP!
   */
! extern void LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /* in buf_init.c */
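
As a reading aid for the new packed layout: the flags, refcount, and usagecount can all be unpacked from a single atomic read of the state word. Below is a minimal sketch in the spirit of the patch; the accessor macro names and the debug function are illustrative assumptions, not part of the patch itself. Note also that the local-buffer code above can use plain (unlocked) reads and writes of the state, since local buffers are private to a single backend.

/*
 * Illustrative accessors for the packed state word, assuming the bit
 * layout defined above:
 *   bits  0-17  refcount
 *   bits 18-21  usage count
 *   bits 22-31  flags (BM_LOCKED .. BM_PERMANENT)
 */
#define BUF_STATE_GET_REFCOUNT(state)	((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) \
	(((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)

/* Read the flags and both counters consistently with one atomic load. */
static void
debug_print_buffer_state(BufferDesc *bufHdr)
{
	uint32		state = pg_atomic_read_u32(&bufHdr->state);

	elog(DEBUG1, "refcount=%u usagecount=%u dirty=%d valid=%d",
		 BUF_STATE_GET_REFCOUNT(state),
		 BUF_STATE_GET_USAGECOUNT(state),
		 (state & BM_DIRTY) != 0,
		 (state & BM_VALID) != 0);
}
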
#24Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#23)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

Thanks for benchmarking!

On 2015-10-30 16:28:22 +0300, Alexander Korotkov wrote:

pinunpin-cas-original-fix.patch is just the original patch by Andres Freund
with a fix for the bug which caused a hang.
A performance comparison on a 72-core Intel server is attached. On this
machine we see no regression in the version of the patch from the previous letter.

So pinunpin-cas-original-fix is my version with a bug fixed, and
pinunpin-cas is what exactly? Your earlier version with the xadd +
cmpxchg?

The results look pretty good. Could you give a few more details about
the hardware and workload (i.e. cpu model number + scale)?

So the plan would be to finish cleaning this up into a committable
shape?

Greetings,

Andres Freund


#25Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#24)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi!

On Fri, Oct 30, 2015 at 5:12 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-10-30 16:28:22 +0300, Alexander Korotkov wrote:

pinunpin-cas-original-fix.patch is just the original patch by Andres Freund
with a fix for the bug which caused a hang.
A performance comparison on a 72-core Intel server is attached. On this
machine we see no regression in the version of the patch from the previous letter.

So pinunpin-cas-original-fix is my version with a bug fixed, and
pinunpin-cas is what exactly? Your earlier version with the xadd +
cmpxchg?

pinunpin-cas is still just cmpxchg with no xadd. It contains just minor
changes:

The refactored version of the atomic state patch is attached. The changes are
the following:
1) Macros are used for accessing refcount and usagecount.
2) likely/unlikely were removed. I think introducing likely/unlikely
should be a separate patch since it touches portability. Also, I didn't see
any performance effect from it.
3) LockBufHdr returns the state after taking the lock. Even without atomic
increments, this can still save some loops by skipping an extra atomic read
of the state value.

I compared them just to show that there is no regression from these changes.
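
To illustrate point 3 of the quoted changes: below is a minimal sketch of a
header-lock function that returns the state word it installed, so a caller
that needs the flags right after locking can use the returned value instead
of issuing another atomic read. It assumes the BM_LOCKED bit and
pg_atomic_compare_exchange_u32() from the patch upthread; spin-delay
handling is omitted, and this is not the patch's exact code.

static uint32
LockBufHdr(BufferDesc *desc)
{
	uint32		old_state = pg_atomic_read_u32(&desc->state);

	for (;;)
	{
		if (old_state & BM_LOCKED)
		{
			/* Somebody else holds the header lock: re-read and retry. */
			old_state = pg_atomic_read_u32(&desc->state);
			continue;
		}

		/* On failure, CAS updates old_state to the current value. */
		if (pg_atomic_compare_exchange_u32(&desc->state, &old_state,
										   old_state | BM_LOCKED))
			return old_state | BM_LOCKED;
	}
}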

The results look pretty good. Could you give a few more details about
the hardware and workload (i.e. cpu model number + scale)?

It is a 4-socket Intel(R) Xeon(R) CPU E7-8890 v3 @ 2.50GHz with 2 TB of memory;
all data fits in shared_buffers, scale factor 1000, -M prepared, and pgbench
runs on the same machine through a unix socket.

So the plan would be to finish cleaning this up into a committable
shape?

Yes.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#26Jesper Pedersen
jesper.pedersen@redhat.com
In reply to: Alexander Korotkov (#22)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On 10/29/2015 01:18 PM, Alexander Korotkov wrote:

We reached a consensus with Andres that we should commit the CAS version first
and look at other optimizations.
The refactored version of the atomic state patch is attached. The changes are
the following:
1) Macros are used for accessing refcount and usagecount.
2) likely/unlikely were removed. I think introducing likely/unlikely
should be a separate patch since it touches portability. Also, I didn't see
any performance effect from it.
3) LockBufHdr returns the state after taking the lock. Even without atomic
increments, this can still save some loops by skipping an extra atomic read
of the state value.

I have been testing this on a smaller system than yours (2-socket
Intel(R) Xeon(R) CPU E5-2683 v3 with 2 x RAID10 SSD disks for data + xlog),
so I focused on a smaller number of clients.

While I saw an improvement for the 'synchronous_commit = on' case, there
is a small regression for 'off', using -M prepared + a Unix domain socket.
I'm not sure whether that is something that should be considered right now.

Maybe it is worth updating the README to mention that the flags are now
maintained in an atomic uint32.

BTW, there are two CommitFest entries for this submission:

https://commitfest.postgresql.org/7/370/
https://commitfest.postgresql.org/7/408/

Best regards,
Jesper

Attachments:

pinunpin.pngimage/png; name=pinunpin.pngDownload
#27Andres Freund
andres@anarazel.de
In reply to: Jesper Pedersen (#26)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On November 6, 2015 9:31:37 PM GMT+01:00, Jesper Pedersen <jesper.pedersen@redhat.com> wrote:

I have been testing this on a smaller system than yours (2-socket
Intel(R) Xeon(R) CPU E5-2683 v3 with 2 x RAID10 SSD disks for data + xlog),
so I focused on a smaller number of clients.

Thanks for running tests!

While I saw an improvement for the 'synchronous_commit = on' case, there
is a small regression for 'off', using -M prepared + a Unix domain socket.
I'm not sure whether that is something that should be considered right now.

What tests were you running, and in which order? I presume it's a read/write pgbench? What scale and shared_buffers?

Right now I can't see any reason synchronous_commit on/off should be relevant for the patch. Could it be an artifact of the order you ran the tests in?

Did you initdb between tests? Pgbench -i? Restart the database?

Andres

--- 
Please excuse brevity and formatting - I am writing this on my mobile phone.


#28Jesper Pedersen
jesper.pedersen@redhat.com
In reply to: Andres Freund (#27)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 11/06/2015 03:38 PM, Andres Freund wrote:

While I saw an improvement for the 'synchronous_commit = on' case, there
is a small regression for 'off', using -M prepared + a Unix domain socket.
I'm not sure whether that is something that should be considered right now.

What tests were you running, and in which order? I presume it's a read/write pgbench? What scale and shared_buffers?

Scale is 3000, shared_buffers is 64GB, and effective_cache_size is 160GB.

Order was master/off -> master/on -> pinunpin/off -> pinunpin/on.

Right now I can't see any reason synchronous_commit on/off should be relevant for the patch. Could it be an artifact of the order you ran the tests in?

I was puzzled too, hence the post.

Did you initdb between tests? Pgbench -i? Restart the database?

I didn't initdb / pgbench -i between the tests, so that is likely it.

I'll redo.

Best regards,
Jesper


#29Jesper Pedersen
jesper.pedersen@redhat.com
In reply to: Jesper Pedersen (#28)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 11/06/2015 03:47 PM, Jesper Pedersen wrote:

Did you initdb between tests? Pgbench -i? Restart the database?

I didn't initdb / pgbench -i between the tests, so that is likely it.

Each graph has a full initdb + pgbench -i cycle now.

I know, I have a brown paper bag somewhere.

Best regards,
Jesper

Attachments:

pinunpin_v2.pngimage/png; name=pinunpin_v2.pngDownload
#30Andres Freund
andres@anarazel.de
In reply to: Jesper Pedersen (#29)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-11-09 11:54:59 -0500, Jesper Pedersen wrote:

Hi,

On 11/06/2015 03:47 PM, Jesper Pedersen wrote:

Did you initdb between tests? Pgbench -i? Restart the database?

I didn't initdb / pgbench -i between the tests, so that is likely it.

Each graph has a full initdb + pgbench -i cycle now.

That looks about as we'd expect: the lock-free pinning doesn't matter,
and asynchronous commit is beneficial. I think our bottlenecks in write
workloads are sufficiently elsewhere that it's unlikely that buffer pins
make a lot of difference.

You could try a read-only pgbench workload (i.e. -S) to see whether a
difference is visible there. For a pgbench -S workload it's more likely
that you only see significant contention on larger machines. If you have a
workload that touches more cached buffers, it'd be visible earlier.

I know, I have a brown paper bag somewhere.

Why? This looks as expected, and the issues from the previous run were
easy mistakes to make.

Greetings,

Andres Freund


#31Jesper Pedersen
jesper.pedersen@redhat.com
In reply to: Andres Freund (#30)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 11/09/2015 05:10 PM, Andres Freund wrote:

Each graph has a full initdb + pgbench -i cycle now.

That looks about as we'd expect: the lock-free pinning doesn't matter,
and asynchronous commit is beneficial. I think our bottlenecks in write
workloads are sufficiently elsewhere that it's unlikely that buffer pins
make a lot of difference.

Using

https://commitfest.postgresql.org/7/373/

shows that the CLOG queue is maxed out at the number of client connections.

You could try a read-only pgbench workload (i.e. -S) to see whether a
difference is visible there. For a pgbench -S workload it's more likely
that you only see significant contention on larger machines. If you have a
workload that touches more cached buffers, it'd be visible earlier.

Yeah, basically no difference between the 4 -S runs on this setup.

I know, I have a brown paper bag somewhere.

Why? This looks as expected, and the issues from the previous run were
easy mistakes to make.

I should have known to do the full cycle of initdb / pgbench -i in the
first place.

Best regards,
Jesper

Attachments:

pinunpin_readonly.pngimage/png; name=pinunpin_readonly.pngDownload
#32Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#25)
5 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi, Andres!

I'd like to share testing results on an IBM E880 server.
At first, I'd like to mention that we did experiments with atomic increment
on power8, even though it doesn't have a native atomic increment instruction.
Let me explain why. The CAS operation in power8 assembly looks like this:

.L1:    lwarx 9,0,5
        cmpw 0,9,3
        bne- 0,.L2
        stwcx. 4,0,5
        bne- 0,.L1
.L2:    isync

So, it's a loop that retries until the value is unchanged between lwarx and
stwcx. When you put a CAS inside a loop, it becomes a loop inside a loop.
Atomic increment, by contrast, is a single loop, just like CAS:

.L1:    lwarx 9,0,5
        add 9,9,3
        stwcx. 9,0,5
        bne- 0,.L1
        isync

This is why atomic increment *could be* cheaper than a loop over CAS, and it
is worth experimenting with. Another idea is that we can put arbitrary logic
between lwarx and stwcx. Thus, we can implement PinBuffer using a single loop
of lwarx and stwcx, which could be better than a loop of CAS.
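
In C terms, the two variants of PinBuffer look roughly like this. A sketch
only, reusing the state layout and pg_atomic_* primitives from the patches
upthread and ignoring the header-lock bit; the increment variant cannot
check usagecount saturation beforehand, which is the correctness caveat
described for patch 2 below.

/* Variant 1: pin via a loop of CAS; on power8, each CAS is itself an
 * inner lwarx/stwcx. loop. */
static void
PinBuffer_cas(BufferDesc *buf)
{
	uint32		old_state = pg_atomic_read_u32(&buf->state);
	uint32		new_state;

	do
	{
		new_state = old_state + 1;	/* refcount lives in the low bits */
		if (BUF_STATE_GET_USAGECOUNT(old_state) < BM_MAX_USAGE_COUNT)
			new_state += BUF_USAGECOUNT_ONE;
	} while (!pg_atomic_compare_exchange_u32(&buf->state, &old_state,
											 new_state));
}

/* Variant 2: pin via a single atomic increment (one lwarx/stwcx. loop),
 * bumping refcount and usagecount together with no saturation check. */
static void
PinBuffer_increment(BufferDesc *buf)
{
	pg_atomic_fetch_add_u32(&buf->state, 1 + BUF_USAGECOUNT_ONE);
}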

Tested patches are the following:
1) pinunpin-cas.patch – moves buffer pin/unpin to atomic operations on the
buffer state. PinBuffer uses a loop of CAS. Same as upthread, but rebased
on current master.
2) pinunpin-increment.patch (based on pinunpin-cas.patch) – PinBuffer
changes the state using an atomic increment operation instead of a loop of
CAS. Both refcount and usagecount are incremented at once. There are not
enough bits in the 32-bit state to guarantee that usagecount doesn't
overflow. I moved usagecount to the higher bits of the state, so an
overflow of usagecount doesn't affect the other bits. Therefore, this patch
doesn't pretend to be correct. However, we can use it to check whether we
should try moving in this direction.
3) lwlock-increment.patch – LWLockAttemptLock changes the state using an
atomic increment operation instead of a loop of CAS. This patch does for
LWLockAttemptLock what pinunpin-increment.patch does for PinBuffer (see the
sketch after this list). Actually, this patch is not directly related to
the buffer manager. However, it's nice to test a loop of CAS vs. atomic
increment in different places.
4) pinunpin-ibm-asm.patch (based on pinunpin-cas.patch) – assembly
optimizations of PinBuffer and LWLockAttemptLock which make both of them
use a single loop of lwarx and stwcx.
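
To sketch the idea behind lwlock-increment.patch from item 3: a shared-lock
attempt can optimistically add the shared increment with one atomic
operation and roll it back on conflict, instead of looping on CAS. The
names LW_VAL_SHARED and LW_VAL_EXCLUSIVE follow lwlock.c's state encoding;
the function below is an illustration of the approach, not the patch itself.

static bool
LWLockAttemptLock_shared(LWLock *lock)
{
	uint32		old_state;

	/* Optimistically announce a shared holder with one atomic add. */
	old_state = pg_atomic_fetch_add_u32(&lock->state, LW_VAL_SHARED);

	if (old_state & LW_VAL_EXCLUSIVE)
	{
		/* An exclusive holder was present: undo the add and fail. */
		pg_atomic_fetch_sub_u32(&lock->state, LW_VAL_SHARED);
		return false;
	}

	return true;
}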

The following versions were compared in our benchmark:
1) 9.4
2) 9.5
3) master
4) pinunpin-cas – pinunpin-cas.patch
5) pinunpin-increment – pinunpin-cas.patch + pinunpin-increment.patch
6) pinunpin-cas-lwlock-increment – pinunpin-cas.patch + lwlock-increment.patch
7) pinunpin-increment-lwlock-increment – pinunpin-cas.patch + pinunpin-increment.patch + lwlock-increment.patch
8) pinunpin-lwlock-asm – pinunpin-cas.patch + pinunpin-ibm-asm.patch

See the results in ibm-scalability.png. We can see that there is almost no
effect from pinunpin-increment.patch. So, atomic increment in PinBuffer
doesn't give a benefit on power8. I will publish test results on the Intel
platform a bit later, but I can say there is not much effect there either.
So, I think we can throw this idea away.
However, the effect of lwlock-increment.patch is huge. It gives almost the
same effect as the assembly optimizations. I think it is worth having a
separate thread for discussing lwlock optimization. It's likely we can do
something much better than the current lwlock-increment.patch does.
For this thread, I think we can focus on pinunpin-cas.patch without
thinking about atomic increment.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

ibm-scalability.pngimage/png; name=ibm-scalability.pngDownload
��=d���a��7�X:FY$�$�p8��|�"���{v�i���74����K;t�U,�v�B��~'�������Y�o���G��*Z�dN��\5�<-���9 _%�$Z��<��
j��/h���(�8V��?�D"���7�U��pzi��#<_Dv��3�9��#�4����m���[�msK���7�
bc��I� ��$��p�^��,_~V����f������b�8XV��O�������m���NA�LI��;����8!wi������.���o�����DK���@=�'�~EQJ��Q@P<��6Fql�)�H�@sM�?��&���"����fP��M�n���;0�Vk��<��s_%&f`o�$��`����,;�ms�++���__����@���N�1q�6�(NyBv4������t�*�U|�����ZW#�m��������i��#�
��1	���K$��Jo"���F�!���/��Q�$�:'�����C�����X?t$i��m��knj�����ff����������D��8�,_��#2�f����=�Y�Z� y.����
|Z�����fK����$!�����9)W���c��GO%������kIt!��d����h<�]�;#��r������%���|������������#�4`���D��8��,_~G���<�9s�T���>>�����V�E�Z�G�������yE��W��aS1h�?Z.�H$���H�,�
Z��+i���w��"�k�&p���X�m�w��S~a�=�0�<z�5q�oPd�!�������zq���|�Y��������b9�f{�>�To������l5�k��x�X����g�FP�N�= �kEr`��E�S��FY"�Fx7��x�{�g�}����g
.c���	�����G4�=��4�^���N���H$�&��`��"���!{U���^}9����*������5��Y�xh��T;����5|�8�������s���|�g��N�+���"���A������D2���!��(K����h:�}Dsdc���J������o����&;�k^���#g���_�D"\�j��a�E��KO?����j�_D�~��n��2�)���?����3/kZpU!��CG{2�J"�H�z-���������4]�!�"gu�~Lw\7��M$/r����B"Y�10{����:e=�H$�����a������E���PX���kb����E�Z>���K��� IDATO�VS����NA���9)�������Q|~�3!�\�l���~���R��������BY�)�%����>�v�t�'(��h�����W@X������Z��^o���e��
��o��#��$�A^/*nw-����Z#�rZ�<��^rb�!�J�Te�)�\�7����a���x���o��7M��������H�#�[$=E�(K$�~����4_�J �������y�"D�e��8��v�������o��8q���H$�h�hLc����|��X�[B����Zu!EE����us��DQ&��fR�hn�v1v��e����U|�w���������^��nCO���HK6�{�4S��d�L����I7%�nJ"��F6��H$���(QdDY��8��������]U��?z��W����C������&f���pU"�H����z�����ys�=5���s_�b�;v6��)�{W��bN��O��U����":)LP��=)$��L����d�H��0%*R�E�!>	��������'%�BY�!p<��#_G����7N��c�B6���K��,h���-!a������>P.K$IT���@q�947�aOM�CQ�k���A��q�ZX�oC���*65���>���!=(�S#�tR��n�&��w$���$�� Z���@�����T`;`�@p����pT�����Y��l��zfC���Q��G�����M'��`{�9���sZZ��/��[��w��"��$�A^/���4R\|MM#�))��;��CV,��]�VN��&�9�PPl����l�0�cCb�-r�QX���H3&��FV*>�q)7?*���#��E�S��Fy�l���0
�,n�P�H���|����%���j�|�4����lz��V��L��E���~��YOG��P�D"�bb��?�-����������~-+V���yo��G�h�j�
\���9���4�OO�+�|���Rj� ;.��MX������D���g#��j��X=�~���ub�:{Tc
�d��nJ
E�������X��_�������.���P�������s�H$�Oo"�N�Z!�?����!���#��9���(K�����O������������� \�a�����!���3m��Q���,�H$C���+���12s'9���X>43r6����P�j���H���Zg5�Fj\
��v6��v��e O�[���Y��r�����DrX-������TE�B���3�)!�1
R(K��Rs���<�^k����l��0�e��q����q�nf��;e��D"�!O3+W.��a}�=)i���ILL� y��T
���EMPLw�.5b�*�k����>���H1&pq�O�j�B�%�����Hw�E(��j}p6�4p�������K�����$)�%�a��e,�VH������:b[������`WV[7����{��3��5d�2���}���,d]��7���gx�VV�XLC������A�������r�!hj����ua��ZW�a����H��	���o�����r��c1�}x&�6�r�H��h�Q~x�	��(�~"���N�d��7��8����6D�w&�r��3�����h���[�WS�E���&7���Y"�H�z�������S_�M�������s�?)11I���TT|@~����!��($-$-� ����w7��M�hu������[l�->�y���r=������.*8��&,dB��>>+�D��j���(GgM�_�
A{
���(��k�eIO���Rq���m�3�,~t
	1![���%�[���@�51g�?��<f@}�H$�C�����P_9�/1q2��-E?��/�f����K����Z��aom�����NC�u��eN��	9{��}2�J"9����kIt!���'�v4R~�������y	\��tlq��-���%�7��@LL2s��Jr����D"���|������_�>����X��m���A�R��5+;�����q7�,9�Z��=N�!������29e�@�.�r��P���A%�������objv���#d�rL
�?6#B$��q��!�l2es�Q�I�|�l���vA2���K�����?��$'����l���K�j������n5.��A���9���_N>�g'������,=6�ON~�g'�a�E��q���q�$:o���b�/�d����������^�@�FTs(]/��Koj�Q��
���]�B^H$��cy9u�����E�?8"���;���Y�����o%Q�� >~,���All���,�H$��X^���E�,��nv�~���_ 66������ %e&		�h��V2@�������l��R�"�nw-����m%%����������wJ������U���sM������W������SR��iI$�v����%����4t�L!d1� S�%]��^)��?"F"���3�;n�D@��|k�o�4��E4��<����0"�yH$���q8����E���>Z����i��Ir�LRRf`0����������Z�����w ���	������e���t�:��q7�oy���~@�������>��',��1'b�����H$C���QVe�BT��3��#���3*�����t�FY<s|.\;������]�;�v22�e�������H$I�.��Z~�mV�1��II��<��b)@Qdu]_!������yV����46n�j�����������4���lL�a�L����,��k9�n~�/+���m����`��\9a!�����9J$C�h�n�!�S����wH�,i����cxt�v�G���SP���&z��N�]q����3���|9�R�+��r`�����]_}uv{	11I��
��y3�������-$%M'%e&))3IN��^����+�����Zin����5B{<�}�9���d�$66�6�-++j����M�_GC@�5�Ay�d�x�����cN">��}�=T���-s�+�����H$���V������/��g�����lS=_s��^bPo�F���#�x@F$�d���� B$���u�{!=a��q!MM?P_����u����������^+55���Y�($$�E�SRf7E��{�!���������-AA��c���N�}�^��2��2���S���z�����z�����z�Z�[����<��#O���0#}�a���H���D�o.f
!|�[/�_dDYj:���G�'�"�>��k���)y��-��d�=j=���wSPp��+�H$LI�dg�����������Ng(!p:����&(�����	!���|�!%B8'%MA�5��!������*��b���W����GQ ���9���#�������'�`��|g�R�cb�{�{�@�O�d xz�R��?>�q>ZR��~�U~(:|3!�������BY�Xs����/����p�-�lAd����/���[t�
��?���
���D"�k|>

���744|{@i���#1qRX��Lbc���������j�L�n�o��w���ZK		����a���q:�C����t�1L��G�~'.�>\�J\�J��J\�
\�}x���SO�h�����"��n:&&)$�KJ^`���:d<��L���������"�{�xL!��HJ�)�o|-~V�y�q�+"�NE���Nb�����_?����EC����Y���}�@�,"��0Io��K�!D��$$����a�m;�c�LY�&a�xNL�t��,��Z��X��B5����������(Z��|�a���bGB����B%J}!$}>GP8��g��"(�+p:+��l}>=E�5���)����4`2e3s�S����r�������E�5����ln,
�������Q���$�(�����Y�a|}m��I������aBj��d���z
zt�x��^&-�h ]�H$��(,�1X,c1�|<�&���746���s��X.W%���S^�>�F&�����l��`H�t��H�
������Va�p����=�d�&!�U��bO||>Z����Z}o~Fc�������B?���zmA!�*�Ua�pVPg-��]E}�>�����K��#��\�
�/_z�����:����#>~,�X�zK�� �tE���n[%�����<���K�}��~��G�%���(���v�i�[�sF�^���9���l�a?��::�A�4��{���I�qG��Y���K�|N�h������u��*�H$��~���S���������72����r$�8JK_�U�U��U�A�l;��w{����n��8bb���Mi��ny�/w}������������������c������4,$���,����$�g�px]��U��Z����nk%�m��V���w����o?�Z]H�|��g}e'���v�X�����t�dE>�=��g�^B���3o�R���p'�}����"m��K���/]����N�U���D2���5a�������@�������$&N�j���SO|�h,xh�����e��Kp:�B���^�A��Fc�b!��c4f�M+}��]�s���G{V���XE�����UEt��'����o�����w��OFcz��.�l�v�� 9t�����W��Z�nk� F�k\
|�������;������xxT��<I� �������q^�>#��E�3.�s��IUj�,���t�$$L$��O\8`�vE� Ni�����h���QW��A@KY&�
�z9t	<45m�kMUu@�R-?��e���C�`6�� ���F������j^��/l~����f�Gp��3Y�?S����6��v�g��}��;&&%(�BB�b)�`H=���-C�O��6���Z���VR��A�?�%*���(��
`P�n�[�yB�$ �1R(>,a/�[>&[�|��1���O��!!���
�9��>�OM-���%V�\5�p������w�h�l��3(�?�B�dc�d�1b���ys"�
�z9|hM�nm�������=M���&M�����AA<.$���������X����������w�pw1�n~V1=�����	���������SJ�ZX��,�=��f��
q8�`�����-�>1���e���N'>~,&��C>#�/i?>��go�|�/'���c
!�q5���V�n���V#�{�Ux�7=���aD�0����}�6[l�����c�p�����~\�]�E�W�&���?�D"����{l'��/H�x#�_g���s���=9WD�k��ql�Gde���Y���)�@�T���5���F<��=�N�'c����2��k$)'�V���H$����s���}D�m����%:�9�~�-Rl0$�g�~�t����x~����Uv�v�����Jj��v�g�$�
��w���'��qbM?�����@�����u;6�v��mX����K���@���m��p���%66;��\����$�����;:�M-���h[%������
W�����o��%���K#���WK6���1��Mfl
[binary PNG image data omitted]
pinunpin-cas.patchapplication/octet-stream; name=pinunpin-cas.patchDownload
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index 3ae2848..3e70792
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 95,106 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 95,103 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 63188a3..2ebd4a9
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 51,57 ****
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
- 
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
--- 51,56 ----
*************** static BufferDesc *PinCountWaitBuf = NUL
*** 126,132 ****
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a spinlock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
--- 125,131 ----
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a header lock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 774,782 ****
  		 */
  		if (isLocalBuf)
  		{
! 			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 773,780 ----
  		 */
  		if (isLocalBuf)
  		{
! 			Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 788,795 ****
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
--- 786,793 ----
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 807,813 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 805,811 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 885,891 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 883,889 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 939,945 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
--- 937,943 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1001,1024 ****
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the spinlock's not yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
! 		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
! 		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
  
  		/*
--- 999,1024 ----
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/*
! 		 * Ensure, while the header lock isn't yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * lock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
! 		/* Must copy buffer flags while we still hold the header lock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
! 		/* Pin the buffer and then release the buffer header lock */
  		PinBuffer_Locked(buf);
  
  		/*
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1202,1208 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1202,1208 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1210,1217 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1210,1217 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1232,1243 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
  	UnlockBufHdr(buf);
  
--- 1232,1250 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	pg_atomic_fetch_and_u32(&buf->state,
! 							~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 							  BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
! 							  BM_PERMANENT |
! 							  BUF_USAGECOUNT_MASK));
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID | BM_PERMANENT |
! 							   BUF_USAGECOUNT_ONE);
  	else
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID |
! 							   BUF_USAGECOUNT_ONE);
  
  	UnlockBufHdr(buf);
  
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1267,1273 ****
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header spinlock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
--- 1274,1280 ----
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header lock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
*************** InvalidateBuffer(BufferDesc *buf)
*** 1286,1294 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
! 	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
--- 1293,1302 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
! 	/* Save the original buffer tag before dropping the header lock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
*************** retry:
*** 1310,1316 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
--- 1318,1324 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
*************** retry:
*** 1329,1335 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1337,1343 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1344,1353 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
  
  	UnlockBufHdr(buf);
  
--- 1352,1360 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	pg_atomic_fetch_and_u32(&buf->state, BM_LOCKED | ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK));
  
  	UnlockBufHdr(buf);
  
*************** void
*** 1381,1386 ****
--- 1388,1394 ----
  MarkBufferDirty(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1397,1410 ****
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(bufHdr->content_lock));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1405,1418 ----
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(bufHdr->content_lock));
  
! 	state = LockBufHdr(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(state & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1412,1418 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
--- 1420,1426 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1454,1460 ****
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without spinlock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
--- 1462,1468 ----
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without header lock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1482,1488 ****
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra spinlock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
--- 1490,1496 ----
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra header lock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1495,1517 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1503,1542 ----
  
  	if (ref == NULL)
  	{
+ 		uint32 state;
+ 		uint32 oldstate;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				pg_spin_delay();
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1527,1535 ****
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The spinlock is released before return.
   *
!  * As this function is called with the spinlock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
--- 1552,1560 ----
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The header lock is released before return.
   *
!  * As this function is called with the header lock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1540,1546 ****
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * spinlock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
--- 1565,1571 ----
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * header lock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1554,1564 ****
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the spinlock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
--- 1579,1589 ----
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the header lock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	pg_atomic_fetch_add_u32(&buf->state, 1);
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1594,1623 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
  
! 		LockBufHdr(buf);
! 
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1619,1658 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32 state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
! 		 * currently all manipulation of ->state for shared buffers is through
! 		 * atomics.
! 		 */
! 		state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				pg_atomic_fetch_and_u32(&buf->state,
+ 										~BM_PIN_COUNT_WAITER);
+ 				UnlockBufHdr(buf);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1635,1640 ****
--- 1670,1676 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_to_write;
*************** BufferSync(int flags)
*** 1675,1688 ****
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  			num_to_write++;
  		}
  
--- 1711,1725 ----
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header lock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
! 			pg_atomic_fetch_or_u32(&bufHdr->state,
! 								   BM_CHECKPOINT_NEEDED);
  			num_to_write++;
  		}
  
*************** BufferSync(int flags)
*** 1721,1727 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
--- 1758,1764 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2081,2086 ****
--- 2118,2124 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  
  	ReservePrivateRefCountEntry();
  
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2093,2102 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
--- 2131,2143 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2104,2110 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2145,2151 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2256,2261 ****
--- 2297,2303 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2273,2284 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2315,2327 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** BufferGetBlockNumber(Buffer buffer)
*** 2333,2339 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	return bufHdr->tag.blockNum;
  }
  
--- 2376,2382 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	return bufHdr->tag.blockNum;
  }
  
*************** BufferGetTag(Buffer buffer, RelFileNode 
*** 2356,2362 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
--- 2399,2405 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2424,2430 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
  	UnlockBufHdr(buf);
  
  	/*
--- 2467,2473 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	pg_atomic_fetch_and_u32(&buf->state, ~BM_JUST_DIRTIED);
  	UnlockBufHdr(buf);
  
  	/*
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2444,2450 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2487,2493 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2532,2544 ****
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header spinlock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2575,2587 ----
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header lock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2638,2644 ****
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2681,2687 ----
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2736,2742 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2779,2785 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropDatabaseBuffers(Oid dbid)
*** 2778,2784 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2821,2827 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** FlushRelationBuffers(Relation rel)
*** 2874,2880 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 2917,2924 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 2895,2901 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 2939,2945 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 2923,2929 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
--- 2967,2974 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 2975,2981 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
--- 3020,3027 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3086,3104 ****
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3132,3151 ----
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring the header lock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3109,3115 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3156,3162 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3149,3156 ****
  		}
  
  		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3196,3207 ----
  		}
  
  		LockBufHdr(bufHdr);
! 
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3170,3176 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
--- 3221,3229 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
! 
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
*************** UnlockBuffers(void)
*** 3208,3216 ****
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
  		UnlockBufHdr(buf);
  
--- 3261,3269 ----
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&buf->state, ~BM_PIN_COUNT_WAITER);
  
  		UnlockBufHdr(buf);
  
*************** LockBufferForCleanup(Buffer buffer)
*** 3304,3328 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
--- 3357,3384 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_PIN_COUNT_WAITER);
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
*************** LockBufferForCleanup(Buffer buffer)
*** 3349,3357 ****
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
--- 3405,3413 ----
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((pg_atomic_read_u32(&bufHdr->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_PIN_COUNT_WAITER);
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
*************** bool
*** 3393,3414 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3449,3474 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3416,3424 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3476,3486 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(BufferDesc *buf)
*** 3456,3472 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
! 		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
  		LWLockRelease(buf->io_in_progress_lock);
--- 3518,3534 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
! 		 * It may not be necessary to acquire the header lock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
  		LWLockRelease(buf->io_in_progress_lock);
*************** WaitIO(BufferDesc *buf)
*** 3494,3499 ****
--- 3556,3563 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3504,3512 ****
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3568,3576 ----
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3522,3528 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3586,3592 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3530,3536 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
  
  	UnlockBufHdr(buf);
  
--- 3594,3600 ----
  		return false;
  	}
  
! 	pg_atomic_fetch_or_u32(&buf->state, BM_IO_IN_PROGRESS);
  
  	UnlockBufHdr(buf);
  
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3560,3574 ****
  static void
  TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
  
--- 3624,3642 ----
  static void
  TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
! 
! 	pg_atomic_fetch_and_u32(&buf->state, ~(BM_IO_IN_PROGRESS | BM_IO_ERROR));
! 	if (clear_dirty && !(pg_atomic_read_u32(&buf->state) & BM_JUST_DIRTIED))
! 		pg_atomic_fetch_and_u32(&buf->state, ~(BM_DIRTY | BM_CHECKPOINT_NEEDED));
! 
! 	pg_atomic_fetch_or_u32(&buf->state, set_flag_bits);
  
  	UnlockBufHdr(buf);
  
*************** AbortBufferIO(void)
*** 3593,3598 ****
--- 3661,3667 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3601,3626 ****
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
--- 3670,3693 ----
  		 */
  		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without header lock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
*************** shared_buffer_write_error_callback(void 
*** 3644,3650 ****
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the spinlock */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
--- 3711,3717 ----
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the header */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
*************** rnode_comparator(const void *p1, const v
*** 3700,3702 ****
--- 3767,3800 ----
  	else
  		return 0;
  }
+ 
+ uint32
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	uint32 state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (state & BM_LOCKED)
+ 		{
+ 			pg_spin_delay();
+ 			state = pg_atomic_read_u32(&desc->state);
+ 
+ 			/* Add exponential backoff? Should seldom be contended, though. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ 	return state | BM_LOCKED;
+ }
+ 
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+ }
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 0234572..8b64855
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,289 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
  			UnlockBufHdr(buf);
--- 282,294 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  			UnlockBufHdr(buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 594,604 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,625 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
  	UnlockBufHdr(buf);
--- 626,637 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
  	UnlockBufHdr(buf);
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 3144afe..c62a6f2
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index 19247c4..3d3e0d5
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 20,48 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 20,62 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * State is:
+  * 10 bit flags
+  * 4 bit usage count
+  * 18 bit refcount
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 137,148 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint16		usage_count;	/* usage counter for clock sweep code */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
! 	slock_t		buf_hdr_lock;	/* protects the above fields */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
--- 151,161 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
! 	/* state of the tag, containing flags, refcount and usagecount */
! 	pg_atomic_uint32 state;
! 
! 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
*************** typedef union BufferDescPadded
*** 192,202 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /* in buf_init.c */
--- 205,215 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers! FIXUP!
   */
! extern uint32 LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /* in buf_init.c */
*************** extern BufferDesc *LocalBufferDescriptor
*** 211,217 ****
   */
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 224,231 ----
   */
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
pinunpin-increment.patchapplication/octet-stream; name=pinunpin-increment.patchDownload
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 2ebd4a9..32c0209
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1503,1540 ****
  
  	if (ref == NULL)
  	{
! 		uint32 state;
! 		uint32 oldstate;
  
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
- 
  		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		while (true)
! 		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				pg_spin_delay();
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
  
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
  
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
  		result = (state & BM_VALID) != 0;
  	}
--- 1503,1524 ----
  
  	if (ref == NULL)
  	{
! 		uint32	state,
! 				increment = 1;
  
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
  		state = pg_atomic_read_u32(&buf->state);
! 		if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			increment += BUF_USAGECOUNT_ONE;
  
! 		state = pg_atomic_add_fetch_u32(&buf->state, increment);
  
! 		while (state & BM_LOCKED)
! 		{
! 			pg_spin_delay();
! 			state = pg_atomic_read_u32(&buf->state);
  		}
  		result = (state & BM_VALID) != 0;
  	}
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1632,1638 ****
  		 * currently all manipulation of ->state for shared buffers is through
  		 * atomics.
  		 */
! 		state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
  		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  		/* Support LockBufferForCleanup() */
--- 1616,1622 ----
  		 * currently all manipulation of ->state for shared buffers is through
  		 * atomics.
  		 */
! 		state = pg_atomic_fetch_sub_u32(&buf->state, 1);
  		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  		/* Support LockBufferForCleanup() */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 8b64855..9331f21
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 314,320 ****
  		{
  			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
  
  				trycounter = NBuffers;
  			}
--- 314,324 ----
  		{
  			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				if (BUF_STATE_GET_USAGECOUNT(state) > BM_MAX_USAGE_COUNT)
! 					pg_atomic_fetch_sub_u32(&buf->state,
! 						BUF_USAGECOUNT_ONE * (BUF_STATE_GET_USAGECOUNT(state) - BM_MAX_USAGE_COUNT + 1));
! 				else
! 					pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
  
  				trycounter = NBuffers;
  			}
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index 3d3e0d5..bc61f19
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 31,44 ****
   * 4 bit usage count
   * 18 bit refcount
   */
! #define BUF_REFCOUNT_ONE 1
! #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
  #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
- #define BUF_USAGECOUNT_MASK 0x003C0000U
- #define BUF_USAGECOUNT_ONE (1U << 18)
- #define BUF_USAGECOUNT_SHIFT 18
  #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
- #define BUF_FLAG_MASK 0xFFC00000U
  
  /*
   * Flags for buffer descriptors
--- 31,43 ----
   * 4 bit usage count
   * 18 bit refcount
   */
! #define BUF_REFCOUNT_MASK ((1U << 16) - 1)
! #define BUF_FLAG_MASK 0x3FF0000U
! #define BUF_USAGECOUNT_MASK 0xFC000000U
! #define BUF_USAGECOUNT_ONE (1U << 26)
! #define BUF_USAGECOUNT_SHIFT 26
  #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
  #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
  
  /*
   * Flags for buffer descriptors
***************
*** 46,61 ****
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
--- 45,60 ----
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 16)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 17)		/* data needs writing */
! #define BM_VALID				(1U << 18)		/* data is valid */
! #define BM_TAG_VALID			(1U << 19)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 20)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 21)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 22)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 23)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 24)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 25)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
lwlock-increment.patchapplication/octet-stream; name=lwlock-increment.patchDownload
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index b13ebc6..fe0938a
*** a/src/backend/storage/lmgr/lwlock.c
--- b/src/backend/storage/lmgr/lwlock.c
*************** LWLockAttemptLock(LWLock *lock, LWLockMo
*** 584,642 ****
  {
  	uint32		old_state;
  
! 	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
! 
! 	/*
! 	 * Read once outside the loop, later iterations will get the newer value
! 	 * via compare & exchange.
! 	 */
! 	old_state = pg_atomic_read_u32(&lock->state);
! 
! 	/* loop until we've determined whether we could acquire the lock or not */
! 	while (true)
  	{
! 		uint32		desired_state;
! 		bool		lock_free;
! 
! 		desired_state = old_state;
  
! 		if (mode == LW_EXCLUSIVE)
  		{
! 			lock_free = (old_state & LW_LOCK_MASK) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_EXCLUSIVE;
  		}
  		else
  		{
! 			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_SHARED;
  		}
  
! 		/*
! 		 * Attempt to swap in the state we are expecting. If we didn't see
! 		 * lock to be free, that's just the old value. If we saw it as free,
! 		 * we'll attempt to mark it acquired. The reason that we always swap
! 		 * in the value is that this doubles as a memory barrier. We could try
! 		 * to be smarter and only swap in values if we saw the lock as free,
! 		 * but benchmark haven't shown it as beneficial so far.
! 		 *
! 		 * Retry if the value changed since we last looked at it.
! 		 */
! 		if (pg_atomic_compare_exchange_u32(&lock->state,
! 										   &old_state, desired_state))
  		{
! 			if (lock_free)
  			{
  				/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 				if (mode == LW_EXCLUSIVE)
! 					lock->owner = MyProc;
  #endif
  				return false;
  			}
! 			else
! 				return true;	/* someobdy else has the lock */
  		}
  	}
  	pg_unreachable();
--- 584,629 ----
  {
  	uint32		old_state;
  
! 	if (mode == LW_SHARED)
  	{
! 		/* Optimistically try to get the shared lock */
! 		old_state = pg_atomic_fetch_add_u32(&lock->state, LW_VAL_SHARED);
  
! 		if (old_state & LW_VAL_EXCLUSIVE)
  		{
! 			/* Failed: exclusive lock is already held.  Back the increment out. */
! 			old_state = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
! 			return true;
  		}
  		else
  		{
! 			/* Succeeded: exclusive lock isn't held. */
! 			return false;
  		}
+ 	}
+ 	else if (mode == LW_EXCLUSIVE)
+ 	{
+ 		/* Optimistically try to get the exclusive lock */
+ 		old_state = pg_atomic_read_u32(&lock->state);
  
! 		if ((old_state & LW_LOCK_MASK) == 0)
  		{
! 			uint32		desired_state = old_state | LW_VAL_EXCLUSIVE;
! 
! 			if (pg_atomic_compare_exchange_u32(&lock->state,
! 											   &old_state, desired_state))
  			{
  				/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 				lock->owner = MyProc;
  #endif
  				return false;
  			}
! 			return true;
! 		}
! 		else
! 		{
! 			return true;
  		}
  	}
  	pg_unreachable();
*************** LWLockRelease(LWLock *lock)
*** 1563,1573 ****
  
  	/*
  	 * We're still waiting for backends to get scheduled, don't wake them up
! 	 * again.
  	 */
  	if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
  		(LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
! 		(oldstate & LW_LOCK_MASK) == 0)
  		check_waiters = true;
  	else
  		check_waiters = false;
--- 1550,1562 ----
  
  	/*
  	 * We're still waiting for backends to get scheduled, don't wake them up
! 	 * again. Since shared locks are taken optimistically, we can observe them
! 	 * while releasing an exclusive lock. We should still check waiters in this
! 	 * case because those shared locks weren't actually taken.
  	 */
  	if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
  		(LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
! 		((oldstate & LW_LOCK_MASK) == 0 || mode == LW_EXCLUSIVE))
  		check_waiters = true;
  	else
  		check_waiters = false;
pinunpin-ibm-asm.patchapplication/octet-stream; name=pinunpin-ibm-asm.patchDownload
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 2ebd4a9..14898dc
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1474,1479 ****
--- 1474,1504 ----
  	return ReadBuffer(relation, blockNum);
  }
  
+ static uint32
+ TryPinBuffer(volatile pg_atomic_uint32 *state)
+ {
+ 	uint32	result;
+ 
+ 	__asm__ __volatile__(
+ "0:	lwarx   %0,0,%2		\n"
+ "	and     4,%0,%3		\n" /* check for BM_LOCKED */
+ "	cmpwi   4,0			\n"
+ "	bne-    2f			\n"
+ "	addi    %0,%0,1		\n" /* refcount++ */
+ "	and     4,%0,%4		\n" /* check for BUF_USAGECOUNT */
+ "	cmp     0,4,%5		\n"
+ "	beq     1f			\n"
+ "	add     %0,%0,%6	\n" /* usagecount++ */
+ "1:	stwcx.  %0,0,%2		\n"
+ "	bne-    0b			\n"
+ "2:	isync				\n"
+ :	"=&r"(result), "+m"(*state)
+ :	"r"(state), "r"(BM_LOCKED), "r"(BUF_USAGECOUNT_MASK), "r"(BM_MAX_USAGE_COUNT << BUF_USAGECOUNT_SHIFT), "r"(BUF_USAGECOUNT_ONE)
+ :	"memory", "cc", "r4");
+ 
+ 	return result;
+ }
+ 
  /*
   * PinBuffer -- make buffer unavailable for replacement.
   *
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1504,1541 ****
  	if (ref == NULL)
  	{
  		uint32 state;
- 		uint32 oldstate;
  
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
- 
- 		state = pg_atomic_read_u32(&buf->state);
- 		oldstate = state;
- 
  		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				pg_spin_delay();
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
  				break;
  
- 			/* get ready for next loop, oldstate has been updated by cas */
- 			state = oldstate;
- 		}
  		result = (state & BM_VALID) != 0;
  	}
  	else
--- 1529,1546 ----
  	if (ref == NULL)
  	{
  		uint32 state;
  
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
  		while (true)
  		{
! 			state = TryPinBuffer(&buf->state);
! 			if ((state & BM_LOCKED) == 0)
  				break;
+ 			pg_spin_delay();
+ 		}
  
  		result = (state & BM_VALID) != 0;
  	}
  	else
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index b13ebc6..bda940d
*** a/src/backend/storage/lmgr/lwlock.c
--- b/src/backend/storage/lmgr/lwlock.c
*************** LWLockInitialize(LWLock *lock, int tranc
*** 570,645 ****
  	dlist_init(&lock->waiters);
  }
  
- /*
-  * Internal function that tries to atomically acquire the lwlock in the passed
-  * in mode.
-  *
-  * This function will not block waiting for a lock to become free - that's the
-  * callers job.
-  *
-  * Returns true if the lock isn't free and we need to wait.
-  */
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		old_state;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
! 	/*
! 	 * Read once outside the loop, later iterations will get the newer value
! 	 * via compare & exchange.
! 	 */
! 	old_state = pg_atomic_read_u32(&lock->state);
! 
! 	/* loop until we've determined whether we could acquire the lock or not */
! 	while (true)
  	{
! 		uint32		desired_state;
! 		bool		lock_free;
! 
! 		desired_state = old_state;
  
! 		if (mode == LW_EXCLUSIVE)
! 		{
! 			lock_free = (old_state & LW_LOCK_MASK) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_EXCLUSIVE;
! 		}
! 		else
! 		{
! 			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_SHARED;
! 		}
  
! 		/*
! 		 * Attempt to swap in the state we are expecting. If we didn't see
! 		 * lock to be free, that's just the old value. If we saw it as free,
! 		 * we'll attempt to mark it acquired. The reason that we always swap
! 		 * in the value is that this doubles as a memory barrier. We could try
! 		 * to be smarter and only swap in values if we saw the lock as free,
! 		 * but benchmark haven't shown it as beneficial so far.
! 		 *
! 		 * Retry if the value changed since we last looked at it.
! 		 */
! 		if (pg_atomic_compare_exchange_u32(&lock->state,
! 										   &old_state, desired_state))
! 		{
! 			if (lock_free)
! 			{
! 				/* Great! Got the lock. */
! #ifdef LOCK_DEBUG
! 				if (mode == LW_EXCLUSIVE)
! 					lock->owner = MyProc;
! #endif
! 				return false;
! 			}
! 			else
! 				return true;	/* someobdy else has the lock */
! 		}
! 	}
! 	pg_unreachable();
  }
  
  /*
--- 570,611 ----
  	dlist_init(&lock->waiters);
  }
  
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		mask, increment;
! 	bool		result;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
! 	if (mode == LW_EXCLUSIVE)
  	{
! 		mask = LW_LOCK_MASK;
! 		increment = LW_VAL_EXCLUSIVE;
! 	}
! 	else
! 	{
! 		mask = LW_VAL_EXCLUSIVE;
! 		increment = LW_VAL_SHARED;
! 	}
  
! 	__asm__ __volatile__(
! "0:	lwarx   3,0,%4		\n"
! "	and     4,3,%2		\n"
! "	cmpwi   4,0			\n"
! "	bne-    1f			\n"
! "	add     3,3,%3		\n"
! "	stwcx.  3,0,%4		\n"
! "	bne-    0b			\n"
! "	li      %0,0		\n"
! "	b       2f			\n"
! "1: li      %0,1		\n"
! "2:	isync				\n"
! :	"=&r"(result), "+m"(lock->state)
! :	"r"(mask), "r"(increment), "r"(&lock->state)
! :	"memory", "cc", "r3", "r4");
  
! 	return result;
  }
  
  /*
#33Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#32)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 2015-12-08 12:53:49 +0300, Alexander Korotkov wrote:

This is why atomic increment *could be* cheaper than a loop over CAS, and
it is worth experimenting. Another idea is that we can put arbitrary logic
between lwarx and stwcx. Thus, we can implement PinBuffer using a single
lwarx/stwcx loop, which could be better than a loop of CAS.

You can't really put that much between an ll/sc - the hardware is only
able to track a very limited number of cacheline references.

3) lwlock-increment.patch – LWLockAttemptLock changes state using an atomic
increment operation instead of a loop of CAS. This patch does it for
LWLockAttemptLock like pinunpin-increment.patch does for PinBuffer.
Actually, this patch is not directly related to the buffer manager.
However, it's nice to test a loop of CAS vs. atomic increment in different
places.

Yea, that's a worthwhile improvement. Actually it's how the first
versions of the lwlock patches worked - unfortunately I couldn't see big
differences on hardware I had available at the time.

There's some more trickiness required than what you have in your patch
(afaics at least). The problem is that when you 'optimistically'
increment by LW_VAL_SHARED and notice that there actually was another
locker, you are possibly, until you've 'fixed' the state, blocking new
exclusive lockers from acquiring the lock. So you additionally need to
do special handling in these cases, and check the queue more.
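
For concreteness, a fixup path along those lines might look roughly like
this (a hypothetical sketch, not from any posted patch; it assumes the
LW_* constants and LWLockWakeup() behave as in lwlock.c):

static bool
LWLockAttemptSharedOptimistic(LWLock *lock)
{
	uint32		old_state;

	/* optimistically bump the shared-lock count */
	old_state = pg_atomic_fetch_add_u32(&lock->state, LW_VAL_SHARED);

	if (!(old_state & LW_VAL_EXCLUSIVE))
		return true;			/* no exclusive holder, lock acquired */

	/*
	 * Fixup: back the increment out.  Between the add and this sub, our
	 * transient LW_VAL_SHARED may have made the lock look busy to a
	 * would-be exclusive locker, so after restoring the state we must
	 * re-check the wait queue and wake anyone we may have deterred.
	 */
	old_state = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);

	if ((old_state & LW_LOCK_MASK) == 0 && (old_state & LW_FLAG_HAS_WAITERS))
		LWLockWakeup(lock);

	return false;
}

Note the return convention here is success = true, unlike
LWLockAttemptLock(), just to keep the sketch self-describing.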

Greetings,

Andres Freund


#34Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#33)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Dec 8, 2015 at 1:04 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-12-08 12:53:49 +0300, Alexander Korotkov wrote:

This is why atomic increment *could be* cheaper than a loop over CAS, and
it is worth experimenting. Another idea is that we can put arbitrary logic
between lwarx and stwcx. Thus, we can implement PinBuffer using a single
lwarx/stwcx loop, which could be better than a loop of CAS.

You can't really put that much between an ll/sc - the hardware is only
able to track a very limited number of cacheline references.

I have some doubts about this, but I didn't find the place where it's
explicitly documented. In the case of LWLockAttemptLock there is not very
much between lwarx/stwcx: 4 instructions, while CAS has 2 instructions.
Could you please share some link to the docs, if any?

3) lwlock-increment.patch – LWLockAttemptLock changes state using an atomic
increment operation instead of a loop of CAS. This patch does it for
LWLockAttemptLock like pinunpin-increment.patch does for PinBuffer.
Actually, this patch is not directly related to the buffer manager.
However, it's nice to test a loop of CAS vs. atomic increment in different
places.

Yea, that's a worthwhile improvement. Actually it's how the first
versions of the lwlock patches worked - unfortunately I couldn't see big
differences on hardware I had available at the time.

There's some more trickiness required than what you have in your patch
(afaics at least). The problem is that when you 'optimistically'
increment by LW_VAL_SHARED and notice that there actually was another
locker, you are possibly, until you've 'fixed' the state, blocking new
exclusive lockers from acquiring the lock. So you additionally need to
do special handling in these cases, and check the queue more.

Agree. This patch needs to be carefully verified. Current experiments just
show that it is a promising direction for improvement. I'll come back with
a better version of this patch.

Also, after testing on large machines I have another observation to share.
For now, LWLock doesn't guarantee that an exclusive lock will ever be
acquired (assuming each shared lock duration is finite). That is because
when there is no exclusive lock, new shared locks aren't queued and the
LWLock state is changed directly. Thus, a process which tries to acquire an
exclusive lock has to wait for a gap in shared locks. But with high
concurrency for shared locks such a gap could happen very rarely, say never.

We did see this on a big Intel machine in practice. pgbench -S takes
the shared ProcArrayLock very frequently. Once some number of
connections is reached, new connections hang on getting the exclusive
ProcArrayLock. I think we could do some workaround for this problem.
For instance, when an exclusive lock waiter has waited past some
timeout, it could set a special bit which prevents others from getting
new shared locks.
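
A minimal sketch of that workaround (hypothetical names and bits, not
code from any posted patch): once a starving exclusive waiter sets the
bit, the shared fast path fails and new sharers have to queue.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define LOCK_EXCL_HELD     (1u << 31)
#define LOCK_EXCL_WAITING  (1u << 30)   /* the "no new sharers" bit */
#define LOCK_SHARED_ONE    1u           /* sharers counted in the low bits */

/* Fast path for shared lockers: fails once the bit is set, forcing
 * them into the ordered wait queue behind the exclusive waiter. */
static bool
shared_fast_path(atomic_uint_fast32_t *state)
{
    uint_fast32_t old = atomic_load(state);

    while (!(old & (LOCK_EXCL_HELD | LOCK_EXCL_WAITING)))
    {
        if (atomic_compare_exchange_weak(state, &old,
                                         old + LOCK_SHARED_ONE))
            return true;        /* took the lock directly, as today */
    }
    return false;               /* caller must enqueue and sleep */
}

/* A waiter that decides it has been starved for too long flips the
 * bit; clearing it again on acquisition is omitted here. */
static void
declare_starvation(atomic_uint_fast32_t *state)
{
    atomic_fetch_or(state, LOCK_EXCL_WAITING);
}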

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#35Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#34)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Dec 8, 2015 at 3:56 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Agreed. This patch needs to be carefully verified. Current experiments
just show that it is a promising direction for improvement. I'll come
back with a better version of this patch.

Also, after testing on large machines I have another observation to
share. For now, LWLock doesn't guarantee that an exclusive lock will
ever be acquired (assuming each shared lock duration is finite). That's
because when there is no exclusive lock, new shared locks aren't queued
and the LWLock state is changed directly. Thus, a process which tries
to acquire an exclusive lock has to wait for a gap in the shared locks.

I think this has the potential to starve exclusive lockers in the worst case.

But with high concurrency for the shared lock, such a gap could occur
very rarely, or never.

We did see this on a big Intel machine in practice. pgbench -S takes
the shared ProcArrayLock very frequently. Once some number of
connections is reached, new connections hang on getting the exclusive
ProcArrayLock. I think we could do some workaround for this problem.
For instance, when an exclusive lock waiter has waited past some
timeout, it could set a special bit which prevents others from getting
new shared locks.

I think a timeout-based solution would lead to giving priority to
exclusive lock waiters (assume a case where each exclusive lock waiter
times out one after another) and make shared lockers wait, and a
timer-based solution might turn out to be costly for general cases
where the wait is not so long. Another way could be to check whether
the exclusive locker has had to go through repeated waits a couple of
times, and then set such a bit.
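
A sketch of that retry-count variant, reusing the hypothetical layout
from the sketch above; wait_until_woken() stands in for the real
queue-and-sleep machinery:

#include <stdatomic.h>
#include <stdint.h>

#define LOCK_EXCL_HELD       (1u << 31)
#define LOCK_EXCL_WAITING    (1u << 30)
#define LOCK_SHARED_MASK     (LOCK_EXCL_WAITING - 1)
#define EXCL_RETRY_THRESHOLD 10

static void wait_until_woken(void) { /* park on a semaphore in reality */ }

static void
acquire_exclusive(atomic_uint_fast32_t *state)
{
    int retries = 0;

    for (;;)
    {
        uint_fast32_t cur = atomic_load(state);

        /* Free: no exclusive holder and no shared holders; the
         * WAITING bit is carried over into the locked state. */
        if (!(cur & LOCK_EXCL_HELD) && (cur & LOCK_SHARED_MASK) == 0)
        {
            if (atomic_compare_exchange_weak(state, &cur,
                                             cur | LOCK_EXCL_HELD))
                return;         /* acquired; clearing WAITING omitted */
            continue;           /* lost a race, retry immediately */
        }

        /* The trigger is a retry count rather than elapsed time: only
         * after being passed over several times does the waiter shut
         * off the shared fast path. */
        if (++retries >= EXCL_RETRY_THRESHOLD)
            atomic_fetch_or(state, LOCK_EXCL_WAITING);

        wait_until_woken();
    }
}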

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#36Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#35)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Dec 8, 2015 at 6:00 PM, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Tue, Dec 8, 2015 at 3:56 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Agreed. This patch needs to be carefully verified. Current experiments
just show that it is a promising direction for improvement. I'll come
back with a better version of this patch.

Also, after testing on large machines I have another observation to
share. For now, LWLock doesn't guarantee that an exclusive lock will
ever be acquired (assuming each shared lock duration is finite). That's
because when there is no exclusive lock, new shared locks aren't queued
and the LWLock state is changed directly. Thus, a process which tries
to acquire an exclusive lock has to wait for a gap in the shared locks.

I think this has the potential to starve exclusive lockers in the worst case.

But with high concurrency for the shared lock, such a gap could occur
very rarely, or never.

We did see this on a big Intel machine in practice. pgbench -S takes
the shared ProcArrayLock very frequently. Once some number of
connections is reached, new connections hang on getting the exclusive
ProcArrayLock. I think we could do some workaround for this problem.
For instance, when an exclusive lock waiter has waited past some
timeout, it could set a special bit which prevents others from getting
new shared locks.

I think a timeout-based solution would lead to giving priority to
exclusive lock waiters (assume a case where each exclusive lock waiter
times out one after another) and make shared lockers wait, and a
timer-based solution might turn out to be costly for general cases
where the wait is not so long.

Since all lwlock waiters are ordered in the queue, we can let only the
first waiter set this bit. Anyway, once the bit is set, shared lockers
would be added to the queue. They would get the lock in queue order.

Another way could be to check whether the exclusive locker has had to
go through repeated waits a couple of times, and then set such a bit.

I'm not sure what you mean by repeated wait. Do you mean the exclusive
locker was woken up twice by timeout? Because now, without a timeout,
the exclusive locker wouldn't be woken up until all shared locks are
released.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#37Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#36)
Re: Move PinBuffer and UnpinBuffer to atomics

On Wed, Dec 9, 2015 at 2:17 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Tue, Dec 8, 2015 at 6:00 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Tue, Dec 8, 2015 at 3:56 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Agreed. This patch needs to be carefully verified. Current experiments
just show that it is a promising direction for improvement. I'll come
back with a better version of this patch.

Also, after testing on large machines I have another observation to
share. For now, LWLock doesn't guarantee that an exclusive lock will
ever be acquired (assuming each shared lock duration is finite). That's
because when there is no exclusive lock, new shared locks aren't queued
and the LWLock state is changed directly. Thus, a process which tries
to acquire an exclusive lock has to wait for a gap in the shared locks.

I think this has the potential to starve exclusive lockers in the worst case.

But with high concurrency for the shared lock, such a gap could occur
very rarely, or never.

We did see this on a big Intel machine in practice. pgbench -S takes
the shared ProcArrayLock very frequently. Once some number of
connections is reached, new connections hang on getting the exclusive
ProcArrayLock. I think we could do some workaround for this problem.
For instance, when an exclusive lock waiter has waited past some
timeout, it could set a special bit which prevents others from getting
new shared locks.

I think a timeout-based solution would lead to giving priority to
exclusive lock waiters (assume a case where each exclusive lock waiter
times out one after another) and make shared lockers wait, and a
timer-based solution might turn out to be costly for general cases
where the wait is not so long.

Since all lwlock waiters are ordered in the queue, we can let only the
first waiter set this bit.

That's okay, but still, every time an exclusive locker wakes up, the
threshold time for its wait might already be over and it will set the
bit. In theory that looks okay, but compared to the current algorithm
it will cause more shared lockers to be added to the wait queue.

Anyway, once the bit is set, shared lockers would be added to the
queue. They would get the lock in queue order.

Yes, that's right, but I think in general the solution to this problem
should be: don't let any exclusive locker starve, while still allowing
as many shared lockers as possible. I think it is important here how we
define starving; should it be based on time or something else? I find
a timer-based solution somewhat less suitable, but maybe it is okay if
there is no better way.

Another way could be to check whether the exclusive locker has had to
go through repeated waits a couple of times, and then set such a bit.

I'm not sure what you mean by repeated wait. Do you mean the exclusive
locker was woken up twice by timeout?

I mean to say that once the exclusive locker is woken up, it again
retries to acquire the lock as it does today, but if it finds that the
number of retries is greater than a certain threshold (let us say 10),
then we set the bit.

Because now, without a timeout, the exclusive locker wouldn't be woken
up until all shared locks are released.

Does LWLockWakeup() work that way? I thought it works such that once an
exclusive locker is encountered in the wait queue, it just wakes that
one and won't try to wake any further waiters.
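
For reference, one simplified reading of the wakeup rule being
discussed (a sketch over a plain singly linked queue, not the actual
lwlock.c code):

#include <stdbool.h>
#include <stddef.h>

typedef enum { WAIT_SHARED, WAIT_EXCLUSIVE } WaitMode;

typedef struct Waiter
{
    WaitMode        mode;
    struct Waiter  *next;
} Waiter;

static void wake(Waiter *w) { (void) w; /* post the proc semaphore in reality */ }

/* Simplified wakeup over a singly linked queue; returns the new head.
 * If the head waiter is exclusive, wake only it; otherwise wake the
 * leading run of shared waiters and leave the exclusive one queued. */
static Waiter *
wakeup(Waiter *head)
{
    bool woke_shared = false;

    while (head != NULL)
    {
        Waiter *w = head;

        if (w->mode == WAIT_EXCLUSIVE && woke_shared)
            break;                      /* exclusive waiter stays queued */

        head = w->next;
        wake(w);

        if (w->mode == WAIT_EXCLUSIVE)
            break;                      /* woke one exclusive waiter: stop */
        woke_shared = true;
    }
    return head;
}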

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#38Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#37)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Dec 10, 2015 at 9:26 AM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Wed, Dec 9, 2015 at 2:17 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Tue, Dec 8, 2015 at 6:00 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Tue, Dec 8, 2015 at 3:56 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Agreed. This patch needs to be carefully verified. Current experiments
just show that it is a promising direction for improvement. I'll come
back with a better version of this patch.

Also, after testing on large machines I have another observation to
share. For now, LWLock doesn't guarantee that an exclusive lock will
ever be acquired (assuming each shared lock duration is finite). That's
because when there is no exclusive lock, new shared locks aren't queued
and the LWLock state is changed directly. Thus, a process which tries
to acquire an exclusive lock has to wait for a gap in the shared locks.

I think this has the potential to starve exclusive lockers in the worst case.

But with high concurrency for the shared lock, such a gap could occur
very rarely, or never.

We did see this on a big Intel machine in practice. pgbench -S takes
the shared ProcArrayLock very frequently. Once some number of
connections is reached, new connections hang on getting the exclusive
ProcArrayLock. I think we could do some workaround for this problem.
For instance, when an exclusive lock waiter has waited past some
timeout, it could set a special bit which prevents others from getting
new shared locks.

I think a timeout-based solution would lead to giving priority to
exclusive lock waiters (assume a case where each exclusive lock waiter
times out one after another) and make shared lockers wait, and a
timer-based solution might turn out to be costly for general cases
where the wait is not so long.

Since all lwlock waiters are ordered in the queue, we can let only the
first waiter set this bit.

That's okay, but still, every time an exclusive locker wakes up, the
threshold time for its wait might already be over and it will set the
bit. In theory that looks okay, but compared to the current algorithm
it will cause more shared lockers to be added to the wait queue.

Anyway, once the bit is set, shared lockers would be added to the
queue. They would get the lock in queue order.

Yes, that's right, but I think in general the solution to this problem
should be: don't let any exclusive locker starve, while still allowing
as many shared lockers as possible. I think it is important here how we
define starving; should it be based on time or something else? I find
a timer-based solution somewhat less suitable, but maybe it is okay if
there is no better way.

Yes, we probably should find something better.

Another way could be to check whether the exclusive locker has had to
go through repeated waits a couple of times, and then set such a bit.

I'm not sure what you mean by repeated wait. Do you mean the exclusive
locker was woken up twice by timeout?

I mean to say that once the exclusive locker is woken up, it again
retries to acquire the lock as it does today, but if it finds that the
number of retries is greater than a certain threshold (let us say 10),
then we set the bit.

Yes, there is a retry cycle in the LWLockAcquire function. The retry
case is when a waiter is woken up but someone else steals the lock
before it. A lock waiter is woken up by the lock releaser only when
the lock becomes free. But in the case of high concurrency for the
shared lock, it almost never becomes free. So, the exclusive locker
would never be woken up. I'm pretty sure this is what happens on the
big Intel machine while we run the benchmark. So, relying on the
number of retries wouldn't work in this case.
I'll run tests to verify whether retries happen in our case.
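
Schematically, the cycle in question looks like this (a simplified
sketch; the stubs stand in for the real queue and semaphore machinery,
and the real LWLockAcquire also handles queue-insertion races and
interrupts):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EXCL_BIT (1u << 31)

static bool
try_exclusive(atomic_uint_fast32_t *state)
{
    uint_fast32_t expected = 0;          /* free: no sharers, no holder */
    return atomic_compare_exchange_strong(state, &expected, EXCL_BIT);
}

/* Stubs standing in for the real queue and semaphore machinery. */
static void enqueue_self(void) {}
static void dequeue_self(void) {}
static void sleep_until_signalled(void) {}

/* The cycle: a releaser signals waiters only when the lock became
 * free; under a nonstop stream of sharers that may never happen, and
 * even after a wakeup the lock can be stolen before the retry. */
static void
acquire_exclusive(atomic_uint_fast32_t *state)
{
    for (;;)
    {
        if (try_exclusive(state))
            return;
        enqueue_self();
        if (try_exclusive(state))        /* recheck after queueing */
        {
            dequeue_self();
            return;
        }
        sleep_until_signalled();         /* each pass after this is a retry */
    }
}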

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#39Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#38)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2015-12-11 15:56:46 +0300, Alexander Korotkov wrote:

On Thu, Dec 10, 2015 at 9:26 AM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

We did see this on a big Intel machine in practice. pgbench -S takes
the shared ProcArrayLock very frequently. Once some number of
connections is reached, new connections hang on getting the exclusive
ProcArrayLock. I think we could do some workaround for this problem.
For instance, when an exclusive lock waiter has waited past some
timeout, it could set a special bit which prevents others from getting
new shared locks.

Yes, that's right, but I think in general the solution to this problem
should be: don't let any exclusive locker starve, while still allowing
as many shared lockers as possible. I think it is important here how we
define starving; should it be based on time or something else? I find
a timer-based solution somewhat less suitable, but maybe it is okay if
there is no better way.

Yes, we probably should find something better.

Another way could be to check whether the exclusive locker has had to
go through repeated waits a couple of times, and then set such a bit.

I'm not sure what you mean by repeated wait. Do you mean the exclusive
locker was woken up twice by timeout?

I mean to say that once the exclusive locker is woken up, it again
retries to acquire the lock as it does today, but if it finds that the
number of retries is greater than a certain threshold (let us say 10),
then we set the bit.

Yes, there is a retry cycle in the LWLockAcquire function. The retry
case is when a waiter is woken up but someone else steals the lock
before it. A lock waiter is woken up by the lock releaser only when
the lock becomes free. But in the case of high concurrency for the
shared lock, it almost never becomes free. So, the exclusive locker
would never be woken up. I'm pretty sure this is what happens on the
big Intel machine while we run the benchmark. So, relying on the
number of retries wouldn't work in this case.
I'll run tests to verify whether retries happen in our case.

I seriously doubt that making lwlocks fairer is the right way to go
here. In my testing the "unfairness" is essential to performance - the
number of context switches otherwise increases massively.

I think in this case it's better to work on making the lock less
contended, rather than making micro-optimizations around the locking
behaviour.

Andres


#40Amit Kapila
amit.kapila16@gmail.com
In reply to: Andres Freund (#39)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Dec 11, 2015 at 6:34 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-12-11 15:56:46 +0300, Alexander Korotkov wrote:

Yes, there is a retry cycle in the LWLockAcquire function. The retry
case is when a waiter is woken up but someone else steals the lock
before it. A lock waiter is woken up by the lock releaser only when
the lock becomes free. But in the case of high concurrency for the
shared lock, it almost never becomes free. So, the exclusive locker
would never be woken up. I'm pretty sure this is what happens on the
big Intel machine while we run the benchmark. So, relying on the
number of retries wouldn't work in this case.
I'll run tests to verify whether retries happen in our case.

That makes sense, and if retries never happen, then I think changing
LWLockRelease() such that it wakes the waiters if there are waiters on
a lock and it has not woken them for some threshold number of times,
or something like that, might work.
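
A sketch of that release-side tweak (hypothetical fields and threshold;
queue_nonempty() and wakeup_waiters() are stand-ins): count the
releases that passed the waiters over, and force a wakeup once a
threshold is crossed.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SKIP_THRESHOLD 16

typedef struct
{
    atomic_uint_fast32_t state;      /* count of shared holders */
    atomic_uint_fast32_t skipped;    /* releases that woke nobody */
} Lock;

static bool queue_nonempty(Lock *l) { (void) l; return true; }
static void wakeup_waiters(Lock *l) { (void) l; }

static void
release_shared(Lock *l)
{
    uint_fast32_t old = atomic_fetch_sub(&l->state, 1);

    if (old == 1)
    {
        /* The lock became free: the normal wakeup path. */
        atomic_store(&l->skipped, 0);
        wakeup_waiters(l);
    }
    else if (queue_nonempty(l) &&
             atomic_fetch_add(&l->skipped, 1) + 1 >= SKIP_THRESHOLD)
    {
        /* Still held by other sharers, but the waiters have now been
         * passed over too many times: wake them anyway, e.g. so an
         * exclusive waiter gets a chance to set an anti-starvation
         * bit. */
        atomic_store(&l->skipped, 0);
        wakeup_waiters(l);
    }
}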

I seriously doubt that making lwlocks fairer is the right way to go
here. In my testing the "unfairness" is essential to performance - the
number of context switches otherwise increases massively.

Agreed; if the change being discussed hurts in any kind of scenario,
then we had better not do it. OTOH, the case described by Alexander
seems to be genuine, and I have seen a similar complaint from a
customer in the past for another database I worked on, where the
reason for the problem was the same.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#41Michael Paquier
michael.paquier@gmail.com
In reply to: Amit Kapila (#40)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Dec 13, 2015 at 11:05 PM, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Fri, Dec 11, 2015 at 6:34 PM, Andres Freund <andres@anarazel.de> wrote:

On 2015-12-11 15:56:46 +0300, Alexander Korotkov wrote:

Yes, there is a retry cycle in the LWLockAcquire function. The retry
case is when a waiter is woken up but someone else steals the lock
before it. A lock waiter is woken up by the lock releaser only when
the lock becomes free. But in the case of high concurrency for the
shared lock, it almost never becomes free. So, the exclusive locker
would never be woken up. I'm pretty sure this is what happens on the
big Intel machine while we run the benchmark. So, relying on the
number of retries wouldn't work in this case.
I'll run tests to verify whether retries happen in our case.

That makes sense, and if retries never happen, then I think changing
LWLockRelease() such that it wakes the waiters if there are waiters on
a lock and it has not woken them for some threshold number of times,
or something like that, might work.

I seriously doubt that making lwlocks fairer is the right way to go
here. In my testing the "unfairness" is essential to performance - the
number of context switches otherwise increases massively.

Agreed; if the change being discussed hurts in any kind of scenario,
then we had better not do it. OTOH, the case described by Alexander
seems to be genuine, and I have seen a similar complaint from a
customer in the past for another database I worked on, where the
reason for the problem was the same.

I have moved this patch to the next CF.
--
Michael


#42Dilip Kumar
dilipbalaut@gmail.com
In reply to: Michael Paquier (#41)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Dec 24, 2015 at 8:26 AM, Michael Paquier <michael.paquier@gmail.com>
wrote:

On Sun, Dec 13, 2015 at 11:05 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Fri, Dec 11, 2015 at 6:34 PM, Andres Freund <andres@anarazel.de>
wrote:

I was looking into this patch. Overall, the patch looks good to me. I
want to know the current state of this patch: is there some pending
task left in it?

The patch was not applying on head, so I have rebased it; the rebased
version is attached to this mail.

I have also done some performance testing.

Summary:
---------------
1. In my test for a read-only workload I have observed some improvement
for scale factor 1000.
2. I have also observed some regression with scale factor 300 (I can't
say whether it's an actual regression or just run-to-run variance). I
thought that might be a problem with lower scale factors, so I also
tested with scale factor 100, and s.f. 100 looks fine.

Machine details:
cpu : POWER8
cores: 24 (192 with HT)

Non-default parameters:
------------------------
shared_buffers = 30GB
max_wal_size = 10GB
max_connections = 500

Test1:
pgbench -i -s 1000 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client      Base    Patched

     1     19753      19493
    32    344059     336773
    64    495708     540425
   128    564358     685212
   256    466562     639059

Test2:
pgbench -i -s 300 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client      Base    Patched

     1     20555      19404
    32    375919     332670
    64    509067     440680
   128    431346     415121
   256    380926     379176

Test3:
pgbench -i -s 100 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client      Base    Patched

     1     20555      19404
    32    375919     332670
    64    509067     440680
   128    431346     415121
   256    380926     379176

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

Attachments:

pinunpin-cas_rebased.patchapplication/x-patch; name=pinunpin-cas_rebased.patchDownload
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index b423aa7..04862d7 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -121,12 +121,9 @@ InitBufferPool(void)
 			BufferDesc *buf = GetBufferDescriptor(i);
 
 			CLEAR_BUFFERTAG(buf->tag);
-			buf->flags = 0;
-			buf->usage_count = 0;
-			buf->refcount = 0;
-			buf->wait_backend_pid = 0;
 
-			SpinLockInit(&buf->buf_hdr_lock);
+			pg_atomic_init_u32(&buf->state, 0);
+			buf->wait_backend_pid = 0;
 
 			buf->buf_id = i;
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 7141eb8..f69faeb 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -51,7 +51,6 @@
 #include "utils/resowner_private.h"
 #include "utils/timestamp.h"
 
-
 /* Note: these two macros only work on shared buffers, not local ones! */
 #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
@@ -126,7 +125,7 @@ static BufferDesc *PinCountWaitBuf = NULL;
  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
  * memory allocations in NewPrivateRefCountEntry() which can be important
- * because in some scenarios it's called with a spinlock held...
+ * because in some scenarios it's called with a header lock held...
  */
 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
 static HTAB *PrivateRefCountHash = NULL;
@@ -775,9 +774,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		if (isLocalBuf)
 		{
-			/* Only need to adjust flags */
-			Assert(bufHdr->flags & BM_VALID);
-			bufHdr->flags &= ~BM_VALID;
+			Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
+			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
 		}
 		else
 		{
@@ -789,8 +787,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			do
 			{
 				LockBufHdr(bufHdr);
-				Assert(bufHdr->flags & BM_VALID);
-				bufHdr->flags &= ~BM_VALID;
+				Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
+				pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
 				UnlockBufHdr(bufHdr);
 			} while (!StartBufferIO(bufHdr, true));
 		}
@@ -808,7 +806,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * it's not been recycled) but come right back here to try smgrextend
 	 * again.
 	 */
-	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
+	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
 
 	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
@@ -886,7 +884,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	if (isLocalBuf)
 	{
 		/* Only need to adjust flags */
-		bufHdr->flags |= BM_VALID;
+		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
 	}
 	else
 	{
@@ -940,7 +938,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	BufferTag	oldTag;			/* previous identity of selected buffer */
 	uint32		oldHash;		/* hash value for oldTag */
 	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
-	BufFlags	oldFlags;
+	uint32		oldFlags;
 	int			buf_id;
 	BufferDesc *buf;
 	bool		valid;
@@ -1002,24 +1000,26 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	/* Loop here in case we have to try another victim buffer */
 	for (;;)
 	{
+		uint32	state;
+
 		/*
-		 * Ensure, while the spinlock's not yet held, that there's a free
+		 * Ensure, while the header lock isn't yet held, that there's a free
 		 * refcount entry.
 		 */
 		ReservePrivateRefCountEntry();
 
 		/*
 		 * Select a victim buffer.  The buffer is returned with its header
-		 * spinlock still held!
+		 * lock still held!
 		 */
-		buf = StrategyGetBuffer(strategy);
+		buf = StrategyGetBuffer(strategy, &state);
 
-		Assert(buf->refcount == 0);
+		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
 
-		/* Must copy buffer flags while we still hold the spinlock */
-		oldFlags = buf->flags;
+		/* Must copy buffer flags while we still hold the header lock */
+		oldFlags = state & BUF_FLAG_MASK;
 
-		/* Pin the buffer and then release the buffer spinlock */
+		/* Pin the buffer and then release the buffer header lock */
 		PinBuffer_Locked(buf);
 
 		/*
@@ -1204,7 +1204,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		/*
 		 * Need to lock the buffer header too in order to change its tag.
 		 */
-		LockBufHdr(buf);
+		state = LockBufHdr(buf);
 
 		/*
 		 * Somebody could have pinned or re-dirtied the buffer while we were
@@ -1212,8 +1212,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * recycle this buffer; we must undo everything we've done and start
 		 * over with a new victim buffer.
 		 */
-		oldFlags = buf->flags;
-		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
+		oldFlags = state & BUF_FLAG_MASK;
+		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
 			break;
 
 		UnlockBufHdr(buf);
@@ -1234,12 +1234,19 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * 1 so that the buffer can survive one clock-sweep pass.)
 	 */
 	buf->tag = newTag;
-	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
+	pg_atomic_fetch_and_u32(&buf->state,
+							~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
+							  BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
+							  BM_PERMANENT |
+							  BUF_USAGECOUNT_MASK));
 	if (relpersistence == RELPERSISTENCE_PERMANENT)
-		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
+		pg_atomic_fetch_or_u32(&buf->state,
+							   BM_TAG_VALID | BM_PERMANENT |
+							   BUF_USAGECOUNT_ONE);
 	else
-		buf->flags |= BM_TAG_VALID;
-	buf->usage_count = 1;
+		pg_atomic_fetch_or_u32(&buf->state,
+							   BM_TAG_VALID |
+							   BUF_USAGECOUNT_ONE);
 
 	UnlockBufHdr(buf);
 
@@ -1269,7 +1276,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
  * freelist.
  *
- * The buffer header spinlock must be held at entry.  We drop it before
+ * The buffer header lock must be held at entry.  We drop it before
  * returning.  (This is sane because the caller must have locked the
  * buffer in order to be sure it should be dropped.)
  *
@@ -1288,9 +1295,10 @@ InvalidateBuffer(BufferDesc *buf)
 	BufferTag	oldTag;
 	uint32		oldHash;		/* hash value for oldTag */
 	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
-	BufFlags	oldFlags;
+	uint32		oldFlags;
+	uint32		state;
 
-	/* Save the original buffer tag before dropping the spinlock */
+	/* Save the original buffer tag before dropping the header lock */
 	oldTag = buf->tag;
 
 	UnlockBufHdr(buf);
@@ -1312,7 +1320,7 @@ retry:
 	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
 
 	/* Re-lock the buffer header */
-	LockBufHdr(buf);
+	state = LockBufHdr(buf);
 
 	/* If it's changed while we were waiting for lock, do nothing */
 	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
@@ -1331,7 +1339,7 @@ retry:
 	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
 	 * be busy-looping here.)
 	 */
-	if (buf->refcount != 0)
+	if (BUF_STATE_GET_REFCOUNT(state) != 0)
 	{
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
@@ -1346,10 +1354,9 @@ retry:
 	 * Clear out the buffer's tag and flags.  We must do this to ensure that
 	 * linear scans of the buffer array don't think the buffer is valid.
 	 */
-	oldFlags = buf->flags;
+	oldFlags = state & BUF_FLAG_MASK;
 	CLEAR_BUFFERTAG(buf->tag);
-	buf->flags = 0;
-	buf->usage_count = 0;
+	pg_atomic_fetch_and_u32(&buf->state, BM_LOCKED | ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK));
 
 	UnlockBufHdr(buf);
 
@@ -1383,6 +1390,7 @@ void
 MarkBufferDirty(Buffer buffer)
 {
 	BufferDesc *bufHdr;
+	uint32		state;
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -1399,14 +1407,14 @@ MarkBufferDirty(Buffer buffer)
 	/* unfortunately we can't check if the lock is held exclusively */
 	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
-	LockBufHdr(bufHdr);
+	state = LockBufHdr(bufHdr);
 
-	Assert(bufHdr->refcount > 0);
+	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
 
 	/*
 	 * If the buffer was not dirty already, do vacuum accounting.
 	 */
-	if (!(bufHdr->flags & BM_DIRTY))
+	if (!(state & BM_DIRTY))
 	{
 		VacuumPageDirty++;
 		pgBufferUsage.shared_blks_dirtied++;
@@ -1414,7 +1422,7 @@ MarkBufferDirty(Buffer buffer)
 			VacuumCostBalance += VacuumCostPageDirty;
 	}
 
-	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+	pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
 
 	UnlockBufHdr(bufHdr);
 }
@@ -1456,7 +1464,7 @@ ReleaseAndReadBuffer(Buffer buffer,
 		else
 		{
 			bufHdr = GetBufferDescriptor(buffer - 1);
-			/* we have pin, so it's ok to examine tag without spinlock */
+			/* we have pin, so it's ok to examine tag without header lock */
 			if (bufHdr->tag.blockNum == blockNum &&
 				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
 				bufHdr->tag.forkNum == forkNum)
@@ -1484,7 +1492,7 @@ ReleaseAndReadBuffer(Buffer buffer,
  * Note that ResourceOwnerEnlargeBuffers must have been done already.
  *
  * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
- * some callers to avoid an extra spinlock cycle.
+ * some callers to avoid an extra header lock cycle.
  */
 static bool
 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
@@ -1497,23 +1505,40 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 
 	if (ref == NULL)
 	{
+		uint32 state;
+		uint32 oldstate;
+
 		ReservePrivateRefCountEntry();
 		ref = NewPrivateRefCountEntry(b);
 
-		LockBufHdr(buf);
-		buf->refcount++;
-		if (strategy == NULL)
-		{
-			if (buf->usage_count < BM_MAX_USAGE_COUNT)
-				buf->usage_count++;
-		}
-		else
+
+		state = pg_atomic_read_u32(&buf->state);
+		oldstate = state;
+
+		while (true)
 		{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
+			/* spin-wait till lock is free */
+			while (state & BM_LOCKED)
+			{
+				pg_spin_delay();
+				state = pg_atomic_read_u32(&buf->state);
+				oldstate = state;
+			}
+
+			/* increase refcount */
+			state += BUF_REFCOUNT_ONE;
+
+			/* increase usagecount unless already max */
+			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
+				state += BUF_USAGECOUNT_ONE;
+
+			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
+				break;
+
+			/* get ready for next loop, oldstate has been updated by cas */
+			state = oldstate;
 		}
-		result = (buf->flags & BM_VALID) != 0;
-		UnlockBufHdr(buf);
+		result = (state & BM_VALID) != 0;
 	}
 	else
 	{
@@ -1529,9 +1554,9 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 
 /*
  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
- * The spinlock is released before return.
+ * The header lock is released before return.
  *
- * As this function is called with the spinlock held, the caller has to
+ * As this function is called with the header lock held, the caller has to
  * previously call ReservePrivateRefCountEntry().
  *
  * Currently, no callers of this function want to modify the buffer's
@@ -1542,7 +1567,7 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
  * Also all callers only ever use this function when it's known that the
  * buffer can't have a preexisting pin by this backend. That allows us to skip
  * searching the private refcount array & hash, which is a boon, because the
- * spinlock is still held.
+ * header lock is still held.
  *
  * Note: use of this routine is frequently mandatory, not just an optimization
  * to save a spin lock/unlock cycle, because we need to pin a buffer before
@@ -1556,11 +1581,11 @@ PinBuffer_Locked(BufferDesc *buf)
 
 	/*
 	 * As explained, We don't expect any preexisting pins. That allows us to
-	 * manipulate the PrivateRefCount after releasing the spinlock
+	 * manipulate the PrivateRefCount after releasing the header lock
 	 */
 	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
 
-	buf->refcount++;
+	pg_atomic_fetch_add_u32(&buf->state, 1);
 	UnlockBufHdr(buf);
 
 	b = BufferDescriptorGetBuffer(buf);
@@ -1596,29 +1621,39 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 	ref->refcount--;
 	if (ref->refcount == 0)
 	{
+		uint32 state;
+
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
 		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
-		LockBufHdr(buf);
-
-		/* Decrement the shared reference count */
-		Assert(buf->refcount > 0);
-		buf->refcount--;
+		/* Decrement the shared reference count.
+		 *
+		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
+		 * currently all manipulation of ->state for shared buffers is through
+		 * atomics.
+		 */
+		state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
+		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
 
 		/* Support LockBufferForCleanup() */
-		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
-			buf->refcount == 1)
+		if (state & BM_PIN_COUNT_WAITER)
 		{
-			/* we just released the last pin other than the waiter's */
-			int			wait_backend_pid = buf->wait_backend_pid;
+			state = LockBufHdr(buf);
 
-			buf->flags &= ~BM_PIN_COUNT_WAITER;
-			UnlockBufHdr(buf);
-			ProcSendSignal(wait_backend_pid);
+			if ((state & BM_PIN_COUNT_WAITER) && BUF_STATE_GET_REFCOUNT(state) == 1)
+			{
+				/* we just released the last pin other than the waiter's */
+				int         wait_backend_pid = buf->wait_backend_pid;
+
+				pg_atomic_fetch_and_u32(&buf->state,
+										~BM_PIN_COUNT_WAITER);
+				UnlockBufHdr(buf);
+				ProcSendSignal(wait_backend_pid);
+			}
+			else
+				UnlockBufHdr(buf);
 		}
-		else
-			UnlockBufHdr(buf);
 
 		ForgetPrivateRefCountEntry(ref);
 	}
@@ -1637,6 +1672,7 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 static void
 BufferSync(int flags)
 {
+	uint32		state;
 	int			buf_id;
 	int			num_to_scan;
 	int			num_to_write;
@@ -1677,14 +1713,15 @@ BufferSync(int flags)
 		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 
 		/*
-		 * Header spinlock is enough to examine BM_DIRTY, see comment in
+		 * Header lock is enough to examine BM_DIRTY, see comment in
 		 * SyncOneBuffer.
 		 */
-		LockBufHdr(bufHdr);
+		state = LockBufHdr(bufHdr);
 
-		if ((bufHdr->flags & mask) == mask)
+		if ((state & mask) == mask)
 		{
-			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+			pg_atomic_fetch_or_u32(&bufHdr->state,
+								   BM_CHECKPOINT_NEEDED);
 			num_to_write++;
 		}
 
@@ -1723,7 +1760,7 @@ BufferSync(int flags)
 		 * write the buffer though we didn't need to.  It doesn't seem worth
 		 * guarding against this, though.
 		 */
-		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
+		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
 		{
 			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
 			{
@@ -2083,6 +2120,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 {
 	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 	int			result = 0;
+	uint32		state;
 
 	ReservePrivateRefCountEntry();
 
@@ -2095,10 +2133,13 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	 * don't worry because our checkpoint.redo points before log record for
 	 * upcoming changes and so we are not required to write such dirty buffer.
 	 */
-	LockBufHdr(bufHdr);
+	state = LockBufHdr(bufHdr);
 
-	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
+		BUF_STATE_GET_USAGECOUNT(state) == 0)
+	{
 		result |= BUF_REUSABLE;
+	}
 	else if (skip_recently_used)
 	{
 		/* Caller told us not to write recently-used buffers */
@@ -2106,7 +2147,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 		return result;
 	}
 
-	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
+	if (!(state & BM_VALID) || !(state & BM_DIRTY))
 	{
 		/* It's clean, so nothing to do */
 		UnlockBufHdr(bufHdr);
@@ -2258,6 +2299,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	int32		loccount;
 	char	   *path;
 	BackendId	backend;
+	uint32		state;
 
 	Assert(BufferIsValid(buffer));
 	if (BufferIsLocal(buffer))
@@ -2275,12 +2317,13 @@ PrintBufferLeakWarning(Buffer buffer)
 
 	/* theoretically we should lock the bufhdr here */
 	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+	state = pg_atomic_read_u32(&buf->state);
 	elog(WARNING,
 		 "buffer refcount leak: [%03d] "
 		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
 		 buffer, path,
-		 buf->tag.blockNum, buf->flags,
-		 buf->refcount, loccount);
+		 buf->tag.blockNum, state & BUF_FLAG_MASK,
+		 BUF_STATE_GET_REFCOUNT(state), loccount);
 	pfree(path);
 }
 
@@ -2335,7 +2378,7 @@ BufferGetBlockNumber(Buffer buffer)
 	else
 		bufHdr = GetBufferDescriptor(buffer - 1);
 
-	/* pinned, so OK to read tag without spinlock */
+	/* pinned, so OK to read tag without lock */
 	return bufHdr->tag.blockNum;
 }
 
@@ -2358,7 +2401,7 @@ BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
 	else
 		bufHdr = GetBufferDescriptor(buffer - 1);
 
-	/* pinned, so OK to read tag without spinlock */
+	/* pinned, so OK to read tag without lock */
 	*rnode = bufHdr->tag.rnode;
 	*forknum = bufHdr->tag.forkNum;
 	*blknum = bufHdr->tag.blockNum;
@@ -2426,7 +2469,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	recptr = BufferGetLSN(buf);
 
 	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	buf->flags &= ~BM_JUST_DIRTIED;
+	pg_atomic_fetch_and_u32(&buf->state, ~BM_JUST_DIRTIED);
 	UnlockBufHdr(buf);
 
 	/*
@@ -2446,7 +2489,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
 	 */
-	if (buf->flags & BM_PERMANENT)
+	if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT)
 		XLogFlush(recptr);
 
 	/*
@@ -2534,13 +2577,13 @@ BufferIsPermanent(Buffer buffer)
 
 	/*
 	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
-	 * need not bother with the buffer header spinlock.  Even if someone else
+	 * need not bother with the buffer header lock.  Even if someone else
 	 * changes the buffer header flags while we're doing this, we assume that
 	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
 	 * old value or the new value, but not random garbage.
 	 */
 	bufHdr = GetBufferDescriptor(buffer - 1);
-	return (bufHdr->flags & BM_PERMANENT) != 0;
+	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
 }
 
 /*
@@ -2640,7 +2683,7 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
 			bufHdr->tag.forkNum == forkNum &&
 			bufHdr->tag.blockNum >= firstDelBlock)
-			InvalidateBuffer(bufHdr);	/* releases spinlock */
+			InvalidateBuffer(bufHdr);	/* releases lock */
 		else
 			UnlockBufHdr(bufHdr);
 	}
@@ -2738,7 +2781,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 
 		LockBufHdr(bufHdr);
 		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
-			InvalidateBuffer(bufHdr);	/* releases spinlock */
+			InvalidateBuffer(bufHdr);	/* releases lock */
 		else
 			UnlockBufHdr(bufHdr);
 	}
@@ -2780,7 +2823,7 @@ DropDatabaseBuffers(Oid dbid)
 
 		LockBufHdr(bufHdr);
 		if (bufHdr->tag.rnode.dbNode == dbid)
-			InvalidateBuffer(bufHdr);	/* releases spinlock */
+			InvalidateBuffer(bufHdr);	/* releases lock */
 		else
 			UnlockBufHdr(bufHdr);
 	}
@@ -2876,7 +2919,8 @@ FlushRelationBuffers(Relation rel)
 		{
 			bufHdr = GetLocalBufferDescriptor(i);
 			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
+				== (BM_VALID | BM_DIRTY))
 			{
 				ErrorContextCallback errcallback;
 				Page		localpage;
@@ -2897,7 +2941,7 @@ FlushRelationBuffers(Relation rel)
 						  localpage,
 						  false);
 
-				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
 
 				/* Pop the error context stack */
 				error_context_stack = errcallback.previous;
@@ -2925,7 +2969,8 @@ FlushRelationBuffers(Relation rel)
 
 		LockBufHdr(bufHdr);
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
+			== (BM_VALID | BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
@@ -2977,7 +3022,8 @@ FlushDatabaseBuffers(Oid dbid)
 
 		LockBufHdr(bufHdr);
 		if (bufHdr->tag.rnode.dbNode == dbid &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
+			== (BM_VALID | BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
@@ -3109,19 +3155,20 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 	 * This routine might get called many times on the same page, if we are
 	 * making the first scan after commit of an xact that added/deleted many
 	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
-	 * do this by not acquiring spinlock if it looks like the status bits are
+	 * do this by not acquiring header lock if it looks like the status bits are
 	 * already set.  Since we make this test unlocked, there's a chance we
 	 * might fail to notice that the flags have just been cleared, and failed
 	 * to reset them, due to memory-ordering issues.  But since this function
 	 * is only intended to be used in cases where failing to write out the
 	 * data would be harmless anyway, it doesn't really matter.
 	 */
-	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
+	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
 		(BM_DIRTY | BM_JUST_DIRTIED))
 	{
 		XLogRecPtr	lsn = InvalidXLogRecPtr;
 		bool		dirtied = false;
 		bool		delayChkpt = false;
+		uint32		state;
 
 		/*
 		 * If we need to protect hint bit updates from torn writes, WAL-log a
@@ -3132,7 +3179,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		 * We don't check full_page_writes here because that logic is included
 		 * when we call XLogInsert() since the value changes dynamically.
 		 */
-		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
+		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
 		{
 			/*
 			 * If we're in recovery we cannot dirty a page because of a hint.
@@ -3172,8 +3219,12 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		}
 
 		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (!(bufHdr->flags & BM_DIRTY))
+
+		state = pg_atomic_read_u32(&bufHdr->state);
+
+		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+
+		if (!(state & BM_DIRTY))
 		{
 			dirtied = true;		/* Means "will be dirtied by this action" */
 
@@ -3193,7 +3244,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 			if (!XLogRecPtrIsInvalid(lsn))
 				PageSetLSN(page, lsn);
 		}
-		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+
+		pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
+
 		UnlockBufHdr(bufHdr);
 
 		if (delayChkpt)
@@ -3231,9 +3284,9 @@ UnlockBuffers(void)
 		 * Don't complain if flag bit not set; it could have been reset but we
 		 * got a cancel/die interrupt before getting the signal.
 		 */
-		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
+		if ((pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER) != 0 &&
 			buf->wait_backend_pid == MyProcPid)
-			buf->flags &= ~BM_PIN_COUNT_WAITER;
+			pg_atomic_fetch_and_u32(&buf->state, ~BM_PIN_COUNT_WAITER);
 
 		UnlockBufHdr(buf);
 
@@ -3328,25 +3381,28 @@ LockBufferForCleanup(Buffer buffer)
 
 	for (;;)
 	{
+		uint32		state;
+
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (bufHdr->refcount == 1)
+		state = LockBufHdr(bufHdr);
+
+		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+		if (BUF_STATE_GET_REFCOUNT(state) == 1)
 		{
 			/* Successfully acquired exclusive lock with pincount 1 */
 			UnlockBufHdr(bufHdr);
 			return;
 		}
 		/* Failed, so mark myself as waiting for pincount 1 */
-		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
+		if (state & BM_PIN_COUNT_WAITER)
 		{
 			UnlockBufHdr(bufHdr);
 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 			elog(ERROR, "multiple backends attempting to wait for pincount 1");
 		}
 		bufHdr->wait_backend_pid = MyProcPid;
-		bufHdr->flags |= BM_PIN_COUNT_WAITER;
+		pg_atomic_fetch_or_u32(&bufHdr->state, BM_PIN_COUNT_WAITER);
 		PinCountWaitBuf = bufHdr;
 		UnlockBufHdr(bufHdr);
 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -3373,9 +3429,9 @@ LockBufferForCleanup(Buffer buffer)
 		 * better be safe.
 		 */
 		LockBufHdr(bufHdr);
-		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
+		if ((pg_atomic_read_u32(&bufHdr->state) & BM_PIN_COUNT_WAITER) != 0 &&
 			bufHdr->wait_backend_pid == MyProcPid)
-			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
+			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_PIN_COUNT_WAITER);
 		UnlockBufHdr(bufHdr);
 
 		PinCountWaitBuf = NULL;
@@ -3417,22 +3473,26 @@ bool
 ConditionalLockBufferForCleanup(Buffer buffer)
 {
 	BufferDesc *bufHdr;
+	uint32		state,
+				refcount;
 
 	Assert(BufferIsValid(buffer));
 
 	if (BufferIsLocal(buffer))
 	{
+		refcount = LocalRefCount[-buffer - 1];
 		/* There should be exactly one pin */
-		Assert(LocalRefCount[-buffer - 1] > 0);
-		if (LocalRefCount[-buffer - 1] != 1)
+		Assert(refcount > 0);
+		if (refcount != 1)
 			return false;
 		/* Nobody else to wait for */
 		return true;
 	}
 
 	/* There should be exactly one local pin */
-	Assert(GetPrivateRefCount(buffer) > 0);
-	if (GetPrivateRefCount(buffer) != 1)
+	refcount = GetPrivateRefCount(buffer);
+	Assert(refcount);
+	if (refcount != 1)
 		return false;
 
 	/* Try to acquire lock */
@@ -3440,9 +3500,11 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 		return false;
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
-	LockBufHdr(bufHdr);
-	Assert(bufHdr->refcount > 0);
-	if (bufHdr->refcount == 1)
+	state = LockBufHdr(bufHdr);
+	refcount = BUF_STATE_GET_REFCOUNT(state);
+
+	Assert(refcount > 0);
+	if (refcount == 1)
 	{
 		/* Successfully acquired exclusive lock with pincount 1 */
 		UnlockBufHdr(bufHdr);
@@ -3480,17 +3542,17 @@ WaitIO(BufferDesc *buf)
 	 */
 	for (;;)
 	{
-		BufFlags	sv_flags;
+		uint32		state;
 
 		/*
-		 * It may not be necessary to acquire the spinlock to check the flag
+		 * It may not be necessary to acquire the header lock to check the flag
 		 * here, but since this test is essential for correctness, we'd better
 		 * play it safe.
 		 */
-		LockBufHdr(buf);
-		sv_flags = buf->flags;
+		state = LockBufHdr(buf);
 		UnlockBufHdr(buf);
-		if (!(sv_flags & BM_IO_IN_PROGRESS))
+
+		if (!(state & BM_IO_IN_PROGRESS))
 			break;
 		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
 		LWLockRelease(BufferDescriptorGetIOLock(buf));
@@ -3518,6 +3580,8 @@ WaitIO(BufferDesc *buf)
 static bool
 StartBufferIO(BufferDesc *buf, bool forInput)
 {
+	uint32		state;
+
 	Assert(!InProgressBuf);
 
 	for (;;)
@@ -3528,9 +3592,9 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 		 */
 		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
-		LockBufHdr(buf);
+		state = LockBufHdr(buf);
 
-		if (!(buf->flags & BM_IO_IN_PROGRESS))
+		if (!(state & BM_IO_IN_PROGRESS))
 			break;
 
 		/*
@@ -3546,7 +3610,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 
 	/* Once we get here, there is definitely no I/O active on this buffer */
 
-	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
+	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
 	{
 		/* someone else already did the I/O */
 		UnlockBufHdr(buf);
@@ -3554,7 +3618,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 		return false;
 	}
 
-	buf->flags |= BM_IO_IN_PROGRESS;
+	pg_atomic_fetch_or_u32(&buf->state, BM_IO_IN_PROGRESS);
 
 	UnlockBufHdr(buf);
 
@@ -3584,15 +3648,19 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 static void
 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
 {
+	uint32		state;
+
 	Assert(buf == InProgressBuf);
 
-	LockBufHdr(buf);
+	state = LockBufHdr(buf);
+
+	Assert(state & BM_IO_IN_PROGRESS);
 
-	Assert(buf->flags & BM_IO_IN_PROGRESS);
-	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
-	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
-		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
-	buf->flags |= set_flag_bits;
+	pg_atomic_fetch_and_u32(&buf->state, ~(BM_IO_IN_PROGRESS | BM_IO_ERROR));
+	if (clear_dirty && !(pg_atomic_read_u32(&buf->state) & BM_JUST_DIRTIED))
+		pg_atomic_fetch_and_u32(&buf->state, ~(BM_DIRTY | BM_CHECKPOINT_NEEDED));
+
+	pg_atomic_fetch_or_u32(&buf->state, set_flag_bits);
 
 	UnlockBufHdr(buf);
 
@@ -3617,6 +3685,7 @@ AbortBufferIO(void)
 
 	if (buf)
 	{
+		uint32	state;
 		/*
 		 * Since LWLockReleaseAll has already been called, we're not holding
 		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
@@ -3625,26 +3694,24 @@ AbortBufferIO(void)
 		 */
 		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
-		LockBufHdr(buf);
-		Assert(buf->flags & BM_IO_IN_PROGRESS);
+		state = LockBufHdr(buf);
+		Assert(state & BM_IO_IN_PROGRESS);
 		if (IsForInput)
 		{
-			Assert(!(buf->flags & BM_DIRTY));
+			Assert(!(state & BM_DIRTY));
+
 			/* We'd better not think buffer is valid yet */
-			Assert(!(buf->flags & BM_VALID));
+			Assert(!(state & BM_VALID));
 			UnlockBufHdr(buf);
 		}
 		else
 		{
-			BufFlags	sv_flags;
-
-			sv_flags = buf->flags;
-			Assert(sv_flags & BM_DIRTY);
+			Assert(state & BM_DIRTY);
 			UnlockBufHdr(buf);
 			/* Issue notice if this is not the first failure... */
-			if (sv_flags & BM_IO_ERROR)
+			if (state & BM_IO_ERROR)
 			{
-				/* Buffer is pinned, so we can read tag without spinlock */
+				/* Buffer is pinned, so we can read tag without header lock */
 				char	   *path;
 
 				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
@@ -3668,7 +3735,7 @@ shared_buffer_write_error_callback(void *arg)
 {
 	BufferDesc *bufHdr = (BufferDesc *) arg;
 
-	/* Buffer is pinned, so we can read the tag without locking the spinlock */
+	/* Buffer is pinned, so we can read the tag without locking the header */
 	if (bufHdr != NULL)
 	{
 		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
@@ -3724,3 +3791,34 @@ rnode_comparator(const void *p1, const void *p2)
 	else
 		return 0;
 }
+
+uint32
+LockBufHdr(volatile BufferDesc *desc)
+{
+	uint32 state = pg_atomic_read_u32(&desc->state);
+
+	for (;;)
+	{
+		/* wait till lock is free */
+		while (state & BM_LOCKED)
+		{
+			pg_spin_delay();
+			state = pg_atomic_read_u32(&desc->state);
+
+			/* Add exponential backoff? Should seldom be contended, though. */
+		}
+
+		/* and try to get lock */
+		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+			break;
+	}
+	return state | BM_LOCKED;
+}
+
+void
+UnlockBufHdr(volatile BufferDesc *desc)
+{
+	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+
+	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 551d152..148955f 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -98,7 +98,8 @@ typedef struct BufferAccessStrategyData
 
 
 /* Prototypes for internal functions */
-static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
+static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
+											  uint32 *lockstate);
 static void AddBufferToRing(BufferAccessStrategy strategy,
 				BufferDesc *buf);
 
@@ -180,7 +181,7 @@ ClockSweepTick(void)
  *	return the buffer with the buffer header spinlock still held.
  */
 BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy)
+StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
 {
 	BufferDesc *buf;
 	int			bgwprocno;
@@ -192,7 +193,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	 */
 	if (strategy != NULL)
 	{
-		buf = GetBufferFromRing(strategy);
+		buf = GetBufferFromRing(strategy, lockstate);
 		if (buf != NULL)
 			return buf;
 	}
@@ -250,6 +251,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	{
 		while (true)
 		{
+			uint32	state;
+
 			/* Acquire the spinlock to remove element from the freelist */
 			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
@@ -279,11 +282,13 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * it before we got to it.  It's probably impossible altogether as
 			 * of 8.3, but we'd better check anyway.)
 			 */
-			LockBufHdr(buf);
-			if (buf->refcount == 0 && buf->usage_count == 0)
+			state = LockBufHdr(buf);
+			if (BUF_STATE_GET_REFCOUNT(state) == 0
+				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
 			{
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
+				*lockstate = state;
 				return buf;
 			}
 			UnlockBufHdr(buf);
@@ -295,6 +300,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	trycounter = NBuffers;
 	for (;;)
 	{
+		uint32	state;
 
 		buf = GetBufferDescriptor(ClockSweepTick());
 
@@ -302,12 +308,14 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0)
+		state = LockBufHdr(buf);
+
+		if (BUF_STATE_GET_REFCOUNT(state) == 0)
 		{
-			if (buf->usage_count > 0)
+			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
 			{
-				buf->usage_count--;
+				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
+
 				trycounter = NBuffers;
 			}
 			else
@@ -315,6 +323,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 				/* Found a usable buffer */
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
+				*lockstate = state;
 				return buf;
 			}
 		}
@@ -585,10 +594,11 @@ FreeAccessStrategy(BufferAccessStrategy strategy)
  * The bufhdr spin lock is held on the returned buffer.
  */
 static BufferDesc *
-GetBufferFromRing(BufferAccessStrategy strategy)
+GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
 {
 	BufferDesc *buf;
 	Buffer		bufnum;
+	uint32		state;
 
 	/* Advance to next ring slot */
 	if (++strategy->current >= strategy->ring_size)
@@ -616,10 +626,12 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	 * shouldn't re-use it.
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
-	LockBufHdr(buf);
-	if (buf->refcount == 0 && buf->usage_count <= 1)
+	state = LockBufHdr(buf);
+	if (BUF_STATE_GET_REFCOUNT(state) == 0
+		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
 	{
 		strategy->current_was_in_ring = true;
+		*lockstate = state;
 		return buf;
 	}
 	UnlockBufHdr(buf);
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 17640cf..edc0ada 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -108,6 +108,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	int			b;
 	int			trycounter;
 	bool		found;
+	uint32		state;
 
 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
@@ -128,16 +129,21 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
 				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
 #endif
+		state = pg_atomic_read_u32(&bufHdr->state);
+
 		/* this part is equivalent to PinBuffer for a shared buffer */
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-				bufHdr->usage_count++;
+			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
+			{
+				state += BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, state);
+			}
 		}
 		LocalRefCount[b]++;
 		ResourceOwnerRememberBuffer(CurrentResourceOwner,
 									BufferDescriptorGetBuffer(bufHdr));
-		if (bufHdr->flags & BM_VALID)
+		if (state & BM_VALID)
 			*foundPtr = TRUE;
 		else
 		{
@@ -169,9 +175,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count > 0)
+			state = pg_atomic_read_u32(&bufHdr->state);
+
+			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
 			{
-				bufHdr->usage_count--;
+				state -= BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, state);
 				trycounter = NLocBuffer;
 			}
 			else
@@ -193,7 +202,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * this buffer is not referenced but it might still be dirty. if that's
 	 * the case, write it out before reusing it!
 	 */
-	if (bufHdr->flags & BM_DIRTY)
+	if (state & BM_DIRTY)
 	{
 		SMgrRelation oreln;
 		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
@@ -211,7 +220,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 				  false);
 
 		/* Mark not-dirty now in case we error out below */
-		bufHdr->flags &= ~BM_DIRTY;
+		state &= ~BM_DIRTY;
+		pg_atomic_write_u32(&bufHdr->state, state);
 
 		pgBufferUsage.local_blks_written++;
 	}
@@ -228,7 +238,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	/*
 	 * Update the hash table: remove old entry, if any, and make new one.
 	 */
-	if (bufHdr->flags & BM_TAG_VALID)
+	if (state & BM_TAG_VALID)
 	{
 		hresult = (LocalBufferLookupEnt *)
 			hash_search(LocalBufHash, (void *) &bufHdr->tag,
@@ -237,7 +247,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			elog(ERROR, "local buffer hash table corrupted");
 		/* mark buffer invalid just in case hash insert fails */
 		CLEAR_BUFFERTAG(bufHdr->tag);
-		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
+		state &= ~(BM_VALID | BM_TAG_VALID);
+		pg_atomic_write_u32(&bufHdr->state, state);
 	}
 
 	hresult = (LocalBufferLookupEnt *)
@@ -250,9 +261,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * it's all ours now.
 	 */
 	bufHdr->tag = newTag;
-	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
-	bufHdr->flags |= BM_TAG_VALID;
-	bufHdr->usage_count = 1;
+	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+	state |= BM_TAG_VALID;
+	state &= ~BUF_USAGECOUNT_MASK;
+	state += BUF_USAGECOUNT_ONE;
+	pg_atomic_write_u32(&bufHdr->state, state);
 
 	*foundPtr = FALSE;
 	return bufHdr;
@@ -267,6 +280,7 @@ MarkLocalBufferDirty(Buffer buffer)
 {
 	int			bufid;
 	BufferDesc *bufHdr;
+	uint32		state;
 
 	Assert(BufferIsLocal(buffer));
 
@@ -280,10 +294,10 @@ MarkLocalBufferDirty(Buffer buffer)
 
 	bufHdr = GetLocalBufferDescriptor(bufid);
 
-	if (!(bufHdr->flags & BM_DIRTY))
-		pgBufferUsage.local_blks_dirtied++;
+	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
 
-	bufHdr->flags |= BM_DIRTY;
+	if (!(state & BM_DIRTY))
+		pgBufferUsage.local_blks_dirtied++;
 }
 
 /*
@@ -307,8 +321,11 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		state;
 
-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		state = pg_atomic_read_u32(&bufHdr->state);
+
+		if ((state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
 			bufHdr->tag.forkNum == forkNum &&
 			bufHdr->tag.blockNum >= firstDelBlock)
@@ -327,8 +344,9 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			state &= ~BUF_FLAG_MASK;
+			state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 	}
 }
@@ -349,8 +367,11 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		state;
+
+		state = pg_atomic_read_u32(&bufHdr->state);
 
-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		if ((state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
 		{
 			if (LocalRefCount[i] != 0)
@@ -367,8 +388,9 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			state &= ~BUF_FLAG_MASK;
+			state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 	}
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index cbc4843..e7d3f66 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -20,29 +20,43 @@
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
 #include "storage/smgr.h"
+#include "port/atomics.h"
 #include "storage/spin.h"
 #include "utils/relcache.h"
 
 
 /*
+ * State is:
+ * 10 bit flags
+ * 4 bit usage count
+ * 18 bit refcount
+ */
+#define BUF_REFCOUNT_ONE 1
+#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+#define BUF_USAGECOUNT_MASK 0x003C0000U
+#define BUF_USAGECOUNT_ONE (1U << 18)
+#define BUF_USAGECOUNT_SHIFT 18
+#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+#define BUF_FLAG_MASK 0xFFC00000U
+
+/*
  * Flags for buffer descriptors
  *
  * Note: TAG_VALID essentially means that there is a buffer hashtable
  * entry associated with the buffer's tag.
  */
-#define BM_DIRTY				(1 << 0)		/* data needs writing */
-#define BM_VALID				(1 << 1)		/* data is valid */
-#define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
-#define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
-#define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
-#define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
-#define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
-#define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
-#define BM_PERMANENT			(1 << 8)		/* permanent relation (not
+#define BM_LOCKED				(1U << 22)		/* buffer header is locked */
+#define BM_DIRTY				(1U << 23)		/* data needs writing */
+#define BM_VALID				(1U << 24)		/* data is valid */
+#define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
+#define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
+#define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
+#define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
+#define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
+#define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
+#define BM_PERMANENT			(1U << 31)		/* permanent relation (not
 												 * unlogged) */
-
-typedef bits16 BufFlags;
-
 /*
  * The maximum allowed value of usage_count represents a tradeoff between
  * accuracy and speed of the clock-sweep buffer management algorithm.  A
@@ -141,10 +155,9 @@ typedef struct buftag
 typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
-	BufFlags	flags;			/* see bit definitions above */
-	uint8		usage_count;	/* usage counter for clock sweep code */
-	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
+
+	/* state of the tag, containing flags, refcount and usagecount */
+	pg_atomic_uint32 state;
 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
 	int			buf_id;			/* buffer's index number (from 0) */
@@ -201,11 +214,11 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
 #define FREENEXT_NOT_IN_LIST	(-2)
 
 /*
- * Macros for acquiring/releasing a shared buffer header's spinlock.
- * Do not apply these to local buffers!
+ * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
+ * not apply these to local buffers! FIXUP!
  */
-#define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
-#define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
+extern uint32 LockBufHdr(volatile BufferDesc *desc);
+extern void UnlockBufHdr(volatile BufferDesc *desc);
 
 
 /* in buf_init.c */
@@ -220,7 +233,8 @@ extern BufferDesc *LocalBufferDescriptors;
  */
 
 /* freelist.c */
-extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
+extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+											  uint32 *state);
 extern void StrategyFreeBuffer(BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 BufferDesc *buf);
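
To make the packed state word above concrete, here is a minimal standalone
C sketch (not part of the patch) that exercises the accessor macros from
the buf_internals.h hunk, with plain uint32 arithmetic standing in for the
pg_atomic_* operations:

#include <stdio.h>
#include <stdint.h>

/* Constants copied from the buf_internals.h hunk above. */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
#define BM_DIRTY (1U << 23)

int
main(void)
{
	uint32_t	state = 0;

	state += BUF_REFCOUNT_ONE;		/* pin once */
	state += BUF_REFCOUNT_ONE;		/* pin a second time */
	state += BUF_USAGECOUNT_ONE;	/* bump the usage count */
	state |= BM_DIRTY;				/* set a flag bit */

	/* Each field reads back independently of the others. */
	printf("refcount=%u usagecount=%u dirty=%d\n",
		   BUF_STATE_GET_REFCOUNT(state),
		   BUF_STATE_GET_USAGECOUNT(state),
		   (state & BM_DIRTY) != 0);	/* refcount=2 usagecount=1 dirty=1 */
	return 0;
}

Since the refcount lives in the low 18 bits, pinning is a plain add of
BUF_REFCOUNT_ONE and a usage-count bump is an add of BUF_USAGECOUNT_ONE;
neither touches the flag bits as long as the refcount cannot overflow its
18 bits.
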
#43Michael Paquier
michael.paquier@gmail.com
In reply to: Dilip Kumar (#42)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Jan 19, 2016 at 7:31 PM, Dilip Kumar wrote:

Test2:
pgbench -i -s 300 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client Base Patched

1 20555 19404
32 375919 332670
64 509067 440680
128 431346 415121
256 380926 379176

Test3:
pgbench -i -s 100 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client Base Patched

1 20555 19404
32 375919 332670
64 509067 440680
128 431346 415121
256 380926 379176

It seems like you did a copy-paste of the results with s=100 and
s=300. Both are showing the exact same numbers.
--
Michael

#44Dilip Kumar
dilipbalaut@gmail.com
In reply to: Michael Paquier (#43)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Jan 19, 2016 at 5:44 PM, Michael Paquier <michael.paquier@gmail.com>
wrote:

Test3:
pgbench -i -s 100 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client Base Patched

1 20555 19404
32 375919 332670
64 509067 440680
128 431346 415121
256 380926 379176

It seems like you did a copy-paste of the results with s=100 and
s=300. Both are showing the exact same numbers.

Oops, my mistake, re-pasting the correct results for s=100

pgbench -i -s 100 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client Base Patched

1 20548 20791
32 372633 355356
64 532052 552148
128 412755 478826
256 346701 372057

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#45Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Dilip Kumar (#44)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi, Dilip!

On Tue, Jan 19, 2016 at 6:00 PM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Tue, Jan 19, 2016 at 5:44 PM, Michael Paquier <
michael.paquier@gmail.com> wrote:

Test3:
pgbench -i -s 100 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client Base Patched

1 20555 19404
32 375919 332670
64 509067 440680
128 431346 415121
256 380926 379176

It seems like you did a copy-paste of the results with s=100 and
s=300. Both are showing the exact same numbers.

Oops, my mistake, re-pasting the correct results for s=100

pgbench -i -s 100 postgres
pgbench -c$ -j$ -Mprepared -S postgres

Client Base Patched

1 20548 20791
32 372633 355356
64 532052 552148
128 412755 478826
256 346701 372057

Could you please re-run these tests a few times?
Just to be sure it's a reproducible regression with s=300 and not a
statistical error.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#46Merlin Moncure
mmoncure@gmail.com
In reply to: Alexander Korotkov (#45)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Jan 29, 2016 at 3:17 PM, Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

Hi, Dilip!

Could you please re-run these tests a few times?
Just to be sure it's a reproducible regression with s=300 and not a
statistical error.

Probably want to run for at least 5 minutes via -T 300

merlin

#47Dilip Kumar
dilipbalaut@gmail.com
In reply to: Merlin Moncure (#46)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Jan 30, 2016 at 3:05 AM, Merlin Moncure <mmoncure@gmail.com> wrote:

Probably want to run for at least 5 minutes via -T 300

Last time I ran for 5 minutes and took the median of three runs; I just
missed mentioning "-T 300" in the mail.

Looking at the results with scale factors 1000 and 100, I don't see any
reason why it would regress with scale factor 300.

So I will run the test again with scale factor 300, and this time I am
planning to run two cases:
1. when the data fits in shared buffers
2. when the data doesn't fit in shared buffers.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#48Dilip Kumar
dilipbalaut@gmail.com
In reply to: Dilip Kumar (#47)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Jan 31, 2016 at 11:44 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

Looking at the results with scale factors 1000 and 100, I don't see any
reason why it would regress with scale factor 300.

So I will run the test again with scale factor 300, and this time I am
planning to run two cases:
1. when the data fits in shared buffers
2. when the data doesn't fit in shared buffers.

I have run the test again with scale factor 300 and found no regression; in
fact, there is an improvement with the patch, like we saw with scale factor
1000.

Shared Buffer= 8GB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 19744 19382
8 125923 126395
32 313931 333351
64 387339 496830
128 306412 350610

Shared Buffer= 512MB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#49Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Dilip Kumar (#48)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Feb 1, 2016 at 7:05 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Sun, Jan 31, 2016 at 11:44 AM, Dilip Kumar <dilipbalaut@gmail.com>
wrote:

Looking at the results with scale factors 1000 and 100, I don't see any
reason why it would regress with scale factor 300.

So I will run the test again with scale factor 300, and this time I am
planning to run two cases:
1. when the data fits in shared buffers
2. when the data doesn't fit in shared buffers.

I have run the test again with scale factor 300 and found no regression; in
fact, there is an improvement with the patch, like we saw with scale factor
1000.

Shared Buffer= 8GB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 19744 19382
8 125923 126395
32 313931 333351
64 387339 496830
128 306412 350610

Shared Buffer= 512MB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

Great, thanks!

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#50Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#49)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Feb 1, 2016 at 11:34 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Great, thanks!

The attached patch is rebased and has better comments.
Also, there is one comment that survives from Andres' original version:

/* Add exponential backoff? Should seldomly be contended tho. */

Andres, did you mean we should double the delay with each unsuccessful
attempt to take the lock?
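
To illustrate the idea, a rough sketch of what that could look like in
LockBufHdr (BM_LOCKED and pg_spin_delay() as in the patch; the starting
delay and the cap are made-up numbers):

static uint32
LockBufHdr(BufferDesc *desc)
{
	uint32		delay = 1;
	uint32		i;
	uint32		state;

	for (;;)
	{
		/* Try to set BM_LOCKED; if it was clear before, we now own the lock. */
		state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
		if (!(state & BM_LOCKED))
			return state | BM_LOCKED;

		/* Contended: spin, doubling the wait after each failed attempt. */
		for (i = 0; i < delay; i++)
			pg_spin_delay();
		if (delay < 1024)
			delay *= 2;
	}
}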

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-2.patchapplication/octet-stream; name=pinunpin-cas-2.patchDownload
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index b423aa7..04862d7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 121,132 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 121,129 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 7141eb8..d372d55
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 51,57 ****
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
- 
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
--- 51,56 ----
*************** static BufferDesc *PinCountWaitBuf = NUL
*** 126,132 ****
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a spinlock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
--- 125,131 ----
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a header lock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 775,783 ****
  		 */
  		if (isLocalBuf)
  		{
! 			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 774,781 ----
  		 */
  		if (isLocalBuf)
  		{
! 			Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 789,796 ****
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
--- 787,794 ----
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 808,814 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 806,812 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 886,892 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 884,890 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 940,946 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
--- 938,944 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1002,1025 ****
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the spinlock's not yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
! 		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
! 		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
  
  		/*
--- 1000,1025 ----
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/*
! 		 * Ensure, while the header lock isn't yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * lock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
! 		/* Must copy buffer flags while we still hold the header lock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
! 		/* Pin the buffer and then release the buffer header lock */
  		PinBuffer_Locked(buf);
  
  		/*
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1204,1210 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1204,1210 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1212,1219 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1212,1219 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1234,1245 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
  	UnlockBufHdr(buf);
  
--- 1234,1252 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	pg_atomic_fetch_and_u32(&buf->state,
! 							~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 							  BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
! 							  BM_PERMANENT |
! 							  BUF_USAGECOUNT_MASK));
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID | BM_PERMANENT |
! 							   BUF_USAGECOUNT_ONE);
  	else
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID |
! 							   BUF_USAGECOUNT_ONE);
  
  	UnlockBufHdr(buf);
  
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1269,1275 ****
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header spinlock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
--- 1276,1282 ----
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header lock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
*************** InvalidateBuffer(BufferDesc *buf)
*** 1288,1296 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
! 	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
--- 1295,1304 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
! 	/* Save the original buffer tag before dropping the header lock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
*************** retry:
*** 1312,1318 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
--- 1320,1326 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
*************** retry:
*** 1331,1337 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1339,1345 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1346,1355 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
  
  	UnlockBufHdr(buf);
  
--- 1354,1362 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	pg_atomic_fetch_and_u32(&buf->state, BM_LOCKED | ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK));
  
  	UnlockBufHdr(buf);
  
*************** void
*** 1383,1388 ****
--- 1390,1396 ----
  MarkBufferDirty(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1399,1412 ****
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1407,1420 ----
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	state = LockBufHdr(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(state & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1414,1420 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
--- 1422,1428 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1456,1462 ****
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without spinlock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
--- 1464,1470 ----
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without header lock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1484,1490 ****
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra spinlock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
--- 1492,1498 ----
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra header lock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1497,1519 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1505,1544 ----
  
  	if (ref == NULL)
  	{
+ 		uint32 state;
+ 		uint32 oldstate;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				pg_spin_delay();
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1529,1537 ****
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The spinlock is released before return.
   *
!  * As this function is called with the spinlock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
--- 1554,1562 ----
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The header lock is released before return.
   *
!  * As this function is called with the header lock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1542,1548 ****
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * spinlock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
--- 1567,1573 ----
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * header lock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1556,1566 ****
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the spinlock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
--- 1581,1591 ----
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the header lock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	pg_atomic_fetch_add_u32(&buf->state, 1);
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1596,1625 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
! 
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1621,1660 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32 state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
! 		 * currently all manipulation of ->state for shared buffers is through
! 		 * atomics.
! 		 */
! 		state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				pg_atomic_fetch_and_u32(&buf->state,
+ 										~BM_PIN_COUNT_WAITER);
+ 				UnlockBufHdr(buf);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1637,1642 ****
--- 1672,1678 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_to_write;
*************** BufferSync(int flags)
*** 1677,1690 ****
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  			num_to_write++;
  		}
  
--- 1713,1727 ----
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header lock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
! 			pg_atomic_fetch_or_u32(&bufHdr->state,
! 								   BM_CHECKPOINT_NEEDED);
  			num_to_write++;
  		}
  
*************** BufferSync(int flags)
*** 1723,1729 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
--- 1760,1766 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2083,2088 ****
--- 2120,2126 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  
  	ReservePrivateRefCountEntry();
  
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2095,2104 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
--- 2133,2145 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2106,2112 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2147,2153 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2258,2263 ****
--- 2299,2305 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2275,2286 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2317,2329 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** BufferGetBlockNumber(Buffer buffer)
*** 2335,2341 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	return bufHdr->tag.blockNum;
  }
  
--- 2378,2384 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	return bufHdr->tag.blockNum;
  }
  
*************** BufferGetTag(Buffer buffer, RelFileNode 
*** 2358,2364 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
--- 2401,2407 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2426,2432 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
  	UnlockBufHdr(buf);
  
  	/*
--- 2469,2475 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	pg_atomic_fetch_and_u32(&buf->state, ~BM_JUST_DIRTIED);
  	UnlockBufHdr(buf);
  
  	/*
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2446,2452 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2489,2495 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2534,2546 ****
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header spinlock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2577,2589 ----
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header lock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2640,2646 ****
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2683,2689 ----
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2738,2744 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2781,2787 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropDatabaseBuffers(Oid dbid)
*** 2780,2786 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2823,2829 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** FlushRelationBuffers(Relation rel)
*** 2876,2882 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 2919,2926 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 2897,2903 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 2941,2947 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 2925,2931 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 2969,2976 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 2977,2983 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3022,3029 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3109,3127 ****
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3155,3174 ----
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring header lock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3132,3138 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3179,3185 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3172,3179 ****
  		}
  
  		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3219,3230 ----
  		}
  
  		LockBufHdr(bufHdr);
! 
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3193,3199 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
--- 3244,3252 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
! 
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
*************** UnlockBuffers(void)
*** 3231,3239 ****
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
  		UnlockBufHdr(buf);
  
--- 3284,3292 ----
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&buf->state, ~BM_PIN_COUNT_WAITER);
  
  		UnlockBufHdr(buf);
  
*************** LockBufferForCleanup(Buffer buffer)
*** 3328,3352 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
--- 3381,3408 ----
  
  	for (;;)
  	{
+ 		int		state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_PIN_COUNT_WAITER);
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
*************** LockBufferForCleanup(Buffer buffer)
*** 3373,3381 ****
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
--- 3429,3437 ----
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((pg_atomic_read_u32(&bufHdr->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_PIN_COUNT_WAITER);
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
*************** bool
*** 3417,3438 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3473,3498 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3440,3448 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3500,3510 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(BufferDesc *buf)
*** 3480,3496 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
! 		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3542,3558 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
! 		 * It may not be necessary to acquire the header lock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3518,3523 ****
--- 3580,3587 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3528,3536 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3592,3600 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3546,3552 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3610,3616 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3554,3560 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
  
  	UnlockBufHdr(buf);
  
--- 3618,3624 ----
  		return false;
  	}
  
! 	pg_atomic_fetch_or_u32(&buf->state, BM_IO_IN_PROGRESS);
  
  	UnlockBufHdr(buf);
  
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3584,3598 ****
  static void
  TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
  
--- 3648,3666 ----
  static void
  TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
! 
! 	pg_atomic_fetch_and_u32(&buf->state, ~(BM_IO_IN_PROGRESS | BM_IO_ERROR));
! 	if (clear_dirty && !(pg_atomic_read_u32(&buf->state) & BM_JUST_DIRTIED))
! 		pg_atomic_fetch_and_u32(&buf->state, ~(BM_DIRTY | BM_CHECKPOINT_NEEDED));
! 
! 	pg_atomic_fetch_or_u32(&buf->state, set_flag_bits);
  
  	UnlockBufHdr(buf);
  
*************** AbortBufferIO(void)
*** 3617,3622 ****
--- 3685,3691 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3625,3650 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
--- 3694,3717 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without header lock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
*************** shared_buffer_write_error_callback(void 
*** 3668,3674 ****
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the spinlock */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
--- 3735,3741 ----
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the header */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
*************** rnode_comparator(const void *p1, const v
*** 3724,3726 ****
--- 3791,3830 ----
  	else
  		return 0;
  }
+ 
+ /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	uint32 state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (state & BM_LOCKED)
+ 		{
+ 			pg_spin_delay();
+ 			state = pg_atomic_read_u32(&desc->state);
+ 
+ 			/* Add exponential backoff? Should seldomly be contended tho. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ 	return state | BM_LOCKED;
+ }
+ 
+ /*
+  * Unlock buffer header - unset BM_LOCKED in buffer state.
+  */
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+ }
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..148955f
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,289 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
  			UnlockBufHdr(buf);
--- 282,294 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  			UnlockBufHdr(buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 594,604 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,625 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
  	UnlockBufHdr(buf);
--- 626,637 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
  	UnlockBufHdr(buf);
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index cbc4843..1da0b16
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 20,48 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 20,68 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable where the following data is combined:
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such a layout allows us to perform some operations more efficiently.
+  * The definitions of the buffer state parts are below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 112,129 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 132,153 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
!  * the tag, state or wait_backend_pid fields.  In general, the buffer header
!  * lock is a spinlock which is combined with the flags, refcount and usagecount
!  * into a single atomic variable.  This layout allows us to do some operations
!  * in a single CAS without actually acquiring and releasing the spinlock; for
!  * instance, increasing or decreasing the refcount.  The buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock, not the buffer header lock.  The
!  * LWLock can take care of itself.  The buffer header lock is *not* used to
!  * control access to the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 141,152 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
--- 165,175 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 201,211 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /* in buf_init.c */
--- 224,234 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers! FIXUP!
   */
! extern uint32 LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /* in buf_init.c */
*************** extern BufferDesc *LocalBufferDescriptor
*** 220,226 ****
   */
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 243,250 ----
   */
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
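
To make the new state layout concrete, here is a minimal self-contained
sketch (not part of the patch) that packs and decodes a state word using the
definitions from the patched buf_internals.h above:

#include <stdio.h>
#include <stdint.h>

/* definitions as in the patched buf_internals.h */
#define BUF_REFCOUNT_ONE		1
#define BUF_REFCOUNT_MASK		((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK		0x003C0000U
#define BUF_USAGECOUNT_ONE		(1U << 18)
#define BUF_USAGECOUNT_SHIFT	18
#define BM_DIRTY				(1U << 23)
#define BM_VALID				(1U << 24)

#define BUF_STATE_GET_REFCOUNT(state)	((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) \
	(((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)

int
main(void)
{
	/* a valid, dirty buffer pinned by two backends with usage count 3 */
	uint32_t	state = 2 * BUF_REFCOUNT_ONE +
						3 * BUF_USAGECOUNT_ONE +
						(BM_VALID | BM_DIRTY);

	printf("refcount=%u usagecount=%u valid=%d dirty=%d\n",
		   BUF_STATE_GET_REFCOUNT(state),
		   BUF_STATE_GET_USAGECOUNT(state),
		   (state & BM_VALID) != 0,
		   (state & BM_DIRTY) != 0);
	/* prints: refcount=2 usagecount=3 valid=1 dirty=1 */
	return 0;
}

This is also why PinBuffer can bump the refcount and the usage count together
in one compare-and-swap: both live in the same 32-bit word.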
#51Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Alexander Korotkov (#50)
Re: Move PinBuffer and UnpinBuffer to atomics

Alexander Korotkov wrote:

The attached patch is rebased and has better comments.
Also, there is one comment which survives from the original version by Andres.

/* Add exponential backoff? Should seldomly be contended tho. */

Andres, did you mean we should double the delay with each unsuccessful try
to lock?

This is probably a tough patch to review; trying to break it with a low
number of shared buffers and high concurrency might be an interesting
exercise.

I know Andres is already pretty busy with the checkpoint flush patch and
I very much doubt he will be able to give this patch a lot of attention
in the short term. Moving to next CF.

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services


#52Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#50)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-02-01 13:06:57 +0300, Alexander Korotkov wrote:

On Mon, Feb 1, 2016 at 11:34 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Client Base Patch
1 19744 19382
8 125923 126395
32 313931 333351
64 387339 496830
128 306412 350610

Shared Buffer= 512MB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

So, there's a small regression on low client counts. That's worth
addressing.

The attached patch is rebased and has better comments.
Also, there is one comment which survives from the original version by Andres.

/* Add exponential backoff? Should seldomly be contended tho. */

Andres, did you mean we should double the delay with each unsuccessful try
to lock?

Spinning on a lock as fast as possible leads to rapid cacheline bouncing
without anybody making progress. See s_lock() in s_lock.c.
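
In outline, s_lock() spins a bounded number of times on the lock and then
starts sleeping with a growing delay. A simplified sketch of that idea
applied to the buffer header lock (the constants and the plain doubling are
illustrative; the real s_lock.c randomizes the delay growth and adapts
spins_per_delay):

/* simplified sketch, not the actual s_lock.c code */
static void
wait_until_header_unlocked(pg_atomic_uint32 *state)
{
	int			spins = 0;
	long		delay_us = 0;	/* don't sleep until we have spun a while */

	while (pg_atomic_read_u32(state) & BM_LOCKED)
	{
		if (++spins <= 100)			/* cf. DEFAULT_SPINS_PER_DELAY */
			pg_spin_delay();
		else
		{
			if (delay_us == 0)
				delay_us = 1000;	/* cf. MIN_DELAY_USEC */
			pg_usleep(delay_us);
			/* back off exponentially, capped (cf. MAX_DELAY_USEC) */
			delay_us = Min(delay_us * 2, 1000000L);
			spins = 0;
		}
	}
}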

Greetings,

Andres Freund


#53Andres Freund
andres@anarazel.de
In reply to: Alvaro Herrera (#51)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-02-01 22:35:06 +0100, Alvaro Herrera wrote:

I know Andres is already pretty busy with the checkpoint flush patch and
I very much doubt he will be able to give this patch a lot of attention
in the short term. Moving to next CF.

Yea, there's no realistic chance I'll be able to take care of this in
the next couple days.


#54Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#52)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Feb 2, 2016 at 12:43 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-02-01 13:06:57 +0300, Alexander Korotkov wrote:

On Mon, Feb 1, 2016 at 11:34 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Client Base Patch
1 19744 19382
8 125923 126395
32 313931 333351
64 387339 496830
128 306412 350610

Shared Buffer= 512MB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

So, there's a small regression on low client counts. That's worth
addressing.

Interesting. I'll try to reproduce it.

The attached patch is rebased and has better comments.
Also, there is one comment which survives from the original version by
Andres.

/* Add exponential backoff? Should seldomly be contended tho. */

Andres, did you mean we should double the delay with each unsuccessful try
to lock?

Spinning on a lock as fast as possible leads to rapid cacheline bouncing
without anybody making progress. See s_lock() in s_lock.c.

I didn't notice that s_lock() behaves that way. Thank you.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#55Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#54)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-02-02 13:12:50 +0300, Alexander Korotkov wrote:

On Tue, Feb 2, 2016 at 12:43 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-02-01 13:06:57 +0300, Alexander Korotkov wrote:

On Mon, Feb 1, 2016 at 11:34 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Client Base Patch
1 19744 19382
8 125923 126395
32 313931 333351
64 387339 496830
128 306412 350610

Shared Buffer= 512MB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

So, there's a small regression on low client counts. That's worth
addressing.

Interesting. I'll try to reproduce it.

Any progress here?

Regards,

Andres


#56Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#55)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Feb 27, 2016 at 5:14 AM, Andres Freund <andres@anarazel.de> wrote:

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

So, there's a small regression on low client counts. That's worth
addressing.

Interesting. I'll try to reproduce it.

Any progress here?

On a multi-socket machine with 8 sockets and 64 cores, I have seen more
regression compared to my previous run on a 2-socket POWER8. For now I have
tested a read-only workload with 5-minute runs; when I get time, I will run
for longer and confirm again.

Shared Buffer= 8GB
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres
client base patch
1 7057 5230
2 10043 9573
4 20140 18188

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#57Amit Kapila
amit.kapila16@gmail.com
In reply to: Dilip Kumar (#56)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Feb 28, 2016 at 9:05 PM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Sat, Feb 27, 2016 at 5:14 AM, Andres Freund <andres@anarazel.de> wrote:

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

So, there's a small regression on low client counts. That's worth
addressing.

Interesting. I'll try to reproduce it.

Any progress here?

On a multi-socket machine with 8 sockets and 64 cores, I have seen more
regression compared to my previous run on a 2-socket POWER8. For now I have
tested a read-only workload with 5-minute runs; when I get time, I will run
for longer and confirm again.

Have you tried reverting commits 6150a1b0 and ac1d794, which I think
affect read-only performance and sometimes create variation in TPS across
different runs? Here the second might have less impact, but the first one
could affect performance. Is it possible for you to get perf data with and
without the patch and share it with others?
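
For example, standard perf usage during a pgbench run would be something
like the following, run once with and once without the patch applied, so
the hottest call paths can be compared:

perf record -a -g -- sleep 60
perf report -g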

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#58Dilip Kumar
dilipbalaut@gmail.com
In reply to: Amit Kapila (#57)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Feb 29, 2016 at 8:26 AM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

Have you tried reverting commits 6150a1b0 and ac1d794, which I think
affect read-only performance and sometimes create variation in TPS across
different runs? Here the second might have less impact, but the first one
could affect performance. Is it possible for you to get perf data with and
without the patch and share it with others?

I only reverted commit ac1d794 in my test. In my next run I will also
revert 6150a1b0 and test.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#59Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#55)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Feb 27, 2016 at 2:44 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-02-02 13:12:50 +0300, Alexander Korotkov wrote:

On Tue, Feb 2, 2016 at 12:43 AM, Andres Freund <andres@anarazel.de>

wrote:

On 2016-02-01 13:06:57 +0300, Alexander Korotkov wrote:

On Mon, Feb 1, 2016 at 11:34 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Client Base Patch
1 19744 19382
8 125923 126395
32 313931 333351
64 387339 496830
128 306412 350610

Shared Buffer= 512MB
max_connections=150
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

So, there's a small regression on low client counts. That's worth
addressing.

Interesting. I'll try to reproduce it.

Any progress here?

I didn't reproduce the regression. I had access to a multicore machine but
saw neither the regression on low client counts nor the improvements on high
client counts. In the attached patch, the spinlock delay was exposed in
s_lock.h and used in LockBufHdr().
Dilip, could you try this version of the patch? Could you also run perf or
another profiler in case of a regression? It would be nice to compare
profiles with and without the patch. We could probably find the cause of
the regression.
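
The shape this takes is roughly the following (a sketch modeled on what the
buffer header lock eventually became; the exact helper names in this
revision of the patch may differ):

uint32
LockBufHdr(BufferDesc *desc)
{
	SpinDelayStatus delayStatus;
	uint32		old_buf_state;

	init_local_spin_delay(&delayStatus);

	while (true)
	{
		/* try to set BM_LOCKED; fetch_or reports the previous state */
		old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
		/* if it wasn't set before, we now hold the header lock */
		if (!(old_buf_state & BM_LOCKED))
			break;
		/* spin at first, then sleep with increasing delay, as s_lock() does */
		perform_spin_delay(&delayStatus);
	}
	finish_spin_delay(&delayStatus);
	return old_buf_state | BM_LOCKED;
}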

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-3.patchapplication/octet-stream; name=pinunpin-cas-3.patchDownload
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..35b5ee9
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,175 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
--- 161,176 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
new file mode 100644
index ac709d1..5ca681c
*** a/src/backend/libpq/be-secure.c
--- b/src/backend/libpq/be-secure.c
***************
*** 35,41 ****
  #include "miscadmin.h"
  #include "tcop/tcopprot.h"
  #include "utils/memutils.h"
- #include "storage/ipc.h"
  #include "storage/proc.h"
  
  
--- 35,40 ----
*************** retry:
*** 145,175 ****
  		Assert(waitfor);
  
  		w = WaitLatchOrSocket(MyLatch,
! 							  WL_LATCH_SET | WL_POSTMASTER_DEATH | waitfor,
  							  port->sock, 0);
  
- 		/*
- 		 * If the postmaster has died, it's not safe to continue running,
- 		 * because it is the postmaster's job to kill us if some other backend
- 		 * exists uncleanly.  Moreover, we won't run very well in this state;
- 		 * helper processes like walwriter and the bgwriter will exit, so
- 		 * performance may be poor.  Finally, if we don't exit, pg_ctl will
- 		 * be unable to restart the postmaster without manual intervention,
- 		 * so no new connections can be accepted.  Exiting clears the deck
- 		 * for a postmaster restart.
- 		 *
- 		 * (Note that we only make this check when we would otherwise sleep
- 		 * on our latch.  We might still continue running for a while if the
- 		 * postmaster is killed in mid-query, or even through multiple queries
- 		 * if we never have to wait for read.  We don't want to burn too many
- 		 * cycles checking for this very rare condition, and this should cause
- 		 * us to exit quickly in most cases.)
- 		 */
- 		if (w & WL_POSTMASTER_DEATH)
- 			ereport(FATAL,
- 					(errcode(ERRCODE_ADMIN_SHUTDOWN),
- 					errmsg("terminating connection due to unexpected postmaster exit")));
- 
  		/* Handle interrupt. */
  		if (w & WL_LATCH_SET)
  		{
--- 144,152 ----
  		Assert(waitfor);
  
  		w = WaitLatchOrSocket(MyLatch,
! 							  WL_LATCH_SET | waitfor,
  							  port->sock, 0);
  
  		/* Handle interrupt. */
  		if (w & WL_LATCH_SET)
  		{
*************** retry:
*** 246,260 ****
  		Assert(waitfor);
  
  		w = WaitLatchOrSocket(MyLatch,
! 							  WL_LATCH_SET | WL_POSTMASTER_DEATH | waitfor,
  							  port->sock, 0);
  
- 		/* See comments in secure_read. */
- 		if (w & WL_POSTMASTER_DEATH)
- 			ereport(FATAL,
- 					(errcode(ERRCODE_ADMIN_SHUTDOWN),
- 					errmsg("terminating connection due to unexpected postmaster exit")));
- 
  		/* Handle interrupt. */
  		if (w & WL_LATCH_SET)
  		{
--- 223,231 ----
  		Assert(waitfor);
  
  		w = WaitLatchOrSocket(MyLatch,
! 							  WL_LATCH_SET | waitfor,
  							  port->sock, 0);
  
  		/* Handle interrupt. */
  		if (w & WL_LATCH_SET)
  		{
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index f013a4d..833dc5a
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 121,132 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 121,129 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 68cf5cc..1edcd47
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 51,57 ****
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
- 
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
--- 51,56 ----
*************** static BufferDesc *PinCountWaitBuf = NUL
*** 126,132 ****
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a spinlock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
--- 125,131 ----
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a header lock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 775,783 ****
  		 */
  		if (isLocalBuf)
  		{
! 			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 774,781 ----
  		 */
  		if (isLocalBuf)
  		{
! 			Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 789,796 ****
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
--- 787,794 ----
  			do
  			{
  				LockBufHdr(bufHdr);
! 				Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 808,814 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 806,812 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 886,892 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 884,890 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 940,946 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
--- 938,944 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1002,1025 ****
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the spinlock's not yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
! 		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
! 		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
  
  		/*
--- 1000,1025 ----
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/*
! 		 * Ensure, while the header lock isn't yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * lock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
! 		/* Must copy buffer flags while we still hold the header lock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
! 		/* Pin the buffer and then release the buffer header lock */
  		PinBuffer_Locked(buf);
  
  		/*
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1204,1210 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1204,1210 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1212,1219 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1212,1219 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1234,1245 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
  	UnlockBufHdr(buf);
  
--- 1234,1252 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	pg_atomic_fetch_and_u32(&buf->state,
! 							~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 							  BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
! 							  BM_PERMANENT |
! 							  BUF_USAGECOUNT_MASK));
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID | BM_PERMANENT |
! 							   BUF_USAGECOUNT_ONE);
  	else
! 		pg_atomic_fetch_or_u32(&buf->state,
! 							   BM_TAG_VALID |
! 							   BUF_USAGECOUNT_ONE);
  
  	UnlockBufHdr(buf);
  
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1269,1275 ****
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header spinlock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
--- 1276,1282 ----
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header lock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
*************** InvalidateBuffer(BufferDesc *buf)
*** 1288,1296 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
! 	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
--- 1295,1304 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
! 	/* Save the original buffer tag before dropping the header lock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
*************** retry:
*** 1312,1318 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
--- 1320,1326 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
*************** retry:
*** 1331,1337 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1339,1345 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1346,1355 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
  
  	UnlockBufHdr(buf);
  
--- 1354,1362 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	pg_atomic_fetch_and_u32(&buf->state, BM_LOCKED | ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK));
  
  	UnlockBufHdr(buf);
  
*************** void
*** 1383,1388 ****
--- 1390,1396 ----
  MarkBufferDirty(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1399,1412 ****
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1407,1420 ----
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	state = LockBufHdr(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(state & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1414,1420 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
--- 1422,1428 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
  
  	UnlockBufHdr(bufHdr);
  }
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1456,1462 ****
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without spinlock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
--- 1464,1470 ----
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without header lock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1484,1490 ****
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra spinlock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
--- 1492,1498 ----
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra header lock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1497,1519 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1505,1544 ----
  
  	if (ref == NULL)
  	{
+ 		uint32 state;
+ 		uint32 oldstate;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				pg_spin_delay();
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
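The loop above is the heart of the patch: pinning becomes a lock-free read-modify-write on the combined state word. For readers who want the pattern in isolation, here is a minimal standalone sketch using C11 stdatomic in place of the pg_atomic_* wrappers (names and constants here are illustrative, not part of the patch):

    #include <stdatomic.h>
    #include <stdint.h>

    #define LOCKED_BIT   (1U << 22)     /* stand-in for BM_LOCKED */
    #define REFCOUNT_ONE 1U

    /* Pin: retry the CAS until the word is unlocked and unchanged. */
    static uint32_t
    pin(_Atomic uint32_t *state_word)
    {
        uint32_t old = atomic_load(state_word);

        for (;;)
        {
            uint32_t new_state;

            while (old & LOCKED_BIT)        /* spin while header locked */
                old = atomic_load(state_word);

            new_state = old + REFCOUNT_ONE; /* bump refcount in a copy */
            if (atomic_compare_exchange_weak(state_word, &old, new_state))
                return new_state;           /* CAS published our pin */
            /* CAS failed: 'old' now holds the current value; retry */
        }
    }

The CAS can only succeed if the word was unchanged since we read it, so a concurrent header-lock holder or another pinner simply forces one more trip around the loop.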
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1529,1537 ****
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The spinlock is released before return.
   *
!  * As this function is called with the spinlock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
--- 1554,1562 ----
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The header lock is released before return.
   *
!  * As this function is called with the header lock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1542,1548 ****
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * spinlock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
--- 1567,1573 ----
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * header lock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1556,1566 ****
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the spinlock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
--- 1581,1591 ----
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the header lock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	pg_atomic_fetch_add_u32(&buf->state, BUF_REFCOUNT_ONE);
  	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1596,1625 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
! 
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1621,1660 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32 state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
! 		 * currently all manipulation of ->state for shared buffers is through
! 		 * atomics.
! 		 */
! 		state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if ((state & BM_PIN_COUNT_WAITER) && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				pg_atomic_fetch_and_u32(&buf->state,
+ 										~BM_PIN_COUNT_WAITER);
+ 				UnlockBufHdr(buf);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
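Note that the fast path above needs no CAS loop at all: pg_atomic_fetch_sub_u32() both decrements the shared refcount and hands back the pre-decrement value, which is all the information needed to decide whether the slow waiter-signalling path must run. A standalone sketch of that idiom (C11 atomics; names are illustrative, not part of the patch):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define REFCOUNT_ONE 1U
    #define WAITER_BIT   (1U << 29)  /* stand-in for BM_PIN_COUNT_WAITER */

    /* Returns true if the caller should take the slow path and recheck,
     * under the header lock, whether a waiter must be signalled. */
    static bool
    unpin(_Atomic uint32_t *state_word)
    {
        /* fetch_sub returns the value *before* the subtraction */
        uint32_t old = atomic_fetch_sub(state_word, REFCOUNT_ONE);

        return (old & WAITER_BIT) != 0;
    }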
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1637,1642 ****
--- 1672,1678 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_to_write;
*************** BufferSync(int flags)
*** 1677,1690 ****
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  			num_to_write++;
  		}
  
--- 1713,1727 ----
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header lock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
! 			pg_atomic_fetch_or_u32(&bufHdr->state,
! 								   BM_CHECKPOINT_NEEDED);
  			num_to_write++;
  		}
  
*************** BufferSync(int flags)
*** 1723,1729 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
--- 1760,1766 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2083,2088 ****
--- 2120,2126 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  
  	ReservePrivateRefCountEntry();
  
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2095,2104 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
--- 2133,2145 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2106,2112 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2147,2153 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2258,2263 ****
--- 2299,2305 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2275,2286 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2317,2329 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** BufferGetBlockNumber(Buffer buffer)
*** 2335,2341 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	return bufHdr->tag.blockNum;
  }
  
--- 2378,2384 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	return bufHdr->tag.blockNum;
  }
  
*************** BufferGetTag(Buffer buffer, RelFileNode 
*** 2358,2364 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
--- 2401,2407 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2426,2432 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
  	UnlockBufHdr(buf);
  
  	/*
--- 2469,2475 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	pg_atomic_fetch_and_u32(&buf->state, ~BM_JUST_DIRTIED);
  	UnlockBufHdr(buf);
  
  	/*
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2446,2452 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2489,2495 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2534,2546 ****
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header spinlock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2577,2589 ----
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header lock.  Even if someone else
! 	 * changes the buffer header state while we're doing this, the state is
! 	 * changed atomically, so we'll read the old value or the new value, but
! 	 * not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2640,2646 ****
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2683,2689 ----
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2738,2744 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2781,2787 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropDatabaseBuffers(Oid dbid)
*** 2780,2786 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2823,2829 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** FlushRelationBuffers(Relation rel)
*** 2876,2882 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 2919,2926 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 2897,2903 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 2941,2947 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 2925,2931 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 2969,2976 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 2977,2983 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3022,3029 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 			== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3109,3127 ****
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3155,3174 ----
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring the header lock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
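The unlocked pre-test in MarkBufferDirtyHint() is a classic double-checked pattern: read the word without any lock, bail out cheaply when the bits are already set, and recheck under the header lock before actually changing anything. A minimal sketch of that shape (C11 atomics; the constant is a stand-in, and the real function also handles WAL-logging of hints):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define DIRTY_BITS 0x3U    /* stand-in for BM_DIRTY | BM_JUST_DIRTIED */

    static bool
    might_need_dirtying(_Atomic uint32_t *state_word)
    {
        /* unlocked fast path: both bits set means nothing to do */
        if ((atomic_load(state_word) & DIRTY_BITS) == DIRTY_BITS)
            return false;

        /* This answer can be stale, so the caller must recheck under
         * the header lock before setting the bits. */
        return true;
    }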
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3132,3138 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3179,3185 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3172,3179 ****
  		}
  
  		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3219,3230 ----
  		}
  
  		LockBufHdr(bufHdr);
! 
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3193,3199 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
--- 3244,3252 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY | BM_JUST_DIRTIED);
! 
  		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
*************** UnlockBuffers(void)
*** 3231,3239 ****
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
  		UnlockBufHdr(buf);
  
--- 3284,3292 ----
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((pg_atomic_read_u32(&buf->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&buf->state, ~BM_PIN_COUNT_WAITER);
  
  		UnlockBufHdr(buf);
  
*************** LockBufferForCleanup(Buffer buffer)
*** 3328,3352 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
--- 3381,3408 ----
  
  	for (;;)
  	{
+ 		uint32		state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_PIN_COUNT_WAITER);
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
*************** LockBufferForCleanup(Buffer buffer)
*** 3373,3381 ****
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
--- 3429,3437 ----
  		 * better be safe.
  		 */
  		LockBufHdr(bufHdr);
! 		if ((pg_atomic_read_u32(&bufHdr->state) & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_PIN_COUNT_WAITER);
  		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
*************** bool
*** 3417,3438 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3473,3498 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount > 0);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3440,3448 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3500,3510 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(BufferDesc *buf)
*** 3480,3496 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
! 		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3542,3558 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
! 		 * It may not be necessary to acquire the header lock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3518,3523 ****
--- 3580,3587 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3528,3536 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3592,3600 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3546,3552 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3610,3616 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3554,3560 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
  
  	UnlockBufHdr(buf);
  
--- 3618,3624 ----
  		return false;
  	}
  
! 	pg_atomic_fetch_or_u32(&buf->state, BM_IO_IN_PROGRESS);
  
  	UnlockBufHdr(buf);
  
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3584,3598 ****
  static void
  TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
  
--- 3648,3666 ----
  static void
  TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
! 
! 	pg_atomic_fetch_and_u32(&buf->state, ~(BM_IO_IN_PROGRESS | BM_IO_ERROR));
! 	if (clear_dirty && !(pg_atomic_read_u32(&buf->state) & BM_JUST_DIRTIED))
! 		pg_atomic_fetch_and_u32(&buf->state, ~(BM_DIRTY | BM_CHECKPOINT_NEEDED));
! 
! 	pg_atomic_fetch_or_u32(&buf->state, set_flag_bits);
  
  	UnlockBufHdr(buf);
  
*************** AbortBufferIO(void)
*** 3617,3622 ****
--- 3685,3691 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3625,3650 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
--- 3694,3717 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without header lock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
*************** shared_buffer_write_error_callback(void 
*** 3668,3674 ****
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the spinlock */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
--- 3735,3741 ----
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the header */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
*************** rnode_comparator(const void *p1, const v
*** 3724,3726 ****
--- 3791,3834 ----
  	else
  		return 0;
  }
+ 
+ /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	SpinDelayStatus delayStatus;
+ 	uint32		state;
+ 
+ 	init_spin_delay(&delayStatus, (Pointer) desc, __FILE__, __LINE__);
+ 
+ 	state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (state & BM_LOCKED)
+ 		{
+ 			make_spin_delay(&delayStatus);
+ 			state = pg_atomic_read_u32(&desc->state);
+ 			/* Add exponential backoff? Should seldom be contended, though. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ 	finish_spin_delay(&delayStatus);
+ 	return state | BM_LOCKED;
+ }
+ 
+ /*
+  * Unlock buffer header - unset BM_LOCKED in buffer state.
+  */
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+ }
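LockBufHdr() returns the snapshot it just locked (the CAS succeeded against the unlocked value, so the locked value is simply state | BM_LOCKED), and UnlockBufHdr() can release with a plain subtract because the bit is asserted to be set. A standalone sketch of the acquire/release pair, again with C11 atomics standing in for pg_atomic_* (illustrative only; the real code sleeps via the SpinDelayStatus machinery instead of busy-looping):

    #include <stdatomic.h>
    #include <stdint.h>

    #define LOCKED_BIT (1U << 22)       /* stand-in for BM_LOCKED */

    static uint32_t
    lock_header(_Atomic uint32_t *state_word)
    {
        uint32_t old = atomic_load(state_word);

        for (;;)
        {
            while (old & LOCKED_BIT)    /* wait until the bit is clear */
                old = atomic_load(state_word);

            if (atomic_compare_exchange_weak(state_word, &old,
                                             old | LOCKED_BIT))
                return old | LOCKED_BIT; /* locked snapshot for the caller */
        }
    }

    static void
    unlock_header(_Atomic uint32_t *state_word)
    {
        /* the bit is known to be set, so subtracting it just clears it */
        atomic_fetch_sub(state_word, LOCKED_BIT);
    }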
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..148955f
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
!  *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
!  *	return the buffer with the buffer header lock still held and the
!  *	locked state stored in *lockstate.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,289 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
  			UnlockBufHdr(buf);
--- 282,294 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  			UnlockBufHdr(buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				pg_atomic_fetch_sub_u32(&buf->state, BUF_USAGECOUNT_ONE);
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
!  * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 594,604 ----
!  * The buffer header lock is held on the returned buffer, and its locked
!  * state is returned in *lockstate.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,625 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
  	UnlockBufHdr(buf);
--- 626,637 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
  	UnlockBufHdr(buf);
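One subtlety in the clock sweep and ring code above: subtracting BUF_USAGECOUNT_ONE is a field-wise decrement only while the usage-count field is nonzero; on a zero field the subtraction would borrow into the bits above it. That is why the patch checks the count under the header lock before decrementing. A standalone illustration of the hazard (constants mirror the patch's layout, but the program is illustrative only):

    #include <assert.h>
    #include <stdint.h>

    #define USAGECOUNT_ONE  (1U << 18)
    #define USAGECOUNT_MASK 0x003C0000U
    #define FLAG_MASK       0xFFC00000U

    int
    main(void)
    {
        uint32_t ok  = (2U << 18) | 7;  /* usage_count 2, refcount 7 */
        uint32_t bad = 7;               /* usage_count 0, refcount 7 */

        ok -= USAGECOUNT_ONE;           /* safe: field was nonzero */
        assert(((ok & USAGECOUNT_MASK) >> 18) == 1);

        bad -= USAGECOUNT_ONE;          /* underflow borrows upward... */
        assert((bad & FLAG_MASK) == FLAG_MASK); /* ...trashing the flags */
        return 0;
    }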
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
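In localbuf.c the pattern is deliberately weaker: local buffer headers are only ever touched by the owning backend, so the patch reads the state once, edits the copy, and writes it back with a plain atomic store rather than a CAS. A sketch of that single-writer shortcut (illustrative names; safe only because no other process can race on a backend-local buffer):

    #include <stdatomic.h>
    #include <stdint.h>

    #define USAGECOUNT_ONE (1U << 18)

    /* Single-writer update: fine for backend-local buffers; a shared
     * buffer would need the CAS loop shown earlier. */
    static void
    bump_local_usagecount(_Atomic uint32_t *state_word)
    {
        uint32_t state = atomic_load(state_word);

        state += USAGECOUNT_ONE;
        atomic_store(state_word, state);
    }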
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index cc0bf5e..a2edf00
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
! 	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
  
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,150 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line)
! {
! 	status->spins = 0;
! 	status->delays = 0;
! 	status->cur_delay = 0;
! 	status->ptr = ptr;
! 	status->file = file;
! 	status->line = line;
! }
  
! void
! make_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
! 	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 155,180 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus;
! 
! 	init_spin_delay(&delayStatus, (Pointer)lock, file, line);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		make_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index cbc4843..1da0b16
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 20,48 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 20,68 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable where following data is combined.
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such layout allows us to perform some operation more efficiently.
+  * The definition of buffer state parts is below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 112,129 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 132,153 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
!  * the tag, state or wait_backend_pid fields.  In general, buffer header lock
!  * is a spinlock which is combined with flags, refcount and usagecount into
!  * single atomic variable.  This layout allow us to do some operations in single
!  * CAS without actually acquiring and releasing a spinlock; for instance,
!  * increase or decrease refcount.  buf_id field never changes after
!  * initialization, so does not need locking.  freeNext is protected by the
!  * buffer_strategy_lock not buffer header lock.  The LWLock can take care of
!  * itself.  The buffer header lock is *not* used to control access to the data
!  * in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 141,152 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
--- 165,175 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 201,211 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /* in buf_init.c */
--- 224,234 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers! FIXUP!
   */
! extern uint32 LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /* in buf_init.c */
*************** extern BufferDesc *LocalBufferDescriptor
*** 220,226 ****
   */
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 243,250 ----
   */
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 8b240cd..cc6c195
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1012 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay which could be useful in other places where
+  * spinlock-like procedures take place.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ void init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line);
+ void make_spin_delay(SpinDelayStatus *status);
+ void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
#60Dilip Kumar
dilipbalaut@gmail.com
In reply to: Alexander Korotkov (#59)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Feb 29, 2016 at 5:18 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

I didn't reproduce the regression. I had access to a multicore machine but
didn't see either a regression at low client counts or improvements at high
client counts.
In the attached patch the spinlock delay was exposed in s_lock.h and used
in LockBufHdr().
Dilip, could you try this version of the patch? Could you also run perf or
another profiler in the case of a regression? It would be nice to compare
profiles with and without the patch. We could probably find the cause of
the regression.

OK, I will test it sometime this week.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#61Dilip Kumar
dilipbalaut@gmail.com
In reply to: Dilip Kumar (#60)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Mar 1, 2016 at 10:19 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

OK, I will test it sometime this week.

I have tested this patch on my laptop, and there I did not see any
regression at 1 client.

Shared buffer 10GB, 5 mins run with pgbench, read-only test

base patch
run1 22187 24334
run2 26288 27440
run3 26306 27411

Maybe in a day or 2 I will test it on the same machine where I reported
the regression, and if the regression is there I will check the instructions
using Callgrind.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#62Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#61)
Re: Move PinBuffer and UnpinBuffer to atomics

On March 1, 2016 8:41:33 PM PST, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Tue, Mar 1, 2016 at 10:19 AM, Dilip Kumar <dilipbalaut@gmail.com>
wrote:

OK, I will test it sometime this week.

I have tested this patch on my laptop, and there I did not see any
regression at 1 client.

Shared buffer 10GB, 5 mins run with pgbench, read-only test

base patch
run1 22187 24334
run2 26288 27440
run3 26306 27411

Maybe in a day or 2 I will test it on the same machine where I reported
the regression, and if the regression is there I will check the instructions
using Callgrind.

Sounds like a ppc vs. x86 issue. The regression was on the former, right?

Andres
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.


#63Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#62)
Re: Move PinBuffer and UnpinBuffer to atomics

On Wed, Mar 2, 2016 at 10:39 AM, Andres Freund <andres@anarazel.de> wrote:

Sounds like a ppc vs. x86 issue. The regression was on the former, right?

Well, of the regressions I reported the last two times, one was on X86
and the other was on PPC.

Copied from older threads
--------------------------------------
On PPC

./pgbench -j$ -c$ -T300 -M prepared -S postgres

Client Base Patch
1 17169 16454
8 108547 105559
32 241619 262818
64 206868 233606
128 137084 217013

On X86

Shared Buffer= 8GB
Scale Factor=300

./pgbench -j$ -c$ -T300 -M prepared -S postgres
client base patch
1 7057 5230
2 10043 9573
4 20140 18188

And this latest result (no regression) is on X86 but on my local machine.

I did not see exactly what this new version of the patch is doing differently,
so I will test this version on other machines also and see the results.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#64Dilip Kumar
dilipbalaut@gmail.com
In reply to: Dilip Kumar (#63)
Re: Move PinBuffer and UnpinBuffer to atomics

On Wed, Mar 2, 2016 at 11:05 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

And this latest result (no regression) is on X86 but on my local machine.

I did not see exactly what this new version of the patch is doing differently,
so I will test this version on other machines also and see the results.

I tested this on PPC again, this time in various orders (sometimes patch
first and then base first).
I tested with the latest patch *pinunpin-cas-2.patch* on Power8.

Shared Buffer = 8GB
./pgbench -j$ -c$ -T300 -M prepared -S postgres

BASE
-----
Clients run1 run2 run3
1 21200 18754 20537
2 40331 39520 38746

Patch
-----
Clients run1 run2 run3
1 20225 19806 19778
2 39830 41898 36620

I think we cannot see any regression here. (If I take the median then it
may look low with the patch, so I'm posting all 3 readings.)

Note: I reverted only commit ac1d794 in my test.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#65Robert Haas
robertmhaas@gmail.com
In reply to: Dilip Kumar (#64)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Mar 5, 2016 at 7:22 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Wed, Mar 2, 2016 at 11:05 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

And this latest result (no regression) is on X86 but on my local machine.

I did not see exactly what this new version of the patch is doing differently,
so I will test this version on other machines also and see the results.

I tested this on PPC again, this time in various orders (sometimes patch
first and then base first).
I tested with the latest patch pinunpin-cas-2.patch on Power8.

Shared Buffer = 8GB
./pgbench -j$ -c$ -T300 -M prepared -S postgres

BASE
-----
Clients run1 run2 run3
1 21200 18754 20537
2 40331 39520 38746

Patch
-----
Clients run1 run2 run3
1 20225 19806 19778
2 39830 41898 36620

I think we cannot see any regression here. (If I take the median then it
may look low with the patch, so I'm posting all 3 readings.)

If the median looks low, how is that not a regression?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#66Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Robert Haas (#65)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Mar 7, 2016 at 6:19 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Sat, Mar 5, 2016 at 7:22 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Wed, Mar 2, 2016 at 11:05 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

And this latest result (no regression) is on X86 but on my local machine.

I did not see exactly what this new version of the patch is doing differently,
so I will test this version on other machines also and see the results.

I tested this on PPC again, this time in various orders (sometimes patch
first and then base first).
I tested with the latest patch pinunpin-cas-2.patch on Power8.

Shared Buffer = 8GB
./pgbench -j$ -c$ -T300 -M prepared -S postgres

BASE
-----
Clients run1 run2 run3
1 21200 18754 20537
2 40331 39520 38746

Patch
-----
Clients run1 run2 run3
1 20225 19806 19778
2 39830 41898 36620

I think we cannot see any regression here. (If I take the median then it
may look low with the patch, so I'm posting all 3 readings.)

If the median looks low, how is that not a regression?

I don't think we can rely on the median that much if we have only 3 runs.
For 3 runs we can only apply the Kornfeld method, which claims that the
confidence interval should lie between the lowest and highest values.
Since the confidence intervals for the master and patched versions overlap,
we can't conclude that the expected TPS numbers are different.
Dilip, could you do more runs? 10, for example. With such statistics we
would be able to conclude something.
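
(To illustrate with the 1-client Power8 numbers above: the base runs span
[18754, 21200] while the patched runs span [19778, 20225]; the second
interval lies entirely inside the first, so by this criterion the two sets
are statistically indistinguishable.)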

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#67Dilip Kumar
dilipbalaut@gmail.com
In reply to: Alexander Korotkov (#66)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Mar 10, 2016 at 8:26 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

I don't think we can rely on the median that much if we have only 3 runs.
For 3 runs we can only apply the Kornfeld method, which claims that the
confidence interval should lie between the lowest and highest values.
Since the confidence intervals for the master and patched versions overlap,
we can't conclude that the expected TPS numbers are different.
Dilip, could you do more runs? 10, for example. With such statistics we
would be able to conclude something.

Here are the readings for 10 runs....

Median Result
-----------------------

Client Base Patch
-------------------------------------------
1 19873 19739
2 38658 38276
4 68812 62075

Full Results of 10 runs...

Base
-------------
Runs 1 Client 2 Client 4 Client
-----------------------------------------------------
1 19442 34866 49023
2 19605 35139 51695
3 19726 37104 53899
4 19835 38439 55708
5 19866 38638 67473
6 19880 38679 70152
7 19973 38720 70920
8 20048 38737 71342
9 20057 38815 71403
10 20344 41423 77953
-----------------------------------------------------

Patch
-----------
Runs 1 Client 2 Client 4 Client
------------------------------------------------------
1 18881 30161 54928
2 19415 32031 59741
3 19564 35022 61060
4 19627 36712 61839
5 19670 37659 62011
6 19808 38894 62139
7 19857 39081 62983
8 19910 39923 75358
9 20169 39937 77481
10 20181 40003 78462
------------------------------------------------------

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#68Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Dilip Kumar (#67)
3 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Mar 11, 2016 at 7:08 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Thu, Mar 10, 2016 at 8:26 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

I don't think we can rely on the median that much if we have only 3 runs.
For 3 runs we can only apply the Kornfeld method, which claims that the
confidence interval should lie between the lowest and highest values.
Since the confidence intervals for the master and patched versions overlap,
we can't conclude that the expected TPS numbers are different.
Dilip, could you do more runs? 10, for example. With such statistics we
would be able to conclude something.

Here are the readings for 10 runs....

Median Result
-----------------------

Client Base Patch
-------------------------------------------
1 19873 19739
2 38658 38276
4 68812 62075

[full results of 10 runs snipped; see #67 above]

I've drawn graphs for these measurements. The variation doesn't look
random here: TPS goes higher from measurement to measurement. I bet you
ran the measurements sequentially.
I think we should do more measurements until TPS stops growing, and beyond
that, to accumulate average statistics. Could you please do the same tests
but with 50 runs?

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

clients_1.pngimage/png; name=clients_1.pngDownload
clients_2.pngimage/png; name=clients_2.pngDownload
clients_3.pngimage/png; name=clients_3.pngDownload
#69Dilip Kumar
dilipbalaut@gmail.com
In reply to: Alexander Korotkov (#68)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Mar 14, 2016 at 3:09 AM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

I've drawn graphs for these measurements. The variation doesn't look
random here: TPS goes higher from measurement to measurement. I bet you
ran the measurements sequentially.
I think we should do more measurements until TPS stops growing, and beyond
that, to accumulate average statistics. Could you please do the same tests
but with 50 runs?

I have taken readings at different client counts (1, 2 and 4).

1. Avg: the average is taken after discarding the lowest 5 and highest 5 readings.
2. With 4 clients I have only 30 readings.

Summary:

I think if we check the avg or the median, at least at 1 or 2 clients head
always looks like the winner, but the same is not the case with 4 clients.
And with 4 clients I can see much more fluctuation in the readings.

         Head(1)  Patch(1)  Head(2)  Patch(2)  Head(4)  Patch(4)
Avg:     19628    19578     37180    36536     70044    70731
Median:  19663    19581     37967    37484     73003    75376
(Head(N) / Patch(N) = TPS at N clients)
Below are all the readings (arranged in sorted order).

Runs  Head(1)  Patch(1)  Head(2)  Patch(2)  Head(4)  Patch(4)
1     18191    18153     29454    26128     49931    47210
2     18365    18768     31218    26956     53358    47287
3     19053    18769     31808    29286     53575    55458
4     19128    18915     31860    30558     54282    55794
5     19160    18948     32411    30945     56629    56253
6     19177    19055     32453    31316     57595    58147
7     19351    19232     32571    31703     59490    58543
8     19353    19239     33294    32029     63887    58990
9     19361    19255     33936    32217     64718    60233
10    19390    19297     34361    32767     65737    68210
11    19452    19368     34563    34487     65881    71409
12    19478    19387     36110    34907     67151    72108
13    19488    19388     36221    34936     70974    73977
14    19490    19395     36461    35068     72212    74891
15    19512    19406     36712    35298     73003    75376
16    19538    19450     37104    35492     74842    75468
17    19547    19487     37246    36648     75400    75515
18    19592    19521     37567    37263     75573    75626
19    19627    19536     37707    37430     75798    75745
20    19661    19556     37958    37461     75834    75868
21    19666    19607     37976    37507     76240    76161
22    19701    19624     38060    37557     76426    76162
23    19708    19643     38244    37717     76770    76333
24    19748    19684     38272    38285     77011    77009
25    19751    19694     38467    38294     77114    77168
26    19776    19695     38524    38469     77630    77318
27    19781    19709     38625    38642     77865    77550
28    19786    19765     38756    38643     77912    77904
29    19796    19823     38939    38649     78242    78736
30    19826    19847     39139    38659
31    19837    19899     39208    38713
32    19849    19909     39211    38837
33    19854    19932     39230    38876
34    19867    19949     39249    39088
35    19891    19990     39259    39148
36    20038    20085     39286    39453
37    20083    20128     39435    39563
38    20143    20166     39448    39959
39    20191    20198     39475    40495
40    20437    20455     40375    40664

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#70Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Dilip Kumar (#69)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Mar 19, 2016 at 3:22 PM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Mon, Mar 14, 2016 at 3:09 AM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

I've drawn graphs for these measurements. The variation doesn't look
random here: TPS goes higher from measurement to measurement. I bet you
ran the measurements sequentially.
I think we should do more measurements until TPS stops growing, and beyond
that, to accumulate average statistics. Could you please do the same tests
but with 50 runs?

I have taken readings at different client counts (1, 2 and 4).

1. Avg: the average is taken after discarding the lowest 5 and highest 5 readings.
2. With 4 clients I have only 30 readings.

Summary:

I think if we check the avg or the median, at least at 1 or 2 clients head
always looks like the winner, but the same is not the case with 4 clients.
And with 4 clients I can see much more fluctuation in the readings.

         Head(1)  Patch(1)  Head(2)  Patch(2)  Head(4)  Patch(4)
Avg:     19628    19578     37180    36536     70044    70731
Median:  19663    19581     37967    37484     73003    75376
(Head(N) / Patch(N) = TPS at N clients)
Below are all the readings (arranged in sorted order).

[full 40-run table snipped; see #69 above]

So, I think there really is some regression here at small client counts. I
have the following hypothesis about that. In some cases we use more atomic
operations than before. For instance, in BufferAlloc we have the following
block of code. The previous code dealt with this without atomic operations,
relying on the spinlock. So we have 2 extra atomic operations here, and a
similar situation in other places.

pg_atomic_fetch_and_u32(&buf->state,
                        ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
                          BM_CHECKPOINT_NEEDED | BM_IO_ERROR |
                          BM_PERMANENT | BUF_USAGECOUNT_MASK));
if (relpersistence == RELPERSISTENCE_PERMANENT)
    pg_atomic_fetch_or_u32(&buf->state,
                           BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE);
else
    pg_atomic_fetch_or_u32(&buf->state, BM_TAG_VALID | BUF_USAGECOUNT_ONE);
Actually, we can behave like the old code and make such modifications
without increasing the number of atomic operations: we can just calculate
the new value of state (including unsetting the BM_LOCKED flag) and write
it to the buffer. In that case we don't require more atomic operations.
However, it then becomes unsafe to use an atomic decrement in
UnpinBuffer(). We can instead use a CAS loop there which waits for the
buffer header to be unlocked, like PinBuffer() does. I'm not sure how this
affects multicore scalability, but I think it's worth trying.
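
In code form, the UnpinBuffer() decrement then looks roughly like this (a
sketch of the approach; the attached patch contains the real version):

	SpinDelayStatus delayStatus;
	uint32		state,
				oldstate;

	init_spin_delay(&delayStatus, (Pointer) buf, __FILE__, __LINE__);

	state = pg_atomic_read_u32(&buf->state);
	oldstate = state;

	while (true)
	{
		/* spin-wait until the buffer header lock is free */
		while (state & BM_LOCKED)
		{
			make_spin_delay(&delayStatus);
			state = pg_atomic_read_u32(&buf->state);
			oldstate = state;
		}

		/* drop our pin in the local copy of the state */
		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
		state -= BUF_REFCOUNT_ONE;

		/* on failure, the CAS leaves the current value in oldstate */
		if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
			break;

		/* retry with the freshly read value */
		state = oldstate;
	}
	finish_spin_delay(&delayStatus);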

So, this idea is implemented in the attached patch. Please try it, both
for the regression at a low number of clients and for scalability at a
large number of clients.

Other changes in this patch:
1) PinBuffer() and UnpinBuffer() use exponential backoff in the spin delay,
like LockBufHdr() does.
2) The previous patch contained a revert
of ac1d7945f866b1928c2554c0f80fd52d7f977772. This was not intentional.
Now it doesn't contain this revert.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-4.patchapplication/octet-stream; name=pinunpin-cas-4.patchDownload
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..35b5ee9
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,175 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
--- 161,176 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index bfa37f1..a5cffc7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 135,146 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 135,143 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..a839ea6
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 52,58 ****
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
- 
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
--- 52,57 ----
*************** static BufferDesc *PinCountWaitBuf = NUL
*** 163,169 ****
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a spinlock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
--- 162,168 ----
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a header lock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
*************** static int	SyncOneBuffer(int buf_id, boo
*** 440,446 ****
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  int set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 439,445 ----
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  uint32 set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 815,823 ****
  		 */
  		if (isLocalBuf)
  		{
! 			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 814,821 ----
  		 */
  		if (isLocalBuf)
  		{
! 			Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  			 */
  			do
  			{
! 				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
--- 826,834 ----
  			 */
  			do
  			{
! 				uint32 state = LockBufHdr(bufHdr);
! 				state &= ~(BM_VALID | BM_LOCKED);
! 				pg_atomic_write_u32(&bufHdr->state, state);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 848,854 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 845,851 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 933,939 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 930,936 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 987,996 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
--- 984,994 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
+ 	uint32		state;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1050,1072 ****
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the spinlock's not yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
! 		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
! 		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
  
  		/*
--- 1048,1070 ----
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the header lock isn't yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * lock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
! 		/* Must copy buffer flags while we still hold the header lock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
! 		/* Pin the buffer and then release the buffer header lock */
  		PinBuffer_Locked(buf);
  
  		/*
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1254,1260 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1252,1258 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1262,1269 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1260,1267 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1284,1297 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
! 	UnlockBufHdr(buf);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
--- 1282,1296 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	state &= ~(BM_VALID | BM_LOCKED | BM_DIRTY | BM_JUST_DIRTIED |
! 			   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
! 			   BUF_USAGECOUNT_MASK);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
  	else
! 		state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  
! 	pg_atomic_write_u32(&buf->state, state);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1319,1325 ****
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header spinlock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
--- 1318,1324 ----
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header lock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
*************** InvalidateBuffer(BufferDesc *buf)
*** 1338,1346 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
! 	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
--- 1337,1346 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
! 	/* Save the original buffer tag before dropping the header lock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
*************** retry:
*** 1362,1368 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
--- 1362,1368 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
*************** retry:
*** 1381,1387 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1381,1387 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1396,1407 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
! 
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
--- 1396,1405 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
! 	pg_atomic_write_u32(&buf->state, state);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
*************** void
*** 1433,1438 ****
--- 1431,1437 ----
  MarkBufferDirty(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1449,1462 ****
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1448,1461 ----
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	state = LockBufHdr(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(state & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1464,1472 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 
! 	UnlockBufHdr(bufHdr);
  }
  
  /*
--- 1463,1471 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	state |= BM_DIRTY | BM_JUST_DIRTIED;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  }
  
  /*
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1506,1512 ****
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without spinlock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
--- 1505,1511 ----
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without header lock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1534,1540 ****
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra spinlock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
--- 1533,1539 ----
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra header lock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1547,1569 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1546,1587 ----
  
  	if (ref == NULL)
  	{
+ 		uint32			state;
+ 		uint32			oldstate;
+ 		SpinDelayStatus	delayStatus;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				make_spin_delay(&delayStatus);
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		finish_spin_delay(&delayStatus);
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1579,1587 ****
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The spinlock is released before return.
   *
!  * As this function is called with the spinlock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
--- 1597,1605 ----
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The header lock is released before return.
   *
!  * As this function is called with the header lock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1592,1598 ****
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * spinlock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
--- 1610,1616 ----
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * header lock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1603,1617 ****
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the spinlock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
! 	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
  
--- 1621,1638 ----
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
+ 	uint32		state;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the header lock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	state = pg_atomic_read_u32(&buf->state);
! 	state += 1;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	b = BufferDescriptorGetBuffer(buf);
  
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1646,1675 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
  
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
! 		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
  		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1667,1731 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32			state;
+ 		uint32			oldstate;
+ 		SpinDelayStatus	delayStatus;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
! 		 * currently all manipulation of ->state for shared buffers is through
! 		 * atomics.
! 		 */
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
  
! 		init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				make_spin_delay(&delayStatus);
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
  
! 			Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 			state -= 1;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		finish_spin_delay(&delayStatus);
! 
! 		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
! 		{
! 			state = LockBufHdr(buf);
! 
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				pg_atomic_fetch_and_u32(&buf->state,
+ 										~BM_PIN_COUNT_WAITER);
+ 				UnlockBufHdr(buf);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1687,1692 ****
--- 1743,1749 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_spaces;
*************** BufferSync(int flags)
*** 1733,1748 ****
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
--- 1790,1805 ----
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header lock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			state |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
*************** BufferSync(int flags)
*** 1752,1758 ****
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr);
  	}
  
  	if (num_to_scan == 0)
--- 1809,1816 ----
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	if (num_to_scan == 0)
*************** BufferSync(int flags)
*** 1888,1894 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
--- 1946,1952 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2258,2263 ****
--- 2316,2322 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  	BufferTag	tag;
  
  	ReservePrivateRefCountEntry();
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2271,2280 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
--- 2330,2342 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2282,2288 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2344,2350 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2439,2444 ****
--- 2501,2507 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2456,2467 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2519,2531 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** BufferGetBlockNumber(Buffer buffer)
*** 2516,2522 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	return bufHdr->tag.blockNum;
  }
  
--- 2580,2586 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	return bufHdr->tag.blockNum;
  }
  
*************** BufferGetTag(Buffer buffer, RelFileNode 
*** 2539,2545 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
--- 2603,2609 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2573,2578 ****
--- 2637,2643 ----
  				io_time;
  	Block		bufBlock;
  	char	   *bufToWrite;
+ 	uint32		state;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2598,2604 ****
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
--- 2663,2669 ----
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	state = LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2607,2614 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
--- 2672,2679 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	state &= ~(BM_JUST_DIRTIED | BM_LOCKED);
! 	pg_atomic_write_u32(&buf->state, state);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2627,2633 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2692,2698 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (state & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2715,2727 ****
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header spinlock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2780,2792 ----
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header lock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2821,2827 ****
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2886,2892 ----
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2919,2925 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2984,2990 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropDatabaseBuffers(Oid dbid)
*** 2961,2967 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 3026,3032 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** FlushRelationBuffers(Relation rel)
*** 3057,3063 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 3122,3129 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 3078,3084 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 3144,3150 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 3093,3098 ****
--- 3159,3166 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32 state;
+ 
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushRelationBuffers(Relation rel)
*** 3104,3112 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3172,3180 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(state & (BM_VALID | BM_DIRTY))	== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 3145,3150 ****
--- 3213,3219 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32	state;
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushDatabaseBuffers(Oid dbid)
*** 3156,3164 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3225,3233 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3290,3308 ****
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3359,3378 ----
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring header lock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3313,3319 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3383,3389 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3352,3360 ****
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3422,3432 ----
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3374,3381 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
--- 3446,3455 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		state |= BM_DIRTY | BM_JUST_DIRTIED;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
*************** UnlockBuffers(void)
*** 3406,3422 ****
  
  	if (buf)
  	{
! 		LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf);
  
  		PinCountWaitBuf = NULL;
  	}
--- 3480,3499 ----
  
  	if (buf)
  	{
! 		uint32	state;
! 
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
  
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  
  		PinCountWaitBuf = NULL;
  	}
*************** LockBufferForCleanup(Buffer buffer)
*** 3509,3535 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
- 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
! 		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
--- 3586,3616 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
  		PinCountWaitBuf = bufHdr;
! 		state |= BM_PIN_COUNT_WAITER;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
*************** LockBufferForCleanup(Buffer buffer)
*** 3558,3568 ****
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
--- 3639,3650 ----
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		state = LockBufHdr(bufHdr);
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
*************** bool
*** 3603,3624 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3685,3710 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3626,3634 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3712,3722 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(BufferDesc *buf)
*** 3666,3682 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
! 		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3754,3770 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
! 		 * It may not be necessary to acquire the header lock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3704,3709 ****
--- 3792,3799 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3714,3722 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3804,3812 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3732,3738 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3822,3828 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3740,3748 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
! 
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
--- 3830,3838 ----
  		return false;
  	}
  
! 	state |= BM_IO_IN_PROGRESS;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3768,3786 ****
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = NULL;
  
--- 3858,3879 ----
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
  
! 	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR | BM_LOCKED);
! 	if (clear_dirty && !(state & BM_JUST_DIRTIED))
! 		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 
! 	state |= set_flag_bits;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	InProgressBuf = NULL;
  
*************** AbortBufferIO(void)
*** 3803,3808 ****
--- 3896,3902 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3811,3836 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
--- 3905,3928 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without header lock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
*************** shared_buffer_write_error_callback(void 
*** 3854,3860 ****
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the spinlock */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
--- 3946,3952 ----
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the header */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
*************** rnode_comparator(const void *p1, const v
*** 3912,3917 ****
--- 4004,4050 ----
  }
  
  /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	SpinDelayStatus delayStatus;
+ 
+ 	init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__);
+ 
+ 	uint32 state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (state & BM_LOCKED)
+ 		{
+ 			make_spin_delay(&delayStatus);
+ 			state = pg_atomic_read_u32(&desc->state);
+ 			/* Add exponential backoff? Should seldom be contended, though. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ 	finish_spin_delay(&delayStatus);
+ 	return state | BM_LOCKED;
+ }
+ 
+ /*
+  * Unlock buffer header - unset BM_LOCKED in buffer state.
+  */
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+ }
+ 
+ /*
   * BufferTag comparator.
   */
  static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..8b99824
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,289 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
  			UnlockBufHdr(buf);
--- 282,294 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  			UnlockBufHdr(buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 327,336 ****
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			UnlockBufHdr(buf);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		UnlockBufHdr(buf);
  	}
  }
  
--- 336,347 ----
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			state &= ~BM_LOCKED;
! 			pg_atomic_write_u32(&buf->state, state);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  	}
  }
  
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 596,606 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,625 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
  	UnlockBufHdr(buf);
--- 628,639 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
  	UnlockBufHdr(buf);
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index cc0bf5e..a2edf00
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
! 	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
  
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,150 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line)
! {
! 	status->spins = 0;
! 	status->delays = 0;
! 	status->cur_delay = 0;
! 	status->ptr = ptr;
! 	status->file = file;
! 	status->line = line;
! }
  
! void
! make_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
! 	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 155,180 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus;
! 
! 	init_spin_delay(&delayStatus, (Pointer)lock, file, line);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		make_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index d04363b..b3129c6
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 21,49 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 21,69 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable where following data is combined.
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such a layout allows us to perform some operations more efficiently.
+  * The definitions of the buffer state parts are below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 113,130 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 133,154 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
!  * the tag, state or wait_backend_pid fields.  In general, the buffer header lock
!  * is a spinlock which is combined with flags, refcount and usagecount into
!  * a single atomic variable.  This layout allows us to do some operations in a single
!  * CAS without actually acquiring and releasing a spinlock; for instance,
!  * increase or decrease refcount.  buf_id field never changes after
!  * initialization, so does not need locking.  freeNext is protected by the
!  * buffer_strategy_lock not buffer header lock.  The LWLock can take care of
!  * itself.  The buffer header lock is *not* used to control access to the data
!  * in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 142,153 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
--- 166,176 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 202,212 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /*
--- 225,235 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers! FIXUP!
   */
! extern uint32 LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /*
*************** extern void IssuePendingWritebacks(Write
*** 267,273 ****
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 290,297 ----
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 8b240cd..cc6c195
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1012 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay which could be useful in other places where
+  * spinlock-like procedures take place.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ void init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line);
+ void make_spin_delay(SpinDelayStatus *status);
+ void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
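
As a quick illustration of the packed state word defined in buf_internals.h
above, here is a tiny standalone sketch (the masks are copied from the patch;
the main() harness is purely illustrative and not part of the patch):

#include <stdio.h>
#include <stdint.h>

/* Copied from the patch's buf_internals.h: 18-bit refcount (bits 0-17),
 * 4-bit usage count (bits 18-21), 10 flag bits (bits 22-31). */
#define BUF_REFCOUNT_MASK		((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK		0x003C0000U
#define BUF_USAGECOUNT_ONE		(1U << 18)
#define BUF_USAGECOUNT_SHIFT	18
#define BM_LOCKED				(1U << 22)

int
main(void)
{
	uint32_t	state = 0;

	state += 3;							/* refcount = 3 */
	state += 2 * BUF_USAGECOUNT_ONE;	/* usage_count = 2 */
	state |= BM_LOCKED;					/* header locked */

	printf("refcount=%u usagecount=%u locked=%d\n",
		   (unsigned) (state & BUF_REFCOUNT_MASK),
		   (unsigned) ((state & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT),
		   (state & BM_LOCKED) != 0);
	return 0;
}
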
#71Dilip Kumar
dilipbalaut@gmail.com
In reply to: Alexander Korotkov (#70)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Mar 20, 2016 at 4:10 AM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

Actually, we behave like the old code and make such modifications without
increasing the number of atomic operations. We can just calculate the new
value of the state (including unsetting the BM_LOCKED flag) and write it to
the buffer. In this case we don't require more atomic operations. However,
it becomes unsafe to use an atomic decrement in UnpinBuffer(). We can use a
loop of CAS there which waits for the buffer header to be unlocked, as
PinBuffer() does. I'm not sure whether this affects multicore scalability,
but I think it's worth trying.

So, this idea is implemented in the attached patch. Please try it both for
regression at a lower number of clients and for scalability at a large
number of clients.
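
Roughly, the two ideas combined look like this (a sketch only, assuming the
patch's BM_LOCKED, BUF_REFCOUNT_ONE and pg_atomic primitives; names here are
illustrative rather than the patch's exact code):

	/* Unlock by write: recompute the state word with BM_LOCKED unset and
	 * store it, so the flag change and the unlock are one atomic write. */
	state &= ~(BM_VALID | BM_LOCKED);
	pg_atomic_write_u32(&bufHdr->state, state);

	/* CAS loop for UnpinBuffer(): wait for the header to be unlocked,
	 * then install the decremented refcount in a single CAS. */
	uint32		oldstate = pg_atomic_read_u32(&buf->state);

	for (;;)
	{
		while (oldstate & BM_LOCKED)
			oldstate = pg_atomic_read_u32(&buf->state);

		if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate,
										   oldstate - BUF_REFCOUNT_ONE))
			break;				/* on failure, oldstate is refreshed */
	}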

Good; it seems we have reduced some instructions. I will test it soon.

Other changes in this patch:
1) PinBuffer() and UnpinBuffer() use exponential backoff in the spin delay,
like LockBufHdr() does.
2) The previous patch contained a revert
of ac1d7945f866b1928c2554c0f80fd52d7f977772. This was not intentional.
No, it doesn't contain this revert.

Some other comments:

*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  */
  do
  {
! LockBufHdr(bufHdr);
! Assert(bufHdr->flags & BM_VALID);
! bufHdr->flags &= ~BM_VALID;
! UnlockBufHdr(bufHdr);
  } while (!StartBufferIO(bufHdr, true));
  }
  }
--- 826,834 ----
  */
  do
  {
! uint32 state = LockBufHdr(bufHdr);
! state &= ~(BM_VALID | BM_LOCKED);
! pg_atomic_write_u32(&bufHdr->state, state);
  } while (!StartBufferIO(bufHdr, true));

It would be better to write a comment noting that we clear BM_LOCKED from
the state directly and so need not call UnlockBufHdr() explicitly;
otherwise it's confusing.
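
Something along these lines would do (a suggested comment, not text from
the patch):

do
{
	uint32	state = LockBufHdr(bufHdr);

	/*
	 * Clear BM_VALID and release the buffer header lock in a single
	 * write: the value we store has BM_LOCKED unset, so no separate
	 * UnlockBufHdr() call is needed.
	 */
	state &= ~(BM_VALID | BM_LOCKED);
	pg_atomic_write_u32(&bufHdr->state, state);
} while (!StartBufferIO(bufHdr, true));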

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#72Dilip Kumar
dilipbalaut@gmail.com
In reply to: Dilip Kumar (#71)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Mar 22, 2016 at 12:31 PM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

! pg_atomic_write_u32(&bufHdr->state, state);
} while (!StartBufferIO(bufHdr, true));

It would be better to write a comment noting that we clear BM_LOCKED from
the state directly and so need not call UnlockBufHdr() explicitly;
otherwise it's confusing.

A few more comments:

*** 828,837 ****
    */
   do
   {
!  LockBufHdr(bufHdr);
!  Assert(bufHdr->flags & BM_VALID);
!  bufHdr->flags &= ~BM_VALID;
!  UnlockBufHdr(bufHdr);
   } while (!StartBufferIO(bufHdr, true));
   }
   }
--- 826,834 ----
    */
   do
   {
!  uint32 state = LockBufHdr(bufHdr);
!  state &= ~(BM_VALID | BM_LOCKED);
!  pg_atomic_write_u32(&bufHdr->state, state);
   } while (!StartBufferIO(bufHdr, true));

1. Previously there was an Assert here; any reason why we removed it?
Assert(bufHdr->flags & BM_VALID);

2. Also, if we don't need the Assert, can't we directly clear the BM_VALID
flag from the state variable (if not locked), like pin/unpin buffer does,
without taking the buffer header lock?
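
For illustration, such an unlocked clear would presumably be a single
atomic AND, as the patch already does for local buffers:

Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);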

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#73Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Dilip Kumar (#72)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Mar 22, 2016 at 7:57 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Tue, Mar 22, 2016 at 12:31 PM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

! pg_atomic_write_u32(&bufHdr->state, state);
} while (!StartBufferIO(bufHdr, true));

It would be better to write a comment noting that we clear BM_LOCKED from
the state directly and so need not call UnlockBufHdr() explicitly;
otherwise it's confusing.

Few more comments..

*** 828,837 ****
*/
do
{
!  LockBufHdr(bufHdr);
!  Assert(bufHdr->flags & BM_VALID);
!  bufHdr->flags &= ~BM_VALID;
!  UnlockBufHdr(bufHdr);
} while (!StartBufferIO(bufHdr, true));
}
}
--- 826,834 ----
*/
do
{
!  uint32 state = LockBufHdr(bufHdr);
!  state &= ~(BM_VALID | BM_LOCKED);
!  pg_atomic_write_u32(&bufHdr->state, state);
} while (!StartBufferIO(bufHdr, true));

1. Previously there was an Assert here; any reason why we removed it?
Assert(bufHdr->flags & BM_VALID);

It was missed. In the attached patch I've put it back.

2. Also, if we don't need the Assert, can't we directly clear the BM_VALID
flag from the state variable (if not locked), like pin/unpin buffer does,
without taking the buffer header lock?

In this version of the patch it could also be done as a loop of CAS
operations. But I didn't do so, because it would significantly complicate
the code, and it's not yet clear that traffic in this place is high enough
to justify such an optimization.
Since v4 the patch implements a slightly different approach. Could you
please test it? We need to check whether this approach is worth putting
more effort into, or throw it away otherwise.
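
For the record, the declined variant would look roughly like this (a
sketch, not code from any posted patch version; spin delay omitted for
brevity):

uint32	state = pg_atomic_read_u32(&bufHdr->state);

for (;;)
{
	uint32	oldstate = state;

	if (state & BM_LOCKED)
	{
		/* header is locked; re-read and retry */
		state = pg_atomic_read_u32(&bufHdr->state);
		continue;
	}

	Assert(state & BM_VALID);
	state &= ~BM_VALID;

	if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state))
		break;
	state = oldstate;	/* on failure, CAS filled in the current value */
}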

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-5.patchapplication/octet-stream; name=pinunpin-cas-5.patchDownload
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..35b5ee9
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,175 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
--- 161,176 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index bfa37f1..a5cffc7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 135,146 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 135,143 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..5724168
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 52,58 ****
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
- 
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
--- 52,57 ----
*************** static BufferDesc *PinCountWaitBuf = NUL
*** 163,169 ****
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a spinlock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
--- 162,168 ----
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a header lock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
*************** static int	SyncOneBuffer(int buf_id, boo
*** 440,446 ****
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  int set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 439,445 ----
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  uint32 set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 815,823 ****
  		 */
  		if (isLocalBuf)
  		{
! 			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 814,821 ----
  		 */
  		if (isLocalBuf)
  		{
! 			Assert(pg_atomic_read_u32(&bufHdr->state) & BM_VALID);
! 			pg_atomic_fetch_and_u32(&bufHdr->state, ~BM_VALID);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  			 */
  			do
  			{
! 				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
--- 826,835 ----
  			 */
  			do
  			{
! 				uint32 state = LockBufHdr(bufHdr);
! 				Assert(state & BM_VALID);
! 				state &= ~(BM_VALID | BM_LOCKED);
! 				pg_atomic_write_u32(&bufHdr->state, state);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 848,854 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 846,852 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 933,939 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 931,937 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		pg_atomic_fetch_or_u32(&bufHdr->state, BM_VALID);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 987,996 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
--- 985,995 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
+ 	uint32		state;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1050,1072 ****
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the spinlock's not yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
! 		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
! 		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
  
  		/*
--- 1049,1071 ----
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the header lock isn't yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * lock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
! 		/* Must copy buffer flags while we still hold the header lock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
! 		/* Pin the buffer and then release the buffer header lock */
  		PinBuffer_Locked(buf);
  
  		/*
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1254,1260 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1253,1259 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1262,1269 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1261,1268 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1284,1297 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
! 	UnlockBufHdr(buf);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
--- 1283,1297 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	state &= ~(BM_VALID | BM_LOCKED | BM_DIRTY | BM_JUST_DIRTIED |
! 			   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
! 			   BUF_USAGECOUNT_MASK);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
  	else
! 		state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  
! 	pg_atomic_write_u32(&buf->state, state);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1319,1325 ****
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header spinlock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
--- 1319,1325 ----
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header lock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
*************** InvalidateBuffer(BufferDesc *buf)
*** 1338,1346 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
! 	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
--- 1338,1347 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
! 	/* Save the original buffer tag before dropping the header lock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
*************** retry:
*** 1362,1368 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
--- 1363,1369 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
*************** retry:
*** 1381,1387 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1382,1388 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1396,1407 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
! 
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
--- 1397,1406 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
! 	pg_atomic_write_u32(&buf->state, state);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
*************** void
*** 1433,1438 ****
--- 1432,1438 ----
  MarkBufferDirty(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1449,1462 ****
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1449,1462 ----
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	state = LockBufHdr(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(state & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1464,1472 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 
! 	UnlockBufHdr(bufHdr);
  }
  
  /*
--- 1464,1472 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	state |= BM_DIRTY | BM_JUST_DIRTIED;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  }
  
  /*
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1506,1512 ****
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without spinlock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
--- 1506,1512 ----
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without header lock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1534,1540 ****
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra spinlock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
--- 1534,1540 ----
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra header lock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1547,1569 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1547,1588 ----
  
  	if (ref == NULL)
  	{
+ 		uint32			state;
+ 		uint32			oldstate;
+ 		SpinDelayStatus	delayStatus;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				make_spin_delay(&delayStatus);
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		finish_spin_delay(&delayStatus);
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1579,1587 ****
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The spinlock is released before return.
   *
!  * As this function is called with the spinlock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
--- 1598,1606 ----
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The header lock is released before return.
   *
!  * As this function is called with the header lock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1592,1598 ****
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * spinlock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
--- 1611,1617 ----
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * header lock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1603,1617 ****
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the spinlock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
! 	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
  
--- 1622,1639 ----
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
+ 	uint32		state;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the header lock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	state = pg_atomic_read_u32(&buf->state);
! 	state += 1;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	b = BufferDescriptorGetBuffer(buf);
  
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1646,1675 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
  
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
! 		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
  		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1668,1732 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32			state;
+ 		uint32			oldstate;
+ 		SpinDelayStatus	delayStatus;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Arguably it'd be more robust if we checked for BM_LOCKED here, but
! 		 * currently all manipulation of ->state for shared buffers is through
! 		 * atomics.
! 		 */
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
  
! 		init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				make_spin_delay(&delayStatus);
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
  
! 			Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 			state -= 1;
! 
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		finish_spin_delay(&delayStatus);
! 
! 		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
! 		{
! 			state = LockBufHdr(buf);
! 
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				pg_atomic_fetch_and_u32(&buf->state,
+ 										~BM_PIN_COUNT_WAITER);
+ 				UnlockBufHdr(buf);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1687,1692 ****
--- 1744,1750 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_spaces;
*************** BufferSync(int flags)
*** 1733,1748 ****
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
--- 1791,1806 ----
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header lock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			state |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
*************** BufferSync(int flags)
*** 1752,1758 ****
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr);
  	}
  
  	if (num_to_scan == 0)
--- 1810,1817 ----
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	if (num_to_scan == 0)
*************** BufferSync(int flags)
*** 1888,1894 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
--- 1947,1953 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2258,2263 ****
--- 2317,2323 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  	BufferTag	tag;
  
  	ReservePrivateRefCountEntry();
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2271,2280 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
--- 2331,2343 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2282,2288 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2345,2351 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2439,2444 ****
--- 2502,2508 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2456,2467 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2520,2532 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** BufferGetBlockNumber(Buffer buffer)
*** 2516,2522 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	return bufHdr->tag.blockNum;
  }
  
--- 2581,2587 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	return bufHdr->tag.blockNum;
  }
  
*************** BufferGetTag(Buffer buffer, RelFileNode 
*** 2539,2545 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
--- 2604,2610 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2573,2578 ****
--- 2638,2644 ----
  				io_time;
  	Block		bufBlock;
  	char	   *bufToWrite;
+ 	uint32		state;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2598,2604 ****
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
--- 2664,2670 ----
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	state = LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2607,2614 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
--- 2673,2680 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	state &= ~(BM_JUST_DIRTIED | BM_LOCKED);
! 	pg_atomic_write_u32(&buf->state, state);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2627,2633 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2693,2699 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (state & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2715,2727 ****
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header spinlock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2781,2793 ----
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header lock.  Even if someone else
  	 * changes the buffer header flags while we're doing this, we assume that
  	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
  	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2821,2827 ****
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2887,2893 ----
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2919,2925 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2985,2991 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropDatabaseBuffers(Oid dbid)
*** 2961,2967 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 3027,3033 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** FlushRelationBuffers(Relation rel)
*** 3057,3063 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 3123,3130 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 3078,3084 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 3145,3151 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 3093,3098 ****
--- 3160,3167 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32 state;
+ 
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushRelationBuffers(Relation rel)
*** 3104,3112 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3173,3181 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(state & (BM_VALID | BM_DIRTY))	== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 3145,3150 ****
--- 3214,3220 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32	state;
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushDatabaseBuffers(Oid dbid)
*** 3156,3164 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3226,3234 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3290,3308 ****
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3360,3379 ----
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring header lock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3313,3319 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3384,3390 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3352,3360 ****
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3423,3433 ----
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3374,3381 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
--- 3447,3456 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		state |= BM_DIRTY | BM_JUST_DIRTIED;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
*************** UnlockBuffers(void)
*** 3406,3422 ****
  
  	if (buf)
  	{
! 		LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf);
  
  		PinCountWaitBuf = NULL;
  	}
--- 3481,3500 ----
  
  	if (buf)
  	{
! 		uint32	state;
! 
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
  
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  
  		PinCountWaitBuf = NULL;
  	}
*************** LockBufferForCleanup(Buffer buffer)
*** 3509,3535 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
- 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
! 		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
--- 3587,3617 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
  		PinCountWaitBuf = bufHdr;
! 		state |= BM_PIN_COUNT_WAITER;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
*************** LockBufferForCleanup(Buffer buffer)
*** 3558,3568 ****
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
--- 3640,3651 ----
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		state = LockBufHdr(bufHdr);
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
*************** bool
*** 3603,3624 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3686,3711 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3626,3634 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3713,3723 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(BufferDesc *buf)
*** 3666,3682 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
! 		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3755,3771 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
! 		 * It may not be necessary to acquire the header lock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3704,3709 ****
--- 3793,3800 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3714,3722 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3805,3813 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3732,3738 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3823,3829 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3740,3748 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
! 
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
--- 3831,3839 ----
  		return false;
  	}
  
! 	state |= BM_IO_IN_PROGRESS;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3768,3786 ****
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = NULL;
  
--- 3859,3880 ----
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
  
! 	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR | BM_LOCKED);
! 	if (clear_dirty && !(state & BM_JUST_DIRTIED))
! 		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 
! 	state |= set_flag_bits;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	InProgressBuf = NULL;
  
*************** AbortBufferIO(void)
*** 3803,3808 ****
--- 3897,3903 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3811,3836 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
--- 3906,3929 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without header lock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
*************** shared_buffer_write_error_callback(void 
*** 3854,3860 ****
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the spinlock */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
--- 3947,3953 ----
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the header */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
*************** rnode_comparator(const void *p1, const v
*** 3912,3917 ****
--- 4005,4051 ----
  }
  
  /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	SpinDelayStatus delayStatus;
+ 
+ 	init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__);
+ 
+ 	uint32 state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (state & BM_LOCKED)
+ 		{
+ 			make_spin_delay(&delayStatus);
+ 			state = pg_atomic_read_u32(&desc->state);
+ 			/* Add exponential backoff?  Should seldom be contended, though. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ 	finish_spin_delay(&delayStatus);
+ 	return state | BM_LOCKED;
+ }
+ 
+ /*
+  * Unlock buffer header - unset BM_LOCKED in buffer state.
+  */
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+ }
+ 
+ /*
   * BufferTag comparator.
   */
  static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..8b99824
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,289 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
  			UnlockBufHdr(buf);
--- 282,294 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  			UnlockBufHdr(buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 327,336 ****
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			UnlockBufHdr(buf);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		UnlockBufHdr(buf);
  	}
  }
  
--- 336,347 ----
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			state &= ~BM_LOCKED;
! 			pg_atomic_write_u32(&buf->state, state);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  	}
  }
  
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 596,606 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,625 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
  	UnlockBufHdr(buf);
--- 628,639 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
  	UnlockBufHdr(buf);
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index cc0bf5e..a2edf00
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
! 	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
  
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,150 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line)
! {
! 	status->spins = 0;
! 	status->delays = 0;
! 	status->cur_delay = 0;
! 	status->ptr = ptr;
! 	status->file = file;
! 	status->line = line;
! }
  
! void
! make_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
! 	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 155,180 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus;
! 
! 	init_spin_delay(&delayStatus, (Pointer)lock, file, line);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		make_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index d04363b..b3129c6
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 21,49 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 21,69 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable where the following data is combined.
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such a layout allows us to perform some operations more efficiently.
+  * The definitions of the buffer state parts are below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 113,130 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 133,154 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
!  * the tag, state or wait_backend_pid fields.  In general, the buffer header
!  * lock is a spinlock which is combined with the flags, refcount and usagecount
!  * into a single atomic variable.  This layout allows us to do some operations
!  * in a single CAS without actually acquiring and releasing a spinlock; for instance,
!  * increase or decrease refcount.  buf_id field never changes after
!  * initialization, so does not need locking.  freeNext is protected by the
!  * buffer_strategy_lock not buffer header lock.  The LWLock can take care of
!  * itself.  The buffer header lock is *not* used to control access to the data
!  * in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 142,153 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
--- 166,176 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 202,212 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /*
--- 225,235 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers! FIXUP!
   */
! extern uint32 LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /*
*************** extern void IssuePendingWritebacks(Write
*** 267,273 ****
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 290,297 ----
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 8b240cd..cc6c195
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1012 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay which could be useful in other places where
+  * spinlock-like procedures take place.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ void init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line);
+ void make_spin_delay(SpinDelayStatus *status);
+ void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
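
To make the new buf_internals.h layout concrete, here is a small
self-contained illustration of how refcount, usage count, and flag bits
share the single 32-bit state word. It uses plain C11 atomics instead of
PostgreSQL's pg_atomic API; the mask values mirror the patch, but the
program itself is illustrative only:

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	#define BUF_REFCOUNT_MASK    ((1U << 18) - 1)
	#define BUF_USAGECOUNT_ONE   (1U << 18)
	#define BUF_USAGECOUNT_SHIFT 18
	#define BUF_USAGECOUNT_MASK  0x003C0000U
	#define BM_DIRTY             (1U << 23)

	int main(void)
	{
		_Atomic uint32_t state = 0;

		/* "pin": bump refcount and usage count with a single atomic add */
		atomic_fetch_add(&state, 1 + BUF_USAGECOUNT_ONE);

		/* set a flag bit without disturbing the counts */
		atomic_fetch_or(&state, BM_DIRTY);

		uint32_t s = atomic_load(&state);

		printf("refcount=%u usage=%u dirty=%d\n",
			   (unsigned) (s & BUF_REFCOUNT_MASK),
			   (unsigned) ((s & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT),
			   (s & BM_DIRTY) != 0);
		return 0;
	}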
#74Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#73)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Mar 22, 2016 at 1:08 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Tue, Mar 22, 2016 at 7:57 AM, Dilip Kumar <dilipbalaut@gmail.com>
wrote:

On Tue, Mar 22, 2016 at 12:31 PM, Dilip Kumar <dilipbalaut@gmail.com>
wrote:

! pg_atomic_write_u32(&bufHdr->state, state);
} while (!StartBufferIO(bufHdr, true));

Better to write a comment about clearing BM_LOCKED from the state
directly, so that we need not call UnlockBufHdr explicitly;
otherwise it's confusing.
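
For instance, a minimal sketch of what such a comment could say on the loop
in question (illustrative only, not the patch text; it assumes the patch's
LockBufHdr(), which returns the state word with BM_LOCKED set):

	do
	{
		uint32	state = LockBufHdr(bufHdr);	/* BM_LOCKED is now set */

		/*
		 * Clearing BM_LOCKED in the same atomic write that clears
		 * BM_VALID both updates the flags and releases the header
		 * lock, so no explicit UnlockBufHdr() call is needed here.
		 */
		state &= ~(BM_VALID | BM_LOCKED);
		pg_atomic_write_u32(&bufHdr->state, state);
	} while (!StartBufferIO(bufHdr, true));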

A few more comments..

*** 828,837 ****
*/
do
{
!  LockBufHdr(bufHdr);
!  Assert(bufHdr->flags & BM_VALID);
!  bufHdr->flags &= ~BM_VALID;
!  UnlockBufHdr(bufHdr);
} while (!StartBufferIO(bufHdr, true));
}
}
--- 826,834 ----
*/
do
{
!  uint32 state = LockBufHdr(bufHdr);
!  state &= ~(BM_VALID | BM_LOCKED);
!  pg_atomic_write_u32(&bufHdr->state, state);
} while (!StartBufferIO(bufHdr, true));

1. Previously there was an Assert; any reason why we removed it?
Assert(bufHdr->flags & BM_VALID);

It was missed. In the attached patch I've put it back.

2. Also, if we don't need the Assert, can't we directly clear the BM_VALID
flag from the state variable (if not locked), like pin/unpin buffer does,
without taking the buffer header lock?

In this version of the patch it could also be done as a loop of CAS
operations. But I don't intend to do so, because it would significantly
complicate the code. It's not yet clear that traffic in this place is high
enough to justify such an optimization.
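
For illustration, such a CAS loop might look as follows (a sketch only, not
part of the patch; it assumes the patch's state-word layout and the existing
pg_atomic_compare_exchange_u32() API, and real code would use the spin-delay
machinery instead of busy-waiting):

	uint32	state = pg_atomic_read_u32(&bufHdr->state);

	for (;;)
	{
		/* cannot modify the state word while the header lock is held */
		if (state & BM_LOCKED)
		{
			state = pg_atomic_read_u32(&bufHdr->state);
			continue;
		}

		/* try to clear BM_VALID atomically, without taking the lock */
		if (pg_atomic_compare_exchange_u32(&bufHdr->state, &state,
										   state & ~BM_VALID))
			break;

		/* on failure the CAS refreshed 'state'; just retry */
	}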
Since v4 the patch implements a slightly different approach. Could you
please test it? We need to check whether this approach is worth putting
more effort into, or throw it away otherwise.

Could anybody run benchmarks? Feature freeze is soon, but it would be
*very nice* to fit this into the 9.6 release cycle, because it greatly
improves scalability on large machines. Without this patch PostgreSQL 9.6
will be significantly behind competitors like MySQL 5.7.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#75Dilip Kumar
dilipbalaut@gmail.com
In reply to: Alexander Korotkov (#74)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Mar 25, 2016 at 8:09 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Could anybody run benchmarks? Feature freeze is soon, but it would be
*very nice* to fit this into the 9.6 release cycle, because it greatly
improves scalability on large machines. Without this patch PostgreSQL 9.6
will be significantly behind competitors like MySQL 5.7.

I have run the performance tests and here are the results. With the latest
patch I did not see any regression at lower client counts (median of 3
readings).

scale factor 1000, shared buffers 8GB, read-only

Client      Base     Patch
1          12957     13068
2          24931     25816
4          46311     48767
32        300921    310062
64        387623    493843
128       249635    583513

scale factor 300, shared buffers 8GB, read-only

Client      Base     Patch
1          14537     14586  --> single-client number looks a little low; I
                                generally get ~18000 (will recheck).
2          34703     33929  --> may be run-to-run variance (once I get time,
                                will recheck).
4          67744     69069
32        312575    336012
64        213312    539056
128       190139    380122

*Summary:*

Actually, with 64 clients we have seen ~470,000 TPS on head as well, by
reverting commit 6150a1b0; refer to this thread:
/messages/by-id/CAA4eK1+ZeB8PMwwktf+3bRS0Pt4Ux6Rs6Aom0uip8c6shJWmyg@mail.gmail.com

I haven't tested this patch with commit 6150a1b0 reverted, so I'm not sure
whether this patch can give even better performance.

It also points to the case that Andres mentioned in this thread.

/messages/by-id/20160226191158.3vidtk3ktcmhimdu@alap3.anarazel.de

Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#76Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Dilip Kumar (#75)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi, Dilip!

On Fri, Mar 25, 2016 at 8:32 PM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Fri, Mar 25, 2016 at 8:09 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Could anybody run benchmarks? Feature freeze is soon, but it would be
*very nice* to fit this into the 9.6 release cycle, because it greatly
improves scalability on large machines. Without this patch PostgreSQL 9.6
will be significantly behind competitors like MySQL 5.7.

I have run the performance tests and here are the results. With the latest
patch I did not see any regression at lower client counts (median of 3
readings).

scale factor 1000, shared buffers 8GB, read-only

Client      Base     Patch
1          12957     13068
2          24931     25816
4          46311     48767
32        300921    310062
64        387623    493843
128       249635    583513

scale factor 300, shared buffers 8GB, read-only

Client      Base     Patch
1          14537     14586  --> single-client number looks a little low; I
                                generally get ~18000 (will recheck).
2          34703     33929  --> may be run-to-run variance (once I get time,
                                will recheck).
4          67744     69069
32        312575    336012
64        213312    539056
128       190139    380122

*Summary:*

Actually, with 64 clients we have seen ~470,000 TPS on head as well, by
reverting commit 6150a1b0; refer to this thread:
/messages/by-id/CAA4eK1+ZeB8PMwwktf+3bRS0Pt4Ux6Rs6Aom0uip8c6shJWmyg@mail.gmail.com

I haven't tested this patch with commit 6150a1b0 reverted, so I'm not sure
whether this patch can give even better performance.

It also points to the case that Andres mentioned in this thread.

/messages/by-id/20160226191158.3vidtk3ktcmhimdu@alap3.anarazel.de

Thank you very much for testing!
I also got access to a 4 x 18 Intel server with 144 threads. I'm going to
post results of tests on this server next Monday.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#77Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#76)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Mar 26, 2016 at 1:26 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Thank you very much for testing!
I also got access to a 4 x 18 Intel server with 144 threads. I'm going to
post results of tests on this server next Monday.

I've run pgbench tests on this machine: pgbench -s 1000 -c $clients -j 100
-M prepared -T 300.
See results in the table and chart.

clients    master        v3        v5
1           11671     12507     12679
2           24650     26005     25010
4           49631     48863     49811
8           96790     96441     99946
10         121275    119928    124100
20         243066    243365    246432
30         359616    342241    357310
40         431375    415310    441619
50         489991    489896    500590
60         538057    636473    554069
70         588659    714426    738535
80         405008    923039    902632
90         295443   1181247   1155918
100        258695   1323125   1325019
110        238842   1393767   1410274
120        226018   1432504   1474982
130        215102   1465459   1503241
140        206415   1470454   1505380
150        197850   1475479   1519908
160        190935   1420915   1484868
170        185835   1438965   1453128
180        182519   1416252   1453098

My conclusions are the following:
1) We don't observe any regression in v5 in comparison to master.
2) v5 in most cases slightly outperforms v3.

I'm going to do some code cleanup of v5 on Monday.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

graph.pngimage/png; name=graph.pngDownload
��O_���Nn:7_������WtQ���r��7g=�������E��K����[tlcs��cn�'o|=�k[cE�`�
 o�5�RB�:ySG��M����Fn�a�6�}h���.
@���F�����Y�����-e�s��y8�zur8u%��>|��[++E�H�
Pm��'s�����,g-]6'�Rmog$���l��>g�E2A��u'/l�R�(�T�8�}��$l'l�T�����(U9�UM��L��v���0n:(���+�;9(�(�AV�*b��"���x���A��?��Z���I��`�:�����������z��S�:6���h��@v����9$�
7Y ���c��D��a[���Zs}�An���M�~������{;+�(�@VY�3��5$~+a�����F������VtY�b�����(����Mg�h#T����1BV��b�O�g?�<~X�)j��="��S��	c=���v�y�9n�Q�����'
����Y��V�I�C��$-���S�/4[�w����?}6������Di�.�Nz s��i���Ux��z��[������M/�~~���5�p�jht�56#�D�x=J(9�p���C�h��+�X����c��5D�����{���X��D��>����87�:}��A�����P5�
�%ra����Ke-'�
ar���<Em��m�x��C{�����NJ�?�#��~�
0qh'."<{��M����������]�\!+4jy���A�c�Z�H3��y���?�l��[��b��,��>� �nbE�>D�h+�jBUY��	='�t��N��Y�����+�(�BVh�2���/�K�[�7�	�S�c~�.�Ku�K���h6�� ����n��"�#I�z����]�����&d�A&�?�j[�5QR�f,��N!�`�"���W�����������l�7L�E��T�!��(����(>�Q+w��2�������C+��<�����fL�f����5Y*6�����P�����I����(�(��(j��>=&��M�v��S������d(���@�������4��,z�7������=����v��>�����������j
�B�&N+��� P�����[�������Z4����+�W��M���@�����!m+\�rq�W� F��_S�����x������f��N��r�@�)6#�D�OK��LV�w'��4RU�`sS����3�����
���x����d3[�����E�����o�{��c�����c�%���=w)9����S'g��c�U�
d�&�Ebf\}�9fGE#kj�,����U�3`���}��b�������&?	���K�8[����i�!�J�����M��q�F�������L]�c��;�s�\=��� _���R����s����9�*u2a��1�~Yv�����
��FGHO#d2�F�*Y!&h/���DP����>1"��xdP�f&����u�7o����Nn��}�/���Y�������������O�?u���S��K�������5s��C'{���U��
��>�.����;>�t��2[\��vQP���������w6C#r��E������#���>������#���:r,s��3��=stW�]���"����v�<�4�:�H��[o�����{�l�E���}RgoY��h�QF��6q=����>��^��7��re����H����Jm���.�C������a�����$='<�el!.�%G�;��Y�gpj~FlD\���v��]Y�4>� <�Qg���

��^V�+d�J�}�1���jk.|Xx�OS�^/�6���R�GP�"�e������qz������OE�LI���Mu�W���>��S���}q���?��"Q�I����r�"UM����T��"��}�aaa�W�w�^�~��DO�������Z�=�+nl-H��u�>�uqK�������W����B.�����9y�x_��i�i�P��
�b�O�s�p�Pg��*��������d,?�o�t�
l��s��_S]P��t����V�+�B�4ro������M6������Ni9�S;(�L�-H8��HW����3��_��{��a�Sj����G�=���Y�:��/
@���k�Ya�7_��I������3\rx�����{��y{�����q#�K��'%gbe��g�O�����!�������X��D�����<Sc��6+-��XO�@����
4d�O�M_qj�U��C5����^��-TK'j�N����|��XE�r��CPL���01���z�7_��$����������kI����������+������*�Z:Y��'D���b@!:�-�
��q�����Z��<�x�Q��VjF��vUC�C����������_��ue��<o��P�B��D���K	f��k�
���g����*j�Ya����%s������u���� o�u�����:`��-���,��F)�<z������/�4(!*`��6��>m��zC�X�X9g�@���z�5�}b2��1�huY���W�`���n[�����|-I����A;v���4�HW�������SX�
������?�����1U���gH���z)����6����#�]G,���w����{N.��}�cjT������e��B����t��mcme�)���1w�Xw��t������4vS%V��<MD����b#�Pm\�;)s�&Q6���a�_�'.<�^d�������w�w�������W���+6Qd�<�����cS,�Zz��JN:�E+����|3����eWO-5�v�{}� �[@@`J&��}�6��U����O+�*�2����rV7#���;��z>Qj��^-������=�M����������EE�P��(����HE����Gi?����x�H����a�l {W��F[B�T����,�����)QA[��y#��U��5Z����EB��CW�����������&����pO���"���P��b��G%[�u�Vl���M���v���i]��M�J����V..��]@� +4@n�I�yz-��EQ�����#J���]�g�w�/7_���>�}����[��)W�)@����13>XX4�����W�K/�V��������gC��xn6,"v�������DW�-@�����Q�n����L��4.z��'��t����Tn������`����"d����kz��D�b�4�=����R3r��='o�i-��G�uT>H"m�Eh���P��L����m�y�}7���hj����N��@f�
�PA~+����
��]��?�3w/����~��QS}�O��Rt]�AV���7�x�"�!���b%%���D����@��l�����/n�K���k�aB''[E� d�z��������*�z��b���P�L����������s�
>�zg��)���������'���~�37_:�����W�X��(�=���Dw��C�L�������tJzv��������FP? +���glKH���/x8A���)j��	I��Gq���)\\�6Y[7J�zY�~��Q�m
�,�� N������Jrs�>e���_>�����^p��������OPQ���P��}�~�n��S���iK6���s�DL�t�G����M��Ck�:NKC��Z����y����=���}�����J��@Q���m��9x_2-b���5f���6��0��-i��1kB����P��h�������)�����G���R���=�l;���{��@V��.]�:me !��h������I"�Z���tKg�:���
�d`omtd���_n���O=���m�Q��y7V��4��!+�iQ����`�X �]�{���iR[��][T�[k��>�~�IEP)�
uWr��OflO��������
��O�M;�����XG�%T�B���4r��/�g�u����B�E��?��[4*�
u�;c�����C)�2����+��Q���n�-
d���e�m]��t�dN��=�wNOU@m<@�tTli O�_:~`��*�����6~�=��-�������L9������7.N�W8;X|�����Nz���./d�����?F-�3B����k������t�.�-","�{�����K�1�����\I@+�(ht���WO�}2�RZ��d�� |��cj������-+/_|���a�/����<6��l��)gs��HP
�
uEn��a�<N�������rr�Z��v�Tla�(��������	��-��3V�	��������VO[�vT\�OWU�Fv�*CV����/��J��d����\��C{tSlaPK*3~��)}���p��o'n����_�Hzlb5l���M����r��TXu+�FY�����%{.K��>�s����f��#�����L�2_���{s����}���;q#�������&j�Rdya_�$/V��'�L�1��g>����Q���`?��@��%t4�&v��c������	�Z�� 'n8zn\���c�{�XJw�-���>jfm�����.���ql�6 +�%�p���{�-�Mw�#�r�^{�����n9W��'���E[����pi��HOT��\oF�?T��q�$@c��P�,Z�K����*�>��T���U�Y!=+�{�i�����v;�&�-��ka�=�i�����l�?�W�fg���v�P� +�!J=?Vt	�����r����Jfm��\�r����bk��	Y���0���O�i�aJ�m��E����-"W��(��Y�:8�P/ +�	�"��u��[h�	r(������
{�����/B#��E<��,:���G*8AHn^AMUei���erM��
YA^t+��K9���h`�T����mf^�
V������j[�!iLI���
�xv�{��y�T��u��C�����6�z�����B����6�-��I�����R��>�T���_44�V�������B��t4����q�lbJ&����Y��|�?������5��m���`k�om��M��[��3Y��������F/�������
�������{T�������f��kak|�Q������&=~���UZ��KVFlB
�����MLt,�t,�������rM�6���d��{���e%�6U��r�����@��(^q�T
���e����Kfs���H���x�����'�%��&%�gW��$d��V�h
��j�&:M�����N�� 3��B�����_$�;89}��A����f&�?�0.�����g'ua�&�����j�����a�]r�>9>���K����:9;mP���4&����KO7*2O��*�!�����,�,EmM�7gU�JvV�v�'L�)%=��W�*����U���oJ����V�'VM������:��17��[=�G��pd�O�&�'"MAD\��S���g����'!�<<#b)g���{6�:]	2���IC=<��ppqy��ks�BP'����#�z����,����8�1�.��#��&d���{}s����LV�R���0����-	�T:�Q�UrFY+8�_%�q*��	Mu���\����	)q��bY*-�~��*�J&z�F�"#}M#=���&?[8�5�k�j�������hd�
�	�M�8������o��P:�a��)K��
Z�e���'Fd�����$��^�����m��z��
s�O����3kt��W�]��|}����	s��N�5v�t�������~�f��2b�d/;������&���{Q��,S�l���s�\�%�Cn�m�Z�\V�1�������c�S�����	����%�Y��[Z�>s�
��L~R��
J��E�|;��������
$CV��e>���nc�����<2��b����8q��#Go�����t����{��������_8z������]�vM���|G����4�$�ph"Y�o���������V;������v��R��_��f�|������Jf��
w��C��w+Ehs#m�����;�
���Wi�[\#?��r��i�/F���&4�Tk����;���}���rV07�nnm\�
��!+h6q^���x�����X��e���#���{��Nvm������q�C�
�;w!d������(BD������n��	/��U��
y/�mY�Cz��->��-C}�PGTr�F%����ne������Q�H��_>ge�U���G��exk�+KBCz�.vxM�R�j��K���������`m�;����A�U�� �\���6O����a������g-m��;<115!��k��������/���Xi�\�������$kW�����������w�M�����o�L�E�����9���F��.�uz|RF|at�?m>S�����8+�ql�F����IO[]�{h��I��-��kWU���h>������)��~���p5i��gz����M[�]��_K�d�II%���E������%��a�����R�P����P��"""��������2���b+�k�����G;Sr_���F������6�;����nZVAZV>���#f*�Uw9.�D������*��O��u���@�T�A�����E*�����_�
��5������1�9+�������{N8uh���?n'2����y7���+~Rr6!V�zz��$.�&!C$��i)�������&!�qIy���]lVZ������]�S�C�w�?eoooY�*�����d��L����z[v�������|��f��T����t�wN��X��N�LH.9��?
g��W��X��&/�RgQL���F\�$�TW�������"
U-
������H]YS��%����r�������2/Wl|��[�����������|w-�Jm�-+\9�����\���\��^�������/�am�+������*�Z:Y�����\��A��4��6����G;h�o�����G�F[U����5�����P�S�K-��+�@P���h�
L�R�N�����v�iY�)YIiY�Y���'�@�u�=����7��f���(��(�)UU���2��M+��-J�tn^u/d�C�k&���7���g'O��H���+)���[���_�h��������r�����j�����w�.^�i��I����o�����XM�7�����s�
��h�'X3�'&��3�V����Zz]�(���m?��.
:T��_&�����3'OK?�sCr��p"�{��������$����������a�!+��{��LB��������#�Yg�.]�b���nC��86���rt��o�������sr�����S������6-��bu7o���������� *.~��Mc�-�%2wUw0�!�����Z2Gv�L/;gG��oQ�.
���XO���(v����������Y9�YyY�iY�9�s���^�(+'����e�!+��;-���w��Q�6��E&�Cn?��06�����{��-Zy%d�������,�zzh�I/Mn���I�S2���;�����W��:"�����D9�4���������-	�q:T���a���gJy�L��3��3�-''?;� '/?'��.���p��yU�7402du]cg�2/PTRyt�[�EKD����x�H����a5��������il��dvH3M�����:x����+�K�k++.+l�~H�������]TC�A���L�6��7�^~*M��&�u�DJ�n���:�w���.�<�
���r��e��K��������&�-
�3C���Kaj��=O��KY+�����XL�����
>5k�>��������}����?���}�e��v_e����`��3W�W�.��������#Vk������E��Z4v5��BCM���
��Pu�/�_��E�t��MU�^4Y�%�8d�*b3�O[t��������VL"�*
-
��!+�+�>I/�$ �`��S;�[K�H��ec�-,�Y�| +�G����ZK��D���=�����"yCV���]<?��*j�BuM�y��K�E�
��Z��u	P���<�
o�{I��+��:Y�-�/��/<�����n`��4��JP,d��x|O*��D
��vj�����]@-BV�� +@y��<�
2�����?B�U�	����~�]�� +�>@c���AV�� +�x����K�s]V�����yL�<K��|J]������S�hH��k��5���f���Z�s��jtY����e/�'j��I��212�^"����F�*���c�M�]@���P�&m]@]���AV�� +���/�j�^�U�	
?+(���=��f�7��X�V5�M��
U�o��
�
d(O��
)/.��[�U�{
6+����T6+L3���yI���PkE�?
6+�d�u��P�����c�����Q������C���)���DP�^
9�g�W�l�9��26���b�\�,��{����"(��^}F��O��k��3��q3-��+�F���e���^��������@���>e�G��&�u���i)s��Sg�/Zu��������2����h��SV�
�"D���J:/4pvsLxY�����&�)1���
�r�k�L��>e���LB�����[T5�"���kpA�{FP(_}�
�c���~�J������0n")=��m[Yk��I�9�����wo�	P����`j�IHx\R���d����bb�����P��x�2��m���fnf�eoo��,�<\@Q�O/K������PU����H]-U���j[}�
M��q�����Zpl�����mU�����v�jd�W�����`�:�����s�
��h�'X3�'&��3����T�y7VH�%Y���Dl:P59�P����@����m�������NS?b����\�P�j��6�h��UV �y�O$uL�dZ�����R�4p�,+pD�&}St�E��
UP7O�EU2��������U��Z��I�-��hY�Y����AV�� +@y��<
<+$���=w)9����S'g��-jK�����_$�;89}��A��������g�%��:����.T@�ln����z�vv�Fu�*qnf������5i��gWu!U��|���s��m]==���RXUI/G<Jh��^[��'�Y�{�r��{5t�zz�0�V�gU9�I�N�~�����C��=u4�u��"l�������-�������~�zX�@��O�^F�(���J�?y����3[����r�G���S����DC�
W�l�9��26���b�\�,��{����j��-?
�8��4q��N��.�5R����t�����\�-C�����t%�\S�5n�����#_������k�[Uvbt�.���Nm���/������F�
~���b�h�
��ZZ)������O@�_&U���z�W3�2����������as��zx�����444S������N���������(��pt�!2�?p��W3C�VU���Mm����������SU����s�:�|&���S����"����|�������=����Ytl���K����;���H>U���)��F��f����|��z]:n���w����s��:��^��$�7a���n�_:�I_C������,Y�+h��]�N��az<2�_3��k{�:����}��I�,/&h/���DP��*������>	{:����C�����]�VL����
��N���{���~��K��X�f����/���B�������?�r9����2��Nn���o.���Y��������������fMU�������:,����~L��v�����k�
�V%��������&������r
���g.(����|��)/�tpl�z���w)��*X8�W`����jS^�v�o���%}~_-�����)��F��f�G��&�u���i)s��Sg�/Zu����������\��o��6�[}
��!c},�,I��g<<r����k�?t������!����\9IC^;�������w6C#r��E�U1Y1;�����
�l����>��TgD+�����LB�����������L�NLR�k�8[ny��S���e������;z���k�4l��wd?�]!O3;ZVp���WE
�\{����9k�������*��W�UI��y�~����O��E>U�]���,����3���1ku�Y�����_.&#�Q���������>loq%5[>U���)�?����f���(BD�����Bg7������D���������]�%���.�el!.���u�k����T1�[VX����t���ftZ-iQlU�"����#��?���Mjh}1������J��y3}rb����7��N��-����U��)s�����o6�YL~FlD\���v��]Y�4>�f�:K��(i�-Y^RdbdPH���9�����
�_��p�����OL'�� ���,� ����f~xh���a�%�����-�r�Z-�������g�Z5�{z���K�'��OU2�����i�Y!>��*he��\�����X���9���!_��i����.���Z�Mu��jis���)�f���[v���;N�=��~��B$3%I�U���r�z�w���{��;��o�2,�%U�U�jf����:V��������>[>eX�W7"X�;���KLMH$��%��TD�y^�/	1�g�����~D��>o)�TlU�������������O�����J�/�]+��o��I����oX��l��o�9)�0z��W��[;�75� :��E�~��%�r��R��u���i�Y�}���
���>���F��n.��R"�D\�\R���X��jRAZ���~.�N�e�\�{D�W	EV���}c'=?[~j�W��o�����~���X�����E:��O�ma%��~��������@���{�(�.��}�&V���.�k�IOo{�p=�>|������ThU�6,Y���%��j��Zd��g�v��<��F3K�{9�\8{�W��)�0v��11�:�����"���������k���gU�}���
O��
�&�\��K�35��Z`��RL����������{N8uh�����)2�C���q#�K�nRr6!V�Z��J���/n��l^�r���=��k����3��Y��)���'}$oz6��[���������{z��[���>�n��������|�f���+��w�����g�O��n2D�����=[����'a��3 C��r��NM��U�T`U������=�okX�9���qLtb|��3�Z��*��R������YZUo���]~;�R��U����5�c��8w,7��{���W}�r���r�J���:�� 5�������<���>�x�Q��VmW+�����=��r��rQ�U3��>y�_�D��-l`����ck�+����Y���K���p� &�D�t=��^V�ca�CH��hB�3�r���_���4���h�	�s��Z��0���e��j�dy<8:�I]��W����|
K���}���/\0/>&H)����/����.:������$�>>��G�(��"J:��:���.n�����0E�\b������������[���r�J��OE��7T
6+X��b1c��q=.Z�	�������%�Jr���7���g'O��H�-lTRVQS������������Y�_���]�E3����l����o�������_7/��(�*U��=l7����'�����m����>C�i��V�f6���5[�X:U�&�C��}9��+EU���.fIZ\7������*�e�4j�����n�`�����%��l{��5V�GU�����F��'���t��?���������-��������1���+�����
SWU4v����;0��an�������m�/����GT���&�������P������#�ul����Q����rz����Pw��-�z{�X[�i
���G��4��B����{��LB��������#�Yg�.]�b���nC��86���rt��o��+$�!=����B�R�e���^����}`�0������>UlUV]��s|�/�,��`�~)���c���RLU�/�h�G�uij�=[���������6�����\�;x�*���p����M�e���*z����q��MuJV6��~����^�7�b�����#���-l�w+C��7u���yv���lZ�Y��V�?"W��}��?��}����p+�R������p�XUU��SN���L��
��S�O$uL�dZ�����R!e��;-���wN�T��wh�L���~|9�al��]K�6B������-%]��f��J������?��]`j����������V�5|JD����<���7uvw�\:�����'n��f�jS��W-Zy%d�������,�zzh���a�2�R�����sN����B�zg����?�)z'�CU�F��m�z}�?go?M��1��g7-5%9VvU^#�=�p>0!3{��s�u����,�����)��F�!g�H����a��A]��Y��ku�TD��z���RPv-Z�9����j:}�y�����L��Y;��.����Z�-kY��(9��������s���V��L��L�j���HE�	]=�������K]��kH����[UU�?�����4������AV�� +@y��<�
Pd(��Y�����?1��(c%IEND�B`�
#78Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#75)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-25 23:02:11 +0530, Dilip Kumar wrote:

On Fri, Mar 25, 2016 at 8:09 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Could anybody run benchmarks? Feature freeze is soon, but it would be
*very nice* to fit it into the 9.6 release cycle, because it greatly
improves scalability on large machines. Without this patch PostgreSQL 9.6
will be significantly behind competitors like MySQL 5.7.

I have run the performance tests and here are the results. With the latest
patch I did not see any regression at lower client counts (median of 3
readings).

scale factor 1000, shared buffers 8GB, read-only

Client   Base     Patch
1        12957    13068
2        24931    25816
4        46311    48767
32       300921   310062
64       387623   493843
128      249635   583513
scale factor 300, shared buffers 8GB, read-only

Client   Base     Patch
1        14537    14586    --> single-client number looks a little low;
                               generally I get ~18000 (will recheck)
2        34703    33929    --> may be run-to-run variance (once I get
                               time, will recheck)
4        67744    69069
32       312575   336012
64       213312   539056
128      190139   380122

*Summary:*

Actually, with 64 clients we have seen ~470,000 TPS with head as well, by
reverting commit 6150a1b0.
Refer to this thread:
/messages/by-id/CAA4eK1+ZeB8PMwwktf+3bRS0Pt4Ux6Rs6Aom0uip8c6shJWmyg@mail.gmail.com

I haven't tested this patch with commit 6150a1b0 reverted, so I am not sure
whether this patch can give even better performance.

It also points to the case Andres has mentioned in this thread.

/messages/by-id/20160226191158.3vidtk3ktcmhimdu@alap3.anarazel.de

On what hardware did you run these tests?

Thanks,

Andres

#79Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#77)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-27 12:38:25 +0300, Alexander Korotkov wrote:

On Sat, Mar 26, 2016 at 1:26 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Thank you very much for testing!
I also got access to a 4 x 18-core Intel server with 144 threads. I'm going to
post results of tests on this server next Monday.

I've run pgbench tests on this machine: pgbench -s 1000 -c $clients -j 100
-M prepared -T 300.
See results in the table and chart.

clients  master   v3        v5
1        11671    12507     12679
2        24650    26005     25010
4        49631    48863     49811
8        96790    96441     99946
10       121275   119928    124100
20       243066   243365    246432
30       359616   342241    357310
40       431375   415310    441619
50       489991   489896    500590
60       538057   636473    554069
70       588659   714426    738535
80       405008   923039    902632
90       295443   1181247   1155918
100      258695   1323125   1325019
110      238842   1393767   1410274
120      226018   1432504   1474982
130      215102   1465459   1503241
140      206415   1470454   1505380
150      197850   1475479   1519908
160      190935   1420915   1484868
170      185835   1438965   1453128
180      182519   1416252   1453098

My conclusions are the following:
1) We don't observe any regression in v5 in comparison to master.
2) v5 in most cases slightly outperforms v3.

What commit did you base these tests on? I guess something recent, after
98a64d0bd?

I'm going to do some code cleanup of v5 on Monday

Ok, I'll try to do a review and possibly commit after that.

Greetings,

Andres Freund

#80Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#78)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Mar 27, 2016 at 5:37 PM, Andres Freund <andres@anarazel.de> wrote:

On what hardware did you run these tests?

IBM POWER8 machine.

Architecture: ppc64le
Byte Order: Little Endian
CPU(s): 192
Thread(s) per core: 8
Core(s) per socket: 1
Socket(s): 24
NUMA node(s): 4

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#81Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#80)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-27 17:45:52 +0530, Dilip Kumar wrote:

On Sun, Mar 27, 2016 at 5:37 PM, Andres Freund <andres@anarazel.de> wrote:

On what hardware did you run these tests?

IBM POWER8 machine.

Architecture: ppc64le
Byte Order: Little Endian
CPU(s): 192
Thread(s) per core: 8
Core(s) per socket: 1
Socket(s): 24
NUMA node(s): 4

What's sizeof(BufferDesc) after applying these patches? It had better
be <= 64...
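
(The 64-byte bound is one cache line on most current CPUs: if a descriptor
fits in a single line, pinning one buffer never bounces a neighbouring
descriptor's line between cores. As a minimal, self-contained illustration,
such a bound can be enforced at compile time; ToyBufferDesc below is a
hypothetical stand-in, not the real BufferDesc:

#include <stdint.h>

/* Hypothetical stand-in for BufferDesc; the field layout is illustrative. */
typedef struct ToyBufferDesc
{
	uint32_t	tag[5];				/* buffer tag: relfilenode, fork, block */
	int			buf_id;				/* buffer's index number (from 0) */
	uint32_t	state;				/* flags | usage count | refcount */
	int			wait_backend_pid;	/* backend waiting for pin-count drop */
	uintptr_t	content_lock;		/* placeholder for the content lock */
} ToyBufferDesc;

/* Fail the build as soon as the descriptor outgrows one cache line. */
_Static_assert(sizeof(ToyBufferDesc) <= 64,
			   "buffer descriptor must fit into a single cache line");
)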

Andres

#82Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#79)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Mar 27, 2016 at 3:10 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-27 12:38:25 +0300, Alexander Korotkov wrote:

On Sat, Mar 26, 2016 at 1:26 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Thank you very much for testing!
I also got access to a 4 x 18-core Intel server with 144 threads. I'm going to
post results of tests on this server next Monday.

I've run pgbench tests on this machine: pgbench -s 1000 -c $clients -j 100
-M prepared -T 300.
See results in the table and chart.

clients  master   v3        v5
1        11671    12507     12679
2        24650    26005     25010
4        49631    48863     49811
8        96790    96441     99946
10       121275   119928    124100
20       243066   243365    246432
30       359616   342241    357310
40       431375   415310    441619
50       489991   489896    500590
60       538057   636473    554069
70       588659   714426    738535
80       405008   923039    902632
90       295443   1181247   1155918
100      258695   1323125   1325019
110      238842   1393767   1410274
120      226018   1432504   1474982
130      215102   1465459   1503241
140      206415   1470454   1505380
150      197850   1475479   1519908
160      190935   1420915   1484868
170      185835   1438965   1453128
180      182519   1416252   1453098

My conclusions are the following:
1) We don't observe any regression in v5 in comparison to master.
2) v5 in most cases slightly outperforms v3.

What commit did you base these tests on? I guess something recent, after
98a64d0bd?

Yes, more recent than 98a64d0bd. It was based on 676265eb7b.

I'm going to do some code cleanup of v5 on Monday

Ok, I'll try to do a review and possibly commit after that.

Sounds good.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#83Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#81)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Mar 27, 2016 at 5:48 PM, Andres Freund <andres@anarazel.de> wrote:

What's sizeof(BufferDesc) after applying these patches? It had better
be <= 64...

It is 72.
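
(At 72 bytes, descriptors stored in an array straddle cache lines, so two
backends pinning adjacent buffers can false-share a line. A condensed sketch
of the usual remedy, padding each descriptor to a full line through a union;
the Toy* names are simplified stand-ins for PostgreSQL's BufferDescPadded
arrangement:

#include <stdalign.h>
#include <stdint.h>

#define TOY_PAD_TO_SIZE 64		/* one cache line on most current CPUs */

typedef struct ToyBufferDesc
{
	uint32_t	state;			/* flags | usage count | refcount */
	int			buf_id;
	int			wait_backend_pid;
} ToyBufferDesc;

/*
 * Pad every descriptor to a full cache line so that array element i+1
 * never shares a line with element i; the array itself must be
 * line-aligned too for the padding to match real cache lines.
 */
typedef union ToyBufferDescPadded
{
	ToyBufferDesc	bufferdesc;
	char			pad[TOY_PAD_TO_SIZE];
} ToyBufferDescPadded;

static alignas(64) ToyBufferDescPadded ToyDescriptors[128];

#define GetToyBufferDescriptor(id) (&ToyDescriptors[(id)].bufferdesc)
)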

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#84Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#82)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Mar 27, 2016 at 4:31 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sun, Mar 27, 2016 at 3:10 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-27 12:38:25 +0300, Alexander Korotkov wrote:

On Sat, Mar 26, 2016 at 1:26 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

Thank you very much for testing!
I also got access to a 4 x 18-core Intel server with 144 threads. I'm going
to post results of tests on this server next Monday.

I've run pgbench tests on this machine: pgbench -s 1000 -c $clients -j 100
-M prepared -T 300.
See results in the table and chart.

clients  master   v3        v5
1        11671    12507     12679
2        24650    26005     25010
4        49631    48863     49811
8        96790    96441     99946
10       121275   119928    124100
20       243066   243365    246432
30       359616   342241    357310
40       431375   415310    441619
50       489991   489896    500590
60       538057   636473    554069
70       588659   714426    738535
80       405008   923039    902632
90       295443   1181247   1155918
100      258695   1323125   1325019
110      238842   1393767   1410274
120      226018   1432504   1474982
130      215102   1465459   1503241
140      206415   1470454   1505380
150      197850   1475479   1519908
160      190935   1420915   1484868
170      185835   1438965   1453128
180      182519   1416252   1453098

My conclusions are the following:
1) We don't observe any regression in v5 in comparison to master.
2) v5 in most cases slightly outperforms v3.

What commit did you base these tests on? I guess something recent, after
98a64d0bd?

Yes, more recent than 98a64d0bd. It was based on 676265eb7b.

I'm going to do some code cleanup of v5 on Monday

Ok, I'll try to do a review and possibly commit after that.

Sounds good.

Here is the next revision of the patch. It contains mostly cosmetic changes.
Comments are adjusted to reflect the changes in behaviour.
I also changed the atomic AND/OR for local buffers to a plain read/write
pair, which I suppose would be cheaper. However, I don't insist on it, and
it could be reverted.
The patch is ready for your review. It would be especially interesting to
hear what you think about the way I abstracted the exponential back-off of
the spinlock.
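
(For readers skimming the patch below, a condensed, self-contained sketch of
the pin fast path it implements: a compare-and-swap loop over the combined
state word that spins only while the header-lock bit is set. The names and
bit layout here are simplified stand-ins for the patch's actual macros, and
the exponential back-off is reduced to a bare reload:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define BM_LOCKED			(1u << 31)	/* buffer header lock bit */
#define BM_VALID			(1u << 30)	/* buffer contains valid data */
#define BUF_REFCOUNT_ONE	1u			/* refcount occupies the low bits */

typedef struct ToyBufferDesc
{
	_Atomic uint32_t state;		/* flags | usage count | refcount */
} ToyBufferDesc;

/*
 * Pin without taking the header lock.  While the lock bit is set, the
 * holder may rewrite the state word with a plain store, so we must not
 * CAS against it; once it is clear, one successful CAS publishes the new
 * refcount.  Usage-count maintenance and the back-off are elided.
 */
static bool
toy_pin(ToyBufferDesc *buf)
{
	uint32_t	oldstate = atomic_load(&buf->state);

	for (;;)
	{
		uint32_t	newstate;

		/* spin-wait until the header lock is free */
		while (oldstate & BM_LOCKED)
			oldstate = atomic_load(&buf->state);	/* back-off goes here */

		newstate = oldstate + BUF_REFCOUNT_ONE;		/* bump shared refcount */

		/* on failure, 'oldstate' is refreshed with the current value */
		if (atomic_compare_exchange_weak(&buf->state, &oldstate, newstate))
			return (newstate & BM_VALID) != 0;		/* commonly one CAS */
	}
}

The BM_LOCKED check is what rules out a plain atomic fetch-add: as the
patch's UnpinBuffer comment notes, a header-lock holder may write the whole
state word back with a plain store, so an unconditional increment could be
lost.)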

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-6.patchapplication/octet-stream; name=pinunpin-cas-6.patchDownload
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..35b5ee9
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,175 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
--- 161,176 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index bfa37f1..a5cffc7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 135,146 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 135,143 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..fe6fb9c
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 52,58 ****
  #include "utils/resowner_private.h"
  #include "utils/timestamp.h"
  
- 
  /* Note: these two macros only work on shared buffers, not local ones! */
  #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
  #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
--- 52,57 ----
*************** static BufferDesc *PinCountWaitBuf = NUL
*** 163,169 ****
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a spinlock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
--- 162,168 ----
   * entry using ReservePrivateRefCountEntry() and then later, if necessary,
   * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
   * memory allocations in NewPrivateRefCountEntry() which can be important
!  * because in some scenarios it's called with a header lock held...
   */
  static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
  static HTAB *PrivateRefCountHash = NULL;
*************** static int	SyncOneBuffer(int buf_id, boo
*** 440,446 ****
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  int set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 439,445 ----
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  uint32 set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 815,823 ****
  		 */
  		if (isLocalBuf)
  		{
! 			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 814,827 ----
  		 */
  		if (isLocalBuf)
  		{
! 			/*
! 			 * Since it's a local buffer, there is no concurrency.  We assume
! 			 * a read/write pair to be cheaper than an atomic AND.
! 			 */
! 			uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 			Assert(state & BM_VALID);
! 			state &= ~BM_VALID;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  			 */
  			do
  			{
! 				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
--- 832,841 ----
  			 */
  			do
  			{
! 				uint32 state = LockBufHdr(bufHdr);
! 				Assert(state & BM_VALID);
! 				state &= ~(BM_VALID | BM_LOCKED);
! 				pg_atomic_write_u32(&bufHdr->state, state);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 848,854 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 852,858 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 932,939 ****
  
  	if (isLocalBuf)
  	{
! 		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 936,948 ----
  
  	if (isLocalBuf)
  	{
! 		/*
! 		 * Only need to adjust flags.  Since it's a local buffer, there is no
! 		 * concurrency.  We assume a read/write pair to be cheaper than an atomic OR.
! 		 */
! 		uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 		state |= BM_VALID;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 987,996 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
--- 996,1006 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
+ 	uint32		state;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1050,1072 ****
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the spinlock's not yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
! 		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
! 		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
  
  		/*
--- 1060,1082 ----
  	for (;;)
  	{
  		/*
! 		 * Ensure, while the header lock isn't yet held, that there's a free
  		 * refcount entry.
  		 */
  		ReservePrivateRefCountEntry();
  
  		/*
  		 * Select a victim buffer.  The buffer is returned with its header
! 		 * lock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
! 		/* Must copy buffer flags while we still hold the header lock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
! 		/* Pin the buffer and then release the buffer header lock */
  		PinBuffer_Locked(buf);
  
  		/*
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1254,1260 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1264,1270 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1262,1269 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
--- 1272,1279 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
  		UnlockBufHdr(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1284,1297 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
! 	UnlockBufHdr(buf);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
--- 1294,1308 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	state &= ~(BM_VALID | BM_LOCKED | BM_DIRTY | BM_JUST_DIRTIED |
! 			   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
! 			   BUF_USAGECOUNT_MASK);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
  	else
! 		state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  
! 	pg_atomic_write_u32(&buf->state, state);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1319,1325 ****
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header spinlock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
--- 1330,1336 ----
   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
   * freelist.
   *
!  * The buffer header lock must be held at entry.  We drop it before
   * returning.  (This is sane because the caller must have locked the
   * buffer in order to be sure it should be dropped.)
   *
*************** InvalidateBuffer(BufferDesc *buf)
*** 1338,1346 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
! 	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
--- 1349,1358 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
! 	/* Save the original buffer tag before dropping the header lock */
  	oldTag = buf->tag;
  
  	UnlockBufHdr(buf);
*************** retry:
*** 1362,1368 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
--- 1374,1380 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
*************** retry:
*** 1381,1387 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
--- 1393,1399 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
  		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
*************** retry:
*** 1396,1407 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
! 
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
--- 1408,1417 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
! 	pg_atomic_write_u32(&buf->state, state);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
*************** void
*** 1433,1438 ****
--- 1443,1449 ----
  MarkBufferDirty(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1449,1462 ****
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
--- 1460,1473 ----
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	state = LockBufHdr(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(state & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
*************** MarkBufferDirty(Buffer buffer)
*** 1464,1472 ****
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 
! 	UnlockBufHdr(bufHdr);
  }
  
  /*
--- 1475,1483 ----
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  
! 	state |= BM_DIRTY | BM_JUST_DIRTIED;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  }
  
  /*
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1506,1512 ****
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without spinlock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
--- 1517,1523 ----
  		else
  		{
  			bufHdr = GetBufferDescriptor(buffer - 1);
! 			/* we have pin, so it's ok to examine tag without header lock */
  			if (bufHdr->tag.blockNum == blockNum &&
  				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
  				bufHdr->tag.forkNum == forkNum)
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1531,1540 ****
   *
   * This should be applied only to shared buffers, never local ones.
   *
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra spinlock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
--- 1542,1556 ----
   *
   * This should be applied only to shared buffers, never local ones.
   *
+  * Since buffers are pinned/unpinned very frequently, this function tries
+  * to pin the buffer as cheaply as possible.  This is why we don't take the
+  * buffer header lock here, but update the state variable in a loop of CAS
+  * operations.  Hopefully, it will be just a single CAS.
+  *
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
!  * some callers to avoid an extra header lock cycle.
   */
  static bool
  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1547,1569 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
  		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
  		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1563,1606 ----
  
  	if (ref == NULL)
  	{
+ 		/* loop of CAS operations */
+ 		uint32			state;
+ 		uint32			oldstate;
+ 		SpinDelayStatus	delayStatus;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
! 
! 		init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				make_spin_delay(&delayStatus);
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
! 
! 			/* increase refcount */
! 			state += BUF_REFCOUNT_ONE;
! 
! 			/* increase usagecount unless already max */
! 			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 				state += BUF_USAGECOUNT_ONE;
! 
! 			/* try to do CAS, exit on success */
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		finish_spin_delay(&delayStatus);
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1579,1587 ****
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The spinlock is released before return.
   *
!  * As this function is called with the spinlock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
--- 1616,1624 ----
  
  /*
   * PinBuffer_Locked -- as above, but caller already locked the buffer header.
!  * The header lock is released before return.
   *
!  * As this function is called with the header lock held, the caller has to
   * previously call ReservePrivateRefCountEntry().
   *
   * Currently, no callers of this function want to modify the buffer's
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1592,1598 ****
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * spinlock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
--- 1629,1635 ----
   * Also all callers only ever use this function when it's known that the
   * buffer can't have a preexisting pin by this backend. That allows us to skip
   * searching the private refcount array & hash, which is a boon, because the
!  * header lock is still held.
   *
   * Note: use of this routine is frequently mandatory, not just an optimization
   * to save a spin lock/unlock cycle, because we need to pin a buffer before
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1603,1617 ****
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the spinlock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
! 	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
  
--- 1640,1661 ----
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
+ 	uint32		state;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
! 	 * manipulate the PrivateRefCount after releasing the header lock
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	/*
! 	 * Since we hold the buffer header lock, we can update the buffer
! 	 * state in a single write operation.
! 	 */
! 	state = pg_atomic_read_u32(&buf->state);
! 	state += 1;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	b = BufferDescriptorGetBuffer(buf);
  
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1646,1675 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
  
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
! 		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
  		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1690,1755 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32			state;
+ 		uint32			oldstate;
+ 		SpinDelayStatus	delayStatus;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Since a buffer header lock holder can update the state with a plain
! 		 * write, it's not safe to use an atomic decrement here.  We do a loop
! 		 * of CAS operations just like PinBuffer does.
! 		 */
  
! 		state = pg_atomic_read_u32(&buf->state);
! 		oldstate = state;
  
! 		init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);
! 		while (true)
  		{
! 			/* spin-wait till lock is free */
! 			while (state & BM_LOCKED)
! 			{
! 				make_spin_delay(&delayStatus);
! 				state = pg_atomic_read_u32(&buf->state);
! 				oldstate = state;
! 			}
  
! 			/* decrease refcount */
! 			Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 			state -= 1;
! 
! 			/* try to do CAS, exit on success */
! 			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
! 				break;
! 
! 			/* get ready for next loop, oldstate has been updated by cas */
! 			state = oldstate;
  		}
! 		finish_spin_delay(&delayStatus);
! 
! 		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
! 		{
! 			state = LockBufHdr(buf);
! 
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				state &= ~(BM_PIN_COUNT_WAITER | BM_LOCKED);
+ 				pg_atomic_write_u32(&buf->state, state);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1687,1692 ****
--- 1767,1773 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_spaces;
*************** BufferSync(int flags)
*** 1733,1748 ****
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
--- 1814,1829 ----
  		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  
  		/*
! 		 * Header lock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			state |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
*************** BufferSync(int flags)
*** 1752,1758 ****
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr);
  	}
  
  	if (num_to_scan == 0)
--- 1833,1840 ----
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	if (num_to_scan == 0)
*************** BufferSync(int flags)
*** 1888,1894 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
--- 1970,1976 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2258,2263 ****
--- 2340,2346 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  	BufferTag	tag;
  
  	ReservePrivateRefCountEntry();
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2271,2280 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
--- 2354,2366 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2282,2288 ****
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
--- 2368,2374 ----
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
  		UnlockBufHdr(bufHdr);
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2439,2444 ****
--- 2525,2531 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2456,2467 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2543,2555 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** BufferGetBlockNumber(Buffer buffer)
*** 2516,2522 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	return bufHdr->tag.blockNum;
  }
  
--- 2604,2610 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	return bufHdr->tag.blockNum;
  }
  
*************** BufferGetTag(Buffer buffer, RelFileNode 
*** 2539,2545 ****
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without spinlock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
--- 2627,2633 ----
  	else
  		bufHdr = GetBufferDescriptor(buffer - 1);
  
! 	/* pinned, so OK to read tag without lock */
  	*rnode = bufHdr->tag.rnode;
  	*forknum = bufHdr->tag.forkNum;
  	*blknum = bufHdr->tag.blockNum;
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2573,2578 ****
--- 2661,2667 ----
  				io_time;
  	Block		bufBlock;
  	char	   *bufToWrite;
+ 	uint32		state;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2598,2604 ****
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
--- 2687,2693 ----
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	state = LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2607,2614 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
--- 2696,2703 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	state &= ~(BM_JUST_DIRTIED | BM_LOCKED);
! 	pg_atomic_write_u32(&buf->state, state);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2627,2633 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2716,2722 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (state & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2715,2727 ****
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header flags while we're doing this, we assume that
! 	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
! 	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2804,2816 ----
  
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
! 	 * need not bother with the buffer header lock.  Even if someone else
! 	 * changes the buffer header state while we're doing this, the state is
! 	 * changed atomically, so we'll read the old value or the new value, but
! 	 * not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2821,2827 ****
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 2910,2916 ----
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2919,2925 ****
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 3008,3014 ----
  
  		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** DropDatabaseBuffers(Oid dbid)
*** 2961,2967 ****
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
--- 3050,3056 ----
  
  		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
! 			InvalidateBuffer(bufHdr);	/* releases lock */
  		else
  			UnlockBufHdr(bufHdr);
  	}
*************** FlushRelationBuffers(Relation rel)
*** 3057,3063 ****
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 3146,3153 ----
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(pg_atomic_read_u32(&bufHdr->state) & (BM_VALID | BM_DIRTY))
! 				== (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 3078,3084 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 3168,3174 ----
  						  localpage,
  						  false);
  
! 				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 3093,3098 ****
--- 3183,3190 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32 state;
+ 
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushRelationBuffers(Relation rel)
*** 3104,3112 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3196,3204 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(state & (BM_VALID | BM_DIRTY))	== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 3145,3150 ****
--- 3237,3243 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32	state;
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushDatabaseBuffers(Oid dbid)
*** 3156,3164 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3249,3257 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3290,3308 ****
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring spinlock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3383,3402 ----
  	 * This routine might get called many times on the same page, if we are
  	 * making the first scan after commit of an xact that added/deleted many
  	 * tuples. So, be as quick as we can if the buffer is already dirty.  We
! 	 * do this by not acquiring the header lock if it looks like the status bits are
  	 * already set.  Since we make this test unlocked, there's a chance we
  	 * might fail to notice that the flags have just been cleared, and failed
  	 * to reset them, due to memory-ordering issues.  But since this function
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3313,3319 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3407,3413 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3352,3360 ****
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3446,3456 ----
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3374,3381 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
--- 3470,3479 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		state |= BM_DIRTY | BM_JUST_DIRTIED;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
*************** UnlockBuffers(void)
*** 3406,3422 ****
  
  	if (buf)
  	{
! 		LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf);
  
  		PinCountWaitBuf = NULL;
  	}
--- 3504,3523 ----
  
  	if (buf)
  	{
! 		uint32	state;
! 
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
  
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  
  		PinCountWaitBuf = NULL;
  	}
*************** LockBufferForCleanup(Buffer buffer)
*** 3509,3535 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
- 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
! 		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
--- 3610,3640 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
  			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
  			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
  		PinCountWaitBuf = bufHdr;
! 		state |= BM_PIN_COUNT_WAITER;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
*************** LockBufferForCleanup(Buffer buffer)
*** 3558,3568 ****
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
--- 3663,3674 ----
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		state = LockBufHdr(bufHdr);
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
*************** bool
*** 3603,3624 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3709,3734 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount > 0);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3626,3634 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
--- 3736,3746 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
  		UnlockBufHdr(bufHdr);
*************** WaitIO(BufferDesc *buf)
*** 3666,3682 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
! 		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
  		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3778,3794 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
! 		 * It may not be necessary to acquire the header lock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
  		UnlockBufHdr(buf);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3704,3709 ****
--- 3816,3823 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3714,3722 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3828,3836 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3732,3738 ****
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
--- 3846,3852 ----
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
  		UnlockBufHdr(buf);
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3740,3748 ****
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
! 
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
--- 3854,3862 ----
  		return false;
  	}
  
! 	state |= BM_IO_IN_PROGRESS;
! 	state &= ~BM_LOCKED;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3768,3786 ****
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = NULL;
  
--- 3882,3903 ----
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
  
! 	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR | BM_LOCKED);
! 	if (clear_dirty && !(state & BM_JUST_DIRTIED))
! 		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 
! 	state |= set_flag_bits;
! 	pg_atomic_write_u32(&buf->state, state);
  
  	InProgressBuf = NULL;
  
*************** AbortBufferIO(void)
*** 3803,3808 ****
--- 3920,3926 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3811,3836 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
--- 3929,3952 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
  			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
  			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
! 				/* Buffer is pinned, so we can read tag without header lock */
  				char	   *path;
  
  				path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
*************** shared_buffer_write_error_callback(void 
*** 3854,3860 ****
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the spinlock */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
--- 3970,3976 ----
  {
  	BufferDesc *bufHdr = (BufferDesc *) arg;
  
! 	/* Buffer is pinned, so we can read the tag without locking the header */
  	if (bufHdr != NULL)
  	{
  		char	   *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
*************** rnode_comparator(const void *p1, const v
*** 3912,3917 ****
--- 4028,4075 ----
  }
  
  /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(volatile BufferDesc *desc)
+ {
+ 	SpinDelayStatus	delayStatus;
+ 	uint32			state;
+ 
+ 	init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__);
+ 
+ 	state = pg_atomic_read_u32(&desc->state);
+ 
+ 	for (;;)
+ 	{
+ 		/* wait till lock is free */
+ 		while (state & BM_LOCKED)
+ 		{
+ 			make_spin_delay(&delayStatus);
+ 			state = pg_atomic_read_u32(&desc->state);
+ 			/* Add exponential backoff? Should seldom be contended, though. */
+ 		}
+ 
+ 		/* and try to get lock */
+ 		if (pg_atomic_compare_exchange_u32(&desc->state, &state, state | BM_LOCKED))
+ 			break;
+ 	}
+ 	finish_spin_delay(&delayStatus);
+ 	return state | BM_LOCKED;
+ }
+ 
+ /*
+  * Unlock buffer header - unset BM_LOCKED in buffer state.
+  */
+ void
+ UnlockBufHdr(volatile BufferDesc *desc)
+ {
+ 	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+ 
+ 	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+ }
+ 
+ /*
   * BufferTag comparator.
   */
  static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..8b99824
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,289 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
  			UnlockBufHdr(buf);
--- 282,294 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  			UnlockBufHdr(buf);
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 327,336 ****
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			UnlockBufHdr(buf);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		UnlockBufHdr(buf);
  	}
  }
  
--- 336,347 ----
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			state &= ~BM_LOCKED;
! 			pg_atomic_write_u32(&buf->state, state);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  	}
  }
  
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 596,606 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,625 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
  	UnlockBufHdr(buf);
--- 628,639 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
  	UnlockBufHdr(buf);
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index cc0bf5e..a2edf00
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
! 	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
  
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,150 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line)
! {
! 	status->spins = 0;
! 	status->delays = 0;
! 	status->cur_delay = 0;
! 	status->ptr = ptr;
! 	status->file = file;
! 	status->line = line;
! }
  
! void
! make_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
! 	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 155,180 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus;
! 
! 	init_spin_delay(&delayStatus, (Pointer)lock, file, line);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		make_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index d04363b..2cfacd8
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 21,49 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 21,69 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable where the following data are combined:
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such a layout allows us to perform some operations more efficiently.
+  * The definitions of the buffer state parts are below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 113,130 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 133,161 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: The buffer header lock (BM_LOCKED flag) must be held to examine or
!  * change the tag, state or wait_backend_pid fields.  The buffer header lock
!  * is a spinlock which is combined with the flags, refcount and usagecount
!  * into a single atomic variable.  This layout allows us to do some operations
!  * in a single CAS, without actually acquiring and releasing the spinlock; for
!  * instance, increasing or decreasing the refcount.  The buf_id field never
!  * changes after initialization, so it does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock, not the buffer header lock.  The
!  * LWLock can take care of itself.  The buffer header lock is *not* used to
!  * control access to the data in the buffer!
!  *
!  * It is assumed that nobody changes the state field while the buffer header
!  * lock is held.  Thanks to that, the lock holder can do complex updates of
!  * the state variable in a single write, simultaneously with the lock release
!  * (clearing the BM_LOCKED flag).  On the other hand, updating the state without
!  * holding the buffer header lock is restricted to a CAS which ensures that
!  * BM_LOCKED is not set.  Atomic increment/decrement, OR/AND etc. are not allowed.
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 142,153 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
--- 173,183 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 202,212 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /*
--- 232,242 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's lock.  Do
!  * not apply these to local buffers!
   */
! extern uint32 LockBufHdr(volatile BufferDesc *desc);
! extern void UnlockBufHdr(volatile BufferDesc *desc);
  
  
  /*
*************** extern void IssuePendingWritebacks(Write
*** 267,273 ****
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 297,304 ----
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 8b240cd..cc6c195
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1012 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay which could be useful in other places where
+  * spinlock-like procedures take place.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ extern void init_spin_delay(SpinDelayStatus *status, Pointer ptr, const char *file, int line);
+ extern void make_spin_delay(SpinDelayStatus *status);
+ extern void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
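
Before the review discussion below, it may help to restate the protocol the
patch establishes: refcount, usage count and flags live in a single 32-bit
state word; the header lock is just the BM_LOCKED bit inside that word; the
lock holder may rewrite the whole word and clear BM_LOCKED in one plain write;
everyone else must go through a CAS that observed BM_LOCKED clear.  What
follows is a minimal standalone sketch of that protocol using C11
<stdatomic.h> — the names, bit assignments and plain-atomics usage are
illustrative assumptions for the sketch, not the patch's pg_atomic_* API.

/*
 * Minimal sketch of the combined state word.  The bit layout mirrors
 * buf_internals.h above: refcount in bits 0-17, usage count in bits
 * 18-21, flags in bits 22-31.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define REFCOUNT_MASK	((1u << 18) - 1)
#define FLAG_LOCKED		(1u << 22)
#define FLAG_DIRTY		(1u << 23)

static _Atomic uint32_t buf_state;

/* Spin until we manage to set FLAG_LOCKED; return the locked state word. */
static uint32_t
lock_hdr(void)
{
	uint32_t	s = atomic_load(&buf_state);

	for (;;)
	{
		while (s & FLAG_LOCKED)
			s = atomic_load(&buf_state);	/* real code would spin-delay here */
		/* the CAS succeeds only if the lock bit is still clear */
		if (atomic_compare_exchange_weak(&buf_state, &s, s | FLAG_LOCKED))
			return s | FLAG_LOCKED;
	}
}

/* Lock holder publishes its flag changes and the release in one write. */
static void
set_dirty_and_unlock(void)
{
	uint32_t	s = lock_hdr();

	s |= FLAG_DIRTY;
	s &= ~FLAG_LOCKED;
	atomic_store(&buf_state, s);
}

/* Unlocked update: a CAS that refuses to race the lock holder. */
static void
pin(void)
{
	uint32_t	s = atomic_load(&buf_state);

	for (;;)
	{
		while (s & FLAG_LOCKED)
			s = atomic_load(&buf_state);	/* wait out the lock holder */
		if (atomic_compare_exchange_weak(&buf_state, &s, s + 1))
			return;				/* refcount incremented */
	}
}

int
main(void)
{
	atomic_init(&buf_state, 0);
	pin();
	set_dirty_and_unlock();
	assert((atomic_load(&buf_state) & REFCOUNT_MASK) == 1);
	assert(atomic_load(&buf_state) & FLAG_DIRTY);
	printf("final state = 0x%08x\n", (unsigned) atomic_load(&buf_state));
	return 0;
}

The point of the layout shows in set_dirty_and_unlock(): the flag update and
the lock release together cost one store, rather than a store plus a separate
spinlock release.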
#85Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#83)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-28 11:48:46 +0530, Dilip Kumar wrote:

On Sun, Mar 27, 2016 at 5:48 PM, Andres Freund <andres@anarazel.de> wrote:

What's sizeof(BufferDesc) after applying these patches? It should better
be <= 64...

It is 72.

Ah yes, miscalculated the required alignment. Hm. So we've got to get this
smaller. I see three approaches:

1) Reduce the spinlock size on ppc. That actually might just work by
replacing "unsigned int" by "unsigned char"
2) Replace the lwlock spinlock by a bit in LWLock->state. That'd avoid
embedding the spinlock, and actually might allow us to avoid one atomic
op in a number of cases.
3) Shrink the size of BufferDesc by removing buf_id; that'd bring it to
64byte.

I'm a bit hesitant to go for 3), because it'd likely end up adding a bit
of arithmetic to a number of places in bufmgr.c. Robert, what do you
think?

Andres


#86Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#84)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-28 15:46:43 +0300, Alexander Korotkov wrote:

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..fe6fb9c
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -52,7 +52,6 @@
#include "utils/resowner_private.h"
#include "utils/timestamp.h"

-
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))

spurious

@@ -848,7 +852,7 @@ ReadBuffer_common(SMgrRelation smgr, cha
* it's not been recycled) but come right back here to try smgrextend
* again.
*/
-	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
+	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */

bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);

@@ -932,8 +936,13 @@ ReadBuffer_common(SMgrRelation smgr, cha

if (isLocalBuf)
{
-		/* Only need to adjust flags */
-		bufHdr->flags |= BM_VALID;
+		/*
+		 * Only need to adjust flags.  Since it's local buffer, there is no
+		 * concurrency.  We assume read/write pair to be cheaper than atomic OR.
+		 */
+		uint32 state = pg_atomic_read_u32(&bufHdr->state);
+		state |= BM_VALID;
+		pg_atomic_write_u32(&bufHdr->state, state);

I'm not a fan of repeating that comment in multiple places. Hm.

}
else
{
@@ -987,10 +996,11 @@ BufferAlloc(SMgrRelation smgr, char relp
BufferTag oldTag; /* previous identity of selected buffer */
uint32 oldHash; /* hash value for oldTag */
LWLock *oldPartitionLock; /* buffer partition lock for it */
- BufFlags oldFlags;
+ uint32 oldFlags;
int buf_id;
BufferDesc *buf;
bool valid;
+ uint32 state;

/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
@@ -1050,23 +1060,23 @@ BufferAlloc(SMgrRelation smgr, char relp
for (;;)
{
/*
-		 * Ensure, while the spinlock's not yet held, that there's a free
+		 * Ensure, while the header lock isn't yet held, that there's a free
* refcount entry.
*/
ReservePrivateRefCountEntry();
/*
* Select a victim buffer.  The buffer is returned with its header
-		 * spinlock still held!
+		 * lock still held!
*/
-		buf = StrategyGetBuffer(strategy);
+		buf = StrategyGetBuffer(strategy, &state);

The new thing really still is a spinlock, not sure if it's worth
changing the comments that way.

@@ -1319,7 +1330,7 @@ BufferAlloc(SMgrRelation smgr, char relp
* InvalidateBuffer -- mark a shared buffer invalid and return it to the
* freelist.
*
- * The buffer header spinlock must be held at entry.  We drop it before
+ * The buffer header lock must be held at entry.  We drop it before
* returning.  (This is sane because the caller must have locked the
* buffer in order to be sure it should be dropped.)
*

Ok, by now I'm pretty sure that I don't want this to be changed
everywhere, just makes reviewing harder.

@@ -1433,6 +1443,7 @@ void
MarkBufferDirty(Buffer buffer)
{
BufferDesc *bufHdr;
+ uint32 state;

if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer ID: %d", buffer);
@@ -1449,14 +1460,14 @@ MarkBufferDirty(Buffer buffer)
/* unfortunately we can't check if the lock is held exclusively */
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));

-	LockBufHdr(bufHdr);
+	state = LockBufHdr(bufHdr);
-	Assert(bufHdr->refcount > 0);
+	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);

/*
* If the buffer was not dirty already, do vacuum accounting.
*/
- if (!(bufHdr->flags & BM_DIRTY))
+ if (!(state & BM_DIRTY))
{
VacuumPageDirty++;
pgBufferUsage.shared_blks_dirtied++;
@@ -1464,9 +1475,9 @@ MarkBufferDirty(Buffer buffer)
VacuumCostBalance += VacuumCostPageDirty;
}

-	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-
-	UnlockBufHdr(bufHdr);
+	state |= BM_DIRTY | BM_JUST_DIRTIED;
+	state &= ~BM_LOCKED;
+	pg_atomic_write_u32(&bufHdr->state, state);
}

Hm, this is a routine I think we should avoid taking the lock in; it's
often called quite frequently. Also doesn't look very hard.
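
Roughly something like the following untested sketch, i.e. a plain CAS loop
(spinning while BM_LOCKED is set, since a header lock holder may overwrite
the state with a plain write):

	for (;;)
	{
		uint32		oldstate = pg_atomic_read_u32(&bufHdr->state);
		uint32		newstate;

		if (oldstate & BM_LOCKED)
			continue;			/* wait for the header lock holder */

		newstate = oldstate | BM_DIRTY | BM_JUST_DIRTIED;
		if (newstate == oldstate)
			break;				/* both flags already set, nothing to do */
		if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, newstate))
			break;
	}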

static bool
PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
@@ -1547,23 +1563,44 @@ PinBuffer(BufferDesc *buf, BufferAccessS

if (ref == NULL)
{
+		/* loop of CAS operations */
+		uint32			state;
+		uint32			oldstate;
+		SpinDelayStatus	delayStatus;

ReservePrivateRefCountEntry();
ref = NewPrivateRefCountEntry(b);

-		LockBufHdr(buf);
-		buf->refcount++;
-		if (strategy == NULL)
-		{
-			if (buf->usage_count < BM_MAX_USAGE_COUNT)
-				buf->usage_count++;
-		}
-		else
+		state = pg_atomic_read_u32(&buf->state);
+		oldstate = state;
+
+		init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);

Hm. This way we're calling this on every iteration. That doesn't seem
like a good idea. How about making delayStatus a static, and
init_spin_delay a macro which returns a {struct, member, one, two} type
literal?
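
I.e. roughly this (a sketch, using the SpinDelayStatus fields added in the
earlier patch):

#define init_spin_delay(ptr) \
	((SpinDelayStatus) {0, 0, 0, (ptr), __FILE__, __LINE__})

which would then be used as

	SpinDelayStatus delayStatus = init_spin_delay((Pointer) buf);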

+		while (true)
{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
+			/* spin-wait till lock is free */
+			while (state & BM_LOCKED)
+			{
+				make_spin_delay(&delayStatus);
+				state = pg_atomic_read_u32(&buf->state);
+				oldstate = state;
+			}

Maybe we should abstract that to pg_atomic_wait_bit_unset_u32()? It
seems quite likely we need this in other places (e.g. lwlock.c itself).
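
Something shaped like this, perhaps (hypothetical, not in the tree; the
spin-delay bookkeeping is elided):

static inline uint32
pg_atomic_wait_bit_unset_u32(pg_atomic_uint32 *var, uint32 mask)
{
	uint32		state = pg_atomic_read_u32(var);

	while (state & mask)
	{
		/* make_spin_delay() would go here */
		state = pg_atomic_read_u32(var);
	}
	return state;
}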

+			/* increase refcount */
+			state += BUF_REFCOUNT_ONE;
+
+			/* increase usagecount unless already max */
+			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
+				state += BUF_USAGECOUNT_ONE;
+
+			/* try to do CAS, exit on success */

Seems like a somewhat obvious comment?

@@ -1603,15 +1640,22 @@ PinBuffer_Locked(BufferDesc *buf)
{

Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);

-	buf->refcount++;
-	UnlockBufHdr(buf);
+	/*
+	 * Since we assume to held buffer header lock, we can update the buffer
+	 * state in a single write operation.
+	 */
+	state = pg_atomic_read_u32(&buf->state);
+	state += 1;
+	state &= ~BM_LOCKED;
+	pg_atomic_write_u32(&buf->state, state);

The comment should probably mention that we're releasing the
spinlock. And the += 1 should be a BUF_REFCOUNT_ONE, otherwise it's hard
to understand.

@@ -1646,30 +1690,66 @@ UnpinBuffer(BufferDesc *buf, bool fixOwn
ref->refcount--;
if (ref->refcount == 0)
{

+
+		/* Support LockBufferForCleanup() */
+		if (state & BM_PIN_COUNT_WAITER)
+		{
+			state = LockBufHdr(buf);
+
+			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
+			{
+				/* we just released the last pin other than the waiter's */
+				int			wait_backend_pid = buf->wait_backend_pid;
+				state &= ~(BM_PIN_COUNT_WAITER | BM_LOCKED);
+				pg_atomic_write_u32(&buf->state, state);
+				ProcSendSignal(wait_backend_pid);
+			}
+			else
+				UnlockBufHdr(buf);
+		}

I think it's quite confusing to use UnlockBufHdr and direct bit
expressions in one branch.

Thinking about it I also don't think the pg_atomic_write_u32 variant is
correct without adding a write barrier; the other side might not see the
values yet.

I think we can just redefine UnlockBufHdr() to be a
pg_atomic_write_u32() and pg_write_memory_barrier()?
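
I.e. something like this sketch (pg_write_barrier() being the existing
write-barrier primitive; it also takes the new state as a parameter, as
suggested further down):

#define UnlockBufHdr(desc, s) \
	do { \
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & ~BM_LOCKED); \
	} while (0)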

* BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
-	 * need not bother with the buffer header spinlock.  Even if someone else
+	 * need not bother with the buffer header lock.  Even if someone else
* changes the buffer header flags while we're doing this, we assume that
* changing an aligned 2-byte BufFlags value is atomic, so we'll read the
* old value or the new value, but not random garbage.
*/

The rest of the comment is outdated, BufFlags isn't a 2 byte value
anymore.

@@ -3078,7 +3168,7 @@ FlushRelationBuffers(Relation rel)
localpage,
false);

-				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+				pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));

Hm, in other places you replaced atomics on local buffers with plain
writes.

lsn = XLogSaveBufferForHint(buffer, buffer_std);
}

-		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (!(bufHdr->flags & BM_DIRTY))
+		state = LockBufHdr(bufHdr);
+
+		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+
+		if (!(state & BM_DIRTY))
{
dirtied = true;		/* Means "will be dirtied by this action" */

It's again worthwhile to try to make this work without taking the lock.

-	buf->flags |= BM_IO_IN_PROGRESS;
-
-	UnlockBufHdr(buf);
+	state |= BM_IO_IN_PROGRESS;
+	state &= ~BM_LOCKED;
+	pg_atomic_write_u32(&buf->state, state);

How about making UnlockBufHdr() take a new state parameter, and
internally unset BM_LOCKED?

/*
+ * Lock buffer header - set BM_LOCKED in buffer state.
+ */
+uint32
+LockBufHdr(volatile BufferDesc *desc)
+{
+	SpinDelayStatus	delayStatus;
+	uint32			state;
+
+	init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__);
+
+	state = pg_atomic_read_u32(&desc->state);
+
+	for (;;)
+	{
+		/* wait till lock is free */
+		while (state & BM_LOCKED)
+		{
+			make_spin_delay(&delayStatus);
+			state = pg_atomic_read_u32(&desc->state);
+			/* Add exponential backoff? Should seldomly be contended tho. */

Outdated comment.

+/*
+ * Unlock buffer header - unset BM_LOCKED in buffer state.
+ */
+void
+UnlockBufHdr(volatile BufferDesc *desc)
+{
+	Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+
+	pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+}

As suggested above, there's likely no need to use an actual atomic op
here.

Greetings,

Andres Freund


#87Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Andres Freund (#86)
Re: Move PinBuffer and UnpinBuffer to atomics

Andres Freund wrote:

On 2016-03-28 15:46:43 +0300, Alexander Korotkov wrote:

@@ -932,8 +936,13 @@ ReadBuffer_common(SMgrRelation smgr, cha

if (isLocalBuf)
{
-		/* Only need to adjust flags */
-		bufHdr->flags |= BM_VALID;
+		/*
+		 * Only need to adjust flags.  Since it's local buffer, there is no
+		 * concurrency.  We assume read/write pair to be cheaper than atomic OR.
+		 */
+		uint32 state = pg_atomic_read_u32(&bufHdr->state);
+		state |= BM_VALID;
+		pg_atomic_write_u32(&bufHdr->state, state);

I'm not a fan of repeating that comment in multiple places. Hm.

Perhaps a set of macros should be offered that do "read, set some flag,
write". Then the comments can be attached to the macro instead of to
each callsite.
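
For example (a sketch; the macro name is made up, and it's only safe where
there's no concurrency, i.e. local buffers):

/* Read state, OR in some flags, write it back; cheaper than an atomic OR. */
#define BufStateOrFlagsUnlocked(bufHdr, flags) \
	do { \
		uint32		state_ = pg_atomic_read_u32(&(bufHdr)->state); \
		pg_atomic_write_u32(&(bufHdr)->state, state_ | (flags)); \
	} while (0)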

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services


#88Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#85)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Mar 28, 2016 at 9:09 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-28 11:48:46 +0530, Dilip Kumar wrote:

On Sun, Mar 27, 2016 at 5:48 PM, Andres Freund <andres@anarazel.de> wrote:

What's sizeof(BufferDesc) after applying these patches? It should better
be <= 64...

It is 72.

Ah yes, miscalculated the required alignment. Hm. So we got to get this
smaller. I see three approaches:

1) Reduce the spinlock size on ppc. That actually might just work by
replacing "unsigned int" by "unsigned char"
2) Replace the lwlock spinlock by a bit in LWLock->state. That'd avoid
embedding the spinlock, and actually might allow to avoid one atomic
op in a number of cases.
3) Shrink the size of BufferDesc by removing buf_id; that'd bring it to
64byte.

I'm a bit hesitant to go for 3), because it'd likely end up adding a bit
of arithmetic to a number of places in bufmgr.c. Robert, what do you
think?

I don't have a clear idea what's going to be better here.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#89Andres Freund
andres@anarazel.de
In reply to: Robert Haas (#88)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-29 13:09:05 -0400, Robert Haas wrote:

On Mon, Mar 28, 2016 at 9:09 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-28 11:48:46 +0530, Dilip Kumar wrote:

On Sun, Mar 27, 2016 at 5:48 PM, Andres Freund <andres@anarazel.de> wrote:

What's sizeof(BufferDesc) after applying these patches? It should better
be <= 64...

It is 72.

Ah yes, miscalculated the required alignment. Hm. So we got to get this
smaller. I see three approaches:

1) Reduce the spinlock size on ppc. That actually might just work by
replacing "unsigned int" by "unsigned char"
2) Replace the lwlock spinlock by a bit in LWLock->state. That'd avoid
embedding the spinlock, and actually might allow to avoid one atomic
op in a number of cases.
3) Shrink the size of BufferDesc by removing buf_id; that'd bring it to
64byte.

I'm a bit hesitant to go for 3), because it'd likely end up adding a bit
of arithmetic to a number of places in bufmgr.c. Robert, what do you
think?

I don't have a clear idea what's going to be better here.

My gut feeling is that we should do both 1) and 2).

Dilip, could you test performance of reducing ppc's spinlock to 1 byte?
Cross-compiling suggest that doing so "just works". I.e. replace the
#if defined(__ppc__) typedef from an int to a char.
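
For clarity, that's roughly this change in s_lock.h (a sketch; the exact #if
guards may differ):

#if defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
typedef unsigned char slock_t;	/* previously: unsigned int */
#endif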

Regards,

Andres


#90Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#86)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi, Andres!

Please find the next revision of the patch attached.

On Mon, Mar 28, 2016 at 4:59 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-28 15:46:43 +0300, Alexander Korotkov wrote:

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..fe6fb9c
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -52,7 +52,6 @@
#include "utils/resowner_private.h"
#include "utils/timestamp.h"

-
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))

spurious

Fixed.

@@ -848,7 +852,7 @@ ReadBuffer_common(SMgrRelation smgr, cha
* it's not been recycled) but come right back here to try smgrextend
* again.
*/
- Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */
+ Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */

bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);

@@ -932,8 +936,13 @@ ReadBuffer_common(SMgrRelation smgr, cha

if (isLocalBuf)
{
-             /* Only need to adjust flags */
-             bufHdr->flags |= BM_VALID;
+             /*
+              * Only need to adjust flags.  Since it's local buffer, there is no
+              * concurrency.  We assume read/write pair to be cheaper than atomic OR.
+              */
+             uint32 state = pg_atomic_read_u32(&bufHdr->state);
+             state |= BM_VALID;
+             pg_atomic_write_u32(&bufHdr->state, state);

I'm not a fan of repeating that comment in multiple places. Hm.

Moved the comments into a single place where the macros for the CAS loop are
defined (see my comments below).

}
else
{
@@ -987,10 +996,11 @@ BufferAlloc(SMgrRelation smgr, char relp
BufferTag oldTag; /* previous identity of selected buffer */
uint32 oldHash; /* hash value for oldTag */
LWLock *oldPartitionLock; /* buffer partition lock for it */
-     BufFlags        oldFlags;
+     uint32          oldFlags;
int                     buf_id;
BufferDesc *buf;
bool            valid;
+     uint32          state;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
@@ -1050,23 +1060,23 @@ BufferAlloc(SMgrRelation smgr, char relp
for (;;)
{
/*
-              * Ensure, while the spinlock's not yet held, that there's a free
+              * Ensure, while the header lock isn't yet held, that there's a free
* refcount entry.
*/
ReservePrivateRefCountEntry();

/*
* Select a victim buffer.  The buffer is returned with its header
-              * spinlock still held!
+              * lock still held!
*/
-             buf = StrategyGetBuffer(strategy);
+             buf = StrategyGetBuffer(strategy, &state);

The new thing really still is a spinlock, not sure if it's worth
changing the comments that way.

@@ -1319,7 +1330,7 @@ BufferAlloc(SMgrRelation smgr, char relp
* InvalidateBuffer -- mark a shared buffer invalid and return it to the
* freelist.
*
- * The buffer header spinlock must be held at entry.  We drop it before
+ * The buffer header lock must be held at entry.  We drop it before
* returning.  (This is sane because the caller must have locked the
* buffer in order to be sure it should be dropped.)
*

Ok, by now I'm pretty sure that I don't want this to be changed
everywhere, just makes reviewing harder.

Fixed.

@@ -1433,6 +1443,7 @@ void
MarkBufferDirty(Buffer buffer)
{
BufferDesc *bufHdr;
+ uint32 state;

if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer ID: %d", buffer);
@@ -1449,14 +1460,14 @@ MarkBufferDirty(Buffer buffer)
/* unfortunately we can't check if the lock is held exclusively */
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));

-     LockBufHdr(bufHdr);
+     state = LockBufHdr(bufHdr);
-     Assert(bufHdr->refcount > 0);
+     Assert(BUF_STATE_GET_REFCOUNT(state) > 0);

/*
* If the buffer was not dirty already, do vacuum accounting.
*/
- if (!(bufHdr->flags & BM_DIRTY))
+ if (!(state & BM_DIRTY))
{
VacuumPageDirty++;
pgBufferUsage.shared_blks_dirtied++;
@@ -1464,9 +1475,9 @@ MarkBufferDirty(Buffer buffer)
VacuumCostBalance += VacuumCostPageDirty;
}

-     bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-
-     UnlockBufHdr(bufHdr);
+     state |= BM_DIRTY | BM_JUST_DIRTIED;
+     state &= ~BM_LOCKED;
+     pg_atomic_write_u32(&bufHdr->state, state);
}

Hm, this is a routine I think we should avoid taking the lock in; it's
often called quite frequently. Also doesn't look very hard.

Done.

static bool
PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
@@ -1547,23 +1563,44 @@ PinBuffer(BufferDesc *buf, BufferAccessS

if (ref == NULL)
{
+             /* loop of CAS operations */
+             uint32                  state;
+             uint32                  oldstate;
+             SpinDelayStatus delayStatus;

ReservePrivateRefCountEntry();
ref = NewPrivateRefCountEntry(b);

-             LockBufHdr(buf);
-             buf->refcount++;
-             if (strategy == NULL)
-             {
-                     if (buf->usage_count < BM_MAX_USAGE_COUNT)
-                             buf->usage_count++;
-             }
-             else
+             state = pg_atomic_read_u32(&buf->state);
+             oldstate = state;
+
+             init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);

Hm. This way we're calling this on every iteration. That doesn't seem
like a good idea. How about making delayStatus a static, and
init_spin_delay a macro which returns a {struct, member, one, two} type
literal?

Done.

+             while (true)
{
-                     if (buf->usage_count == 0)
-                             buf->usage_count = 1;
+                     /* spin-wait till lock is free */
+                     while (state & BM_LOCKED)
+                     {
+                             make_spin_delay(&delayStatus);
+                             state = pg_atomic_read_u32(&buf->state);
+                             oldstate = state;
+                     }

Maybe we should abstract that to pg_atomic_wait_bit_unset_u32()? It
seems quite likely we need this in other places (e.g. lwlock.c itself).

I have some doubts about such a function. First, I can't find a place for
it in lwlock.c, which doesn't run explicit spin delays yet. Is it related
to some changes you're planning for lwlock.c?
Second, I have doubts about its signature. It would be logical for it to
be "uint32 pg_atomic_wait_bit_unset_u32(pg_atomic_uint32 *var, uint32 mask)".
But in that case we'd have to do an extra pg_atomic_read_u32 after an
unsuccessful CAS, and that causes a small regression. It's especially
unpleasant that it's also a regression against master at low client counts.

But there is code duplication which would be very nice to avoid. I ended
up with macros that encapsulate the CAS loop.

BEGIN_BUFSTATE_CAS_LOOP(buf);
state |= BM_LOCKED;
END_BUFSTATE_CAS_LOOP(buf);

For me, it improves readability a lot.

+                     /* increase refcount */
+                     state += BUF_REFCOUNT_ONE;
+
+                     /* increase usagecount unless already max */
+                     if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
+                             state += BUF_USAGECOUNT_ONE;
+
+                     /* try to do CAS, exit on success */

Seems like a somewhat obvious comment?

Comment removed.

@@ -1603,15 +1640,22 @@ PinBuffer_Locked(BufferDesc *buf)
{

Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);

-     buf->refcount++;
-     UnlockBufHdr(buf);
+     /*
+      * Since we assume to held buffer header lock, we can update the buffer
+      * state in a single write operation.
+      */
+     state = pg_atomic_read_u32(&buf->state);
+     state += 1;
+     state &= ~BM_LOCKED;
+     pg_atomic_write_u32(&buf->state, state);

The comment should probably mention that we're releasing the
spinlock. And the += 1 should be a BUF_REFCOUNT_ONE, otherwise it's hard
to understand.

Fixed.

@@ -1646,30 +1690,66 @@ UnpinBuffer(BufferDesc *buf, bool fixOwn
ref->refcount--;
if (ref->refcount == 0)
{

+
+             /* Support LockBufferForCleanup() */
+             if (state & BM_PIN_COUNT_WAITER)
+             {
+                     state = LockBufHdr(buf);
+
+                     if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
+                     {
+                             /* we just released the last pin other than the waiter's */
+                             int			wait_backend_pid = buf->wait_backend_pid;
+                             state &= ~(BM_PIN_COUNT_WAITER | BM_LOCKED);
+                             pg_atomic_write_u32(&buf->state, state);
+                             ProcSendSignal(wait_backend_pid);
+                     }
+                     else
+                             UnlockBufHdr(buf);
+             }

I think it's quite confusing to use UnlockBufHdr and direct bit
expressions in one branch.

Thinking about it I also don't think the pg_atomic_write_u32 variant is
correct without adding a write barrier; the other side might not see the
values yet.

I think we can just redefine UnlockBufHdr() to be a
pg_atomic_write_u32() and pg_write_memory_barrier()?

Done.

* BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
- * need not bother with the buffer header spinlock.  Even if someone else
+ * need not bother with the buffer header lock.  Even if someone else
* changes the buffer header flags while we're doing this, we assume that
* changing an aligned 2-byte BufFlags value is atomic, so we'll read the
* old value or the new value, but not random garbage.
*/

The rest of the comment is outdated, BufFlags isn't a 2 byte value
anymore.

Fixed.

@@ -3078,7 +3168,7 @@ FlushRelationBuffers(Relation rel)
localpage,
false);

- bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+ pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));

Hm, in other places you replaced atomics on local buffers with plain
writes.

Fixed.

lsn = XLogSaveBufferForHint(buffer, buffer_std);
}

-             LockBufHdr(bufHdr);
-             Assert(bufHdr->refcount > 0);
-             if (!(bufHdr->flags & BM_DIRTY))
+             state = LockBufHdr(bufHdr);
+
+             Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+
+             if (!(state & BM_DIRTY))
{
dirtied = true;		/* Means "will be dirtied by this action" */

It's again worthwhile to try to make this work without taking the lock.

The comment there explains:

/*
* Set the page LSN if we wrote a backup block. We aren't supposed
* to set this when only holding a share lock but as long as we
* serialise it somehow we're OK. We choose to set LSN while
* holding the buffer header lock, which causes any reader of an
* LSN who holds only a share lock to also obtain a buffer header
* lock before using PageGetLSN(), which is enforced in
* BufferGetLSNAtomic().

Thus, the buffer header lock is used for write serialization. I don't think
it would be correct to remove the lock here.

-     buf->flags |= BM_IO_IN_PROGRESS;
-
-     UnlockBufHdr(buf);
+     state |= BM_IO_IN_PROGRESS;
+     state &= ~BM_LOCKED;
+     pg_atomic_write_u32(&buf->state, state);

How about making UnlockBufHdr() take a new state parameter, and
internally unset BM_LOCKED?

Done.

/*
+ * Lock buffer header - set BM_LOCKED in buffer state.
+ */
+uint32
+LockBufHdr(volatile BufferDesc *desc)
+{
+     SpinDelayStatus delayStatus;
+     uint32                  state;
+
+     init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__);
+
+     state = pg_atomic_read_u32(&desc->state);
+
+     for (;;)
+     {
+             /* wait till lock is free */
+             while (state & BM_LOCKED)
+             {
+                     make_spin_delay(&delayStatus);
+                     state = pg_atomic_read_u32(&desc->state);
+                     /* Add exponential backoff? Should seldomly be contended tho. */

Outdated comment.

Fixed.

+/*
+ * Unlock buffer header - unset BM_LOCKED in buffer state.
+ */
+void
+UnlockBufHdr(volatile BufferDesc *desc)
+{
+     Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+
+     pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
+}

As suggested above, there's likely no need to use an actual atomic op
here.

Fixed. Actually, this function is removed now.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-7.patchapplication/octet-stream; name=pinunpin-cas-7.patchDownload
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..b583049
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,180 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr);
  		}
  
  		/*
--- 161,181 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr, state);
  		}
  
  		/*
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index bfa37f1..a5cffc7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 135,146 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 135,143 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..458b9a8
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
*************** ForgetPrivateRefCountEntry(PrivateRefCou
*** 427,432 ****
--- 427,464 ----
  	(GetPrivateRefCount(bufnum) > 0) \
  )
  
+ /*
+  * The following two macros aim to simplify buffer state modification
+  * in a CAS loop.  It's assumed that the variable "uint32 state" is defined
+  * outside of this loop.  They should be used as follows:
+  *
+  * BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
+  * modifications of state variable;
+  * END_BUFSTATE_CAS_LOOP(bufHdr);
+  *
+  * These macros shouldn't be used for local buffers.  Since there is no
+  * concurrency there, local buffer state can be changed directly by atomic
+  * read/write operations.
+  */
+ #define BEGIN_BUFSTATE_CAS_LOOP(bufHdr) \
+ 	do { \
+ 		SpinDelayStatus	delayStatus = init_spin_delay((Pointer)(bufHdr)); \
+ 		uint32			oldstate; \
+ 		oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 		for (;;) { \
+ 			while (oldstate & BM_LOCKED) \
+ 			{ \
+ 				make_spin_delay(&delayStatus); \
+ 				oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 			} \
+ 			state = oldstate
+ 
+ #define END_BUFSTATE_CAS_LOOP(bufHdr) \
+ 			if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state)) \
+ 				break; \
+ 		} \
+ 		finish_spin_delay(&delayStatus); \
+ 	} while (0)
  
  static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
  				  ForkNumber forkNum, BlockNumber blockNum,
*************** static int	SyncOneBuffer(int buf_id, boo
*** 440,446 ****
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  int set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 472,478 ----
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  uint32 set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
*************** static int	buffertag_comparator(const vo
*** 457,463 ****
  static int	ckpt_buforder_comparator(const void *pa, const void *pb);
  static int	ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
  
- 
  /*
   * ComputeIoConcurrency -- get the number of pages to prefetch for a given
   *		number of spindles.
--- 489,494 ----
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 816,823 ****
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 847,856 ----
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 			Assert(state & BM_VALID);
! 			state &= ~BM_VALID;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  			 */
  			do
  			{
! 				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
--- 861,870 ----
  			 */
  			do
  			{
! 				uint32 state = LockBufHdr(bufHdr);
! 				Assert(state & BM_VALID);
! 				state &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr, state);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 848,854 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 881,887 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 933,939 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 966,974 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 		state |= BM_VALID;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 987,996 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
--- 1022,1032 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
+ 	uint32		state;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1059,1070 ****
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
--- 1095,1106 ----
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1106,1116 ****
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
  
  					/* Read the LSN while holding buffer header lock */
! 					LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
--- 1142,1153 ----
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
+ 					uint32		state;
  
  					/* Read the LSN while holding buffer header lock */
! 					state = LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf, state);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1254,1260 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1291,1297 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1262,1272 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
--- 1299,1309 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf, state);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1284,1297 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
! 	UnlockBufHdr(buf);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
--- 1321,1335 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 			   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
! 			   BUF_USAGECOUNT_MASK);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
  	else
! 		state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  
! 	UnlockBufHdr(buf, state);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
*************** InvalidateBuffer(BufferDesc *buf)
*** 1338,1349 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
--- 1376,1390 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
*************** retry:
*** 1362,1373 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
--- 1403,1414 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
*************** retry:
*** 1381,1389 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
--- 1422,1430 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
*************** retry:
*** 1396,1407 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
! 
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
--- 1437,1446 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
*************** retry:
*** 1432,1438 ****
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
--- 1471,1480 ----
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
! 	uint32		state;
! 	bool		wasDirty;
! 
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1443,1472 ****
  		return;
  	}
  
- 	bufHdr = GetBufferDescriptor(buffer - 1);
- 
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
- 
- 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- 
- 	UnlockBufHdr(bufHdr);
  }
  
  /*
--- 1485,1514 ----
  		return;
  	}
  
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 	wasDirty = (state & BM_DIRTY) ? true : false;
! 	state |= BM_DIRTY | BM_JUST_DIRTIED;
! 	if (state == oldstate)
! 		break;
! 
! 	END_BUFSTATE_CAS_LOOP(bufHdr);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!wasDirty)
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  }
  
  /*
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1531,1536 ****
--- 1573,1583 ----
   *
   * This should be applied only to shared buffers, never local ones.
   *
+  * Since buffers are pinned/unpinned very frequently, this function tries
+  * to pin the buffer as cheaply as possible.  This is why we don't take the
+  * buffer header lock here, but instead update the state variable in a loop
+  * of CAS operations.  Hopefully, it will be just a single CAS.
+  *
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1547,1569 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
! 		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
! 		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1594,1616 ----
  
  	if (ref == NULL)
  	{
+ 		/* loop of CAS operations */
+ 		uint32			state;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 
! 		/* increase refcount */
! 		state += BUF_REFCOUNT_ONE;
! 
! 		/* increase usagecount unless already max */
! 		if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 			state += BUF_USAGECOUNT_ONE;
! 
! 		END_BUFSTATE_CAS_LOOP(buf);
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1603,1608 ****
--- 1650,1656 ----
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
+ 	uint32		state;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1610,1617 ****
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
! 	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
  
--- 1658,1671 ----
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	/*
! 	 * Since we assume the buffer spinlock is held, we can update the buffer
! 	 * state in a single write operation.
! 	 */
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	state += BUF_REFCOUNT_ONE;
! 	UnlockBufHdr(buf, state);
  
  	b = BufferDescriptorGetBuffer(buf);
  
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1646,1675 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
  
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1700,1740 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32			state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Since the buffer spinlock holder can update the state with a plain
! 		 * write, it's not safe to use an atomic decrement here.  We do a loop
! 		 * of CAS operations like PinBuffer does.
! 		 */
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 		state -= BUF_REFCOUNT_ONE;
! 		END_BUFSTATE_CAS_LOOP(buf);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				state &= ~BM_PIN_COUNT_WAITER;
+ 				UnlockBufHdr(buf, state);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf, state);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1687,1692 ****
--- 1752,1758 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_spaces;
*************** BufferSync(int flags)
*** 1736,1748 ****
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
--- 1802,1814 ----
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			state |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
*************** BufferSync(int flags)
*** 1752,1758 ****
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr);
  	}
  
  	if (num_to_scan == 0)
--- 1818,1824 ----
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr, state);
  	}
  
  	if (num_to_scan == 0)
*************** BufferSync(int flags)
*** 1888,1894 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
--- 1954,1960 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2258,2263 ****
--- 2324,2330 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  	BufferTag	tag;
  
  	ReservePrivateRefCountEntry();
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2271,2291 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
--- 2338,2361 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2439,2444 ****
--- 2509,2515 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2456,2467 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2527,2539 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2573,2578 ****
--- 2645,2651 ----
  				io_time;
  	Block		bufBlock;
  	char	   *bufToWrite;
+ 	uint32		state;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2598,2604 ****
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
--- 2671,2677 ----
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	state = LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2607,2614 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
--- 2680,2687 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	state &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2627,2633 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2700,2706 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (state & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2716,2727 ****
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header flags while we're doing this, we assume that
! 	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
! 	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2789,2800 ----
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header state while we're doing this, changing of
! 	 * state is atomic, so we'll read the old value or the new value, but not
! 	 * random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2736,2741 ****
--- 2809,2815 ----
  	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
  	char	   *page = BufferGetPage(buffer);
  	XLogRecPtr	lsn;
+ 	uint32		state;
  
  	/*
  	 * If we don't need locking for correctness, fastpath out.
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2747,2755 ****
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr);
  
  	return lsn;
  }
--- 2821,2829 ----
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	state = LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr, state);
  
  	return lsn;
  }
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2797,2802 ****
--- 2871,2877 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * We can make this a tad faster by prechecking the buffer tag before
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2817,2829 ****
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 2892,2904 ----
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2887,2892 ****
--- 2962,2968 ----
  	{
  		RelFileNode *rnode = NULL;
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2917,2927 ****
  		if (rnode == NULL)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  
  	pfree(nodes);
--- 2993,3003 ----
  		if (rnode == NULL)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  
  	pfree(nodes);
*************** DropDatabaseBuffers(Oid dbid)
*** 2951,2956 ****
--- 3027,3033 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropDatabaseBuffers(Oid dbid)
*** 2959,2969 ****
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3036,3046 ----
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushRelationBuffers(Relation rel)
*** 3055,3063 ****
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 3132,3143 ----
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
+ 			uint32	state;
+ 
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				((state = pg_atomic_read_u32(&bufHdr->state)) &
! 					(BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 3078,3084 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 3158,3165 ----
  						  localpage,
  						  false);
  
! 				state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
! 				pg_atomic_write_u32(&bufHdr->state, state);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 3093,3098 ****
--- 3174,3181 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32 state;
+ 
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushRelationBuffers(Relation rel)
*** 3104,3112 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3187,3195 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(state & (BM_VALID | BM_DIRTY))	== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushRelationBuffers(Relation rel)
*** 3115,3121 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3198,3204 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushDatabaseBuffers(Oid dbid)
*** 3145,3150 ****
--- 3228,3234 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32	state;
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushDatabaseBuffers(Oid dbid)
*** 3156,3164 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3240,3248 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 3167,3173 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3251,3257 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3297,3308 ****
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3381,3393 ----
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3313,3319 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3398,3404 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3352,3360 ****
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3437,3447 ----
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3374,3381 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
--- 3461,3469 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		state |= BM_DIRTY | BM_JUST_DIRTIED;
! 		UnlockBufHdr(bufHdr, state);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
*************** UnlockBuffers(void)
*** 3406,3422 ****
  
  	if (buf)
  	{
! 		LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf);
  
  		PinCountWaitBuf = NULL;
  	}
--- 3494,3512 ----
  
  	if (buf)
  	{
! 		uint32	state;
! 
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf, state);
  
  		PinCountWaitBuf = NULL;
  	}
*************** LockBufferForCleanup(Buffer buffer)
*** 3509,3535 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
- 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
! 		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
--- 3599,3628 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr, state);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr, state);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
  		PinCountWaitBuf = bufHdr;
! 		state |= BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
*************** LockBufferForCleanup(Buffer buffer)
*** 3558,3568 ****
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
--- 3651,3661 ----
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		state = LockBufHdr(bufHdr);
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
*************** bool
*** 3603,3624 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3696,3721 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3626,3642 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
--- 3723,3741 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr, state);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr, state);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
*************** WaitIO(BufferDesc *buf)
*** 3666,3682 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
! 		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3765,3781 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
! 		UnlockBufHdr(buf, state);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3704,3709 ****
--- 3803,3810 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3714,3722 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3815,3823 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3725,3748 ****
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
! 
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
--- 3826,3848 ----
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	state |= BM_IO_IN_PROGRESS;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3768,3786 ****
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = NULL;
  
--- 3868,3889 ----
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
  
! 	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(state & BM_JUST_DIRTIED))
! 		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 
! 	state |= set_flag_bits;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = NULL;
  
*************** AbortBufferIO(void)
*** 3803,3808 ****
--- 3906,3912 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3811,3834 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
! 			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
! 			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
--- 3915,3936 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
! 			UnlockBufHdr(buf, state);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
! 			UnlockBufHdr(buf, state);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
*************** rnode_comparator(const void *p1, const v
*** 3912,3917 ****
--- 4014,4034 ----
  }
  
  /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(BufferDesc *desc)
+ {
+ 	uint32			state;
+ 
+ 	BEGIN_BUFSTATE_CAS_LOOP(desc);
+ 	state |= BM_LOCKED;
+ 	END_BUFSTATE_CAS_LOOP(desc);
+ 
+ 	return state;
+ }
+ 
+ /*
   * BufferTag comparator.
   */
  static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..5ee8ff3
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,292 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
! 			UnlockBufHdr(buf);
  
  		}
  	}
--- 282,297 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
! 			UnlockBufHdr(buf, state);
  
  		}
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 327,336 ****
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			UnlockBufHdr(buf);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		UnlockBufHdr(buf);
  	}
  }
  
--- 336,347 ----
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			state &= ~BM_LOCKED;
! 			pg_atomic_write_u32(&buf->state, state);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  	}
  }
  
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 596,606 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,628 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
--- 628,642 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index cc0bf5e..97ad67d
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
  	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
! 
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,139 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! make_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
  	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 144,167 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus = init_spin_delay((Pointer)lock);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		make_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index d04363b..e6099b5
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 21,49 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 21,69 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * The buffer state is a single 32-bit variable combining the following data:
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such a layout allows some operations to be performed more efficiently.
+  * The definitions of the buffer state parts are below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 113,130 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 133,161 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
!  * the tag, state or wait_backend_pid fields.  In general, the buffer header
!  * lock is a spinlock that is combined with the flags, refcount and usagecount
!  * in a single atomic variable.  This layout allows us to do some operations
!  * in a single CAS without actually acquiring and releasing the spinlock; for
!  * instance, increasing or decreasing the refcount.  The buf_id field never
!  * changes after initialization, so it does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock, not the buffer header lock.  The
!  * LWLock can take care of itself.  The buffer header lock is *not* used to
!  * control access to the data in the buffer!
!  *
!  * It's assumed that nobody changes the state field while the buffer header
!  * lock is held.  Thanks to that, the lock holder can fold a complex update of
!  * the state variable into the single write that releases the lock (clearing
!  * the BM_LOCKED flag).  On the other hand, updating the state without holding
!  * the buffer header lock is restricted to a CAS that ensures the BM_LOCKED
!  * flag is not set; atomic increment/decrement, OR/AND etc. are not allowed.
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 142,153 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
--- 173,183 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 202,212 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /*
--- 232,246 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers!
   */
! extern uint32 LockBufHdr(BufferDesc *desc);
! #define UnlockBufHdr(desc, s)	\
! 	do {	\
! 		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
! 		pg_write_barrier(); \
! 	} while (0)
  
  
  /*
*************** extern void IssuePendingWritebacks(Write
*** 267,273 ****
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 301,308 ----
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 8b240cd..2f96740
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1013 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay which could be useful in other places where
+  * spinlock-like procedures take place.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ #define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+ 
+ void make_spin_delay(SpinDelayStatus *status);
+ void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
#91Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#89)
Re: Move PinBuffer and UnpinBuffer to atomics

Andres Freund <andres@anarazel.de> writes:

Dilip, could you test performance of reducing ppc's spinlock to 1 byte?
Cross-compiling suggests that doing so "just works". I.e. replace the
#if defined(__ppc__) typedef from an int to a char.

AFAICS, lwarx/stwcx are specifically *word* wide.

regards, tom lane

#92Andres Freund
andres@anarazel.de
In reply to: Tom Lane (#91)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-29 13:24:40 -0400, Tom Lane wrote:

Andres Freund <andres@anarazel.de> writes:

Dilip, could you test performance of reducing ppc's spinlock to 1 byte?
Cross-compiling suggests that doing so "just works". I.e. replace the
#if defined(__ppc__) typedef from an int to a char.

AFAICS, lwarx/stwcx are specifically *word* wide.

Ah, x86 gcc is just too convenient, with it automatically adjusting
instruction types :(

There's actually lbarx/stbcx - but it's not present in all ISAs. So I
guess it's clear where to go.
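
For anyone who wants to try it, the experiment under discussion is
roughly this change in s_lock.h (a sketch; the exact #if guards there
may differ):

#if defined(__ppc__) || defined(__powerpc__)
/*
 * Sketch of the one-byte-spinlock experiment proposed upthread.
 * Whether word-wide lwarx/stwcx. can safely back a 1-byte slock_t is
 * exactly the question being debated here.
 */
typedef unsigned char slock_t;		/* was: typedef unsigned int */
#endif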

Andres Freund

#93Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#90)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-29 20:22:00 +0300, Alexander Korotkov wrote:

+             while (true)
{
-                     if (buf->usage_count == 0)
-                             buf->usage_count = 1;
+                     /* spin-wait till lock is free */
+                     while (state & BM_LOCKED)
+                     {
+                             make_spin_delay(&delayStatus);
+                             state = pg_atomic_read_u32(&buf->state);
+                             oldstate = state;
+                     }

Maybe we should abstract that to pg_atomic_wait_bit_unset_u32()? It
seems quite likely we need this in other places (e.g. lwlock.c itself).

I have some doubts about such a function. For one, I can't find a place for
it in lwlock.c: it doesn't run explicit spin delays yet. Is it related to
some changes you're planning for lwlock.c?

Yes, see /messages/by-id/20160328130904.4mhugvkf4f3wg4qb@awork2.anarazel.de
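
For concreteness, such a helper could look roughly like this, built on
the spin-delay API this patch introduces (the name is just the proposal
above; nothing like this is committed):

static inline uint32
pg_atomic_wait_bit_unset_u32(pg_atomic_uint32 *var, uint32 bit)
{
	SpinDelayStatus delayStatus = init_spin_delay((Pointer) var);
	uint32		state = pg_atomic_read_u32(var);

	/* spin, with backoff, until the bit is observed to be clear */
	while (state & bit)
	{
		make_spin_delay(&delayStatus);
		state = pg_atomic_read_u32(var);
	}
	finish_spin_delay(&delayStatus);

	return state;		/* a recently read state with the bit unset */
}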

lsn = XLogSaveBufferForHint(buffer, buffer_std);
}

-             LockBufHdr(bufHdr);
-             Assert(bufHdr->refcount > 0);
-             if (!(bufHdr->flags & BM_DIRTY))
+             state = LockBufHdr(bufHdr);
+
+             Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+
+             if (!(state & BM_DIRTY))
              {
                      dirtied = true;         /* Means "will be dirtied by this action" */

It's again worthwhile to try to make this work without taking the lock.

The comment there claims:

/*
* Set the page LSN if we wrote a backup block. We aren't supposed
* to set this when only holding a share lock but as long as we
* serialise it somehow we're OK. We choose to set LSN while
* holding the buffer header lock, which causes any reader of an
* LSN who holds only a share lock to also obtain a buffer header
* lock before using PageGetLSN(), which is enforced in
* BufferGetLSNAtomic().

Thus, the buffer header lock is used for write serialization. I don't think it
would be correct to remove the lock here.

Gah, I forgot about this ugliness. Man, the checksumming commit sure
made a number of things really ugly.
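
For reference, the reader side of that protocol, condensed from the
patch's BufferGetLSNAtomic (the wrapper name here is invented for
illustration):

    static XLogRecPtr
    read_lsn_under_header_lock(BufferDesc *bufHdr, Page page)
    {
        /* Header lock serialises us against a share-locker setting the LSN. */
        uint32      state = LockBufHdr(bufHdr);
        XLogRecPtr  lsn = PageGetLSN(page);

        UnlockBufHdr(bufHdr, state);
        return lsn;
    }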

Greetings,

Andres Freund


#94Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#92)
Re: Move PinBuffer and UnpinBuffer to atomics

Andres Freund <andres@anarazel.de> writes:

On 2016-03-29 13:24:40 -0400, Tom Lane wrote:

AFAICS, lwarx/stwcx are specifically *word* wide.

There's actually lbarx/stbcx - but it's not present in all ISAs. So I
guess it's clear where to go.

Hm. We could certainly add a configure test to see if the local assembler
knows these instructions --- but it's not clear that we should depend on
compile-time environment to match run-time.

Googling suggests that these instructions came in with PPC ISA 2.06
which seems to date to 2010. So there's undoubtedly still production
hardware without them.
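
Such a probe would only need a test program like the following to
assemble (a sketch, not an existing configure check; and, as noted,
passing it at build time proves nothing about the hardware the binaries
eventually run on):

    /* Compile-time probe sketch: does the toolchain accept lbarx/stbcx. ? */
    int
    main(void)
    {
        volatile unsigned char lock = 0;
        unsigned int res;

        __asm__ __volatile__(
            "   lbarx   %0,0,%1 \n"
            "   stbcx.  %0,0,%1 \n"
            : "=&r" (res)
            : "r" (&lock)
            : "cc", "memory");
        return 0;
    }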

In the department of not-production hardware, I checked this on prairiedog
and got
/var/tmp/ccbQy9uG.s:1722:Invalid mnemonic 'lbarx'
/var/tmp/ccbQy9uG.s:1726:Invalid mnemonic 'stbcx.'

regards, tom lane


#95Andres Freund
andres@anarazel.de
In reply to: Tom Lane (#94)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-29 14:09:42 -0400, Tom Lane wrote:

Andres Freund <andres@anarazel.de> writes:

There's actually lbarx/stbcx - but it's not present in all ISAs. So I
guess it's clear where to go.

Hm. We could certainly add a configure test to see if the local assembler
knows these instructions --- but it's not clear that we should depend on
compile-time environment to match run-time.

I think it'd be easier to continue using lwarx/stwcx, but be careful
about only testing/setting the lowest byte, if we want to go there. But
that then likely would require hints about alignment to the compiler...

I've no experience writing PPC assembly, but it doesn't look too hard.

But I think it's easier to just remove the spinlock from struct lwlock
then - and it also improves the situation for other architectures with
wider spinlocks. I think that's beneficial for some future things
anyway.

Googling suggests that these instructions came in with PPC ISA 2.06
which seems to date to 2010. So there's undoubtedly still production
hardware without them.

In the department of not-production hardware, I checked this on prairiedog
and got
/var/tmp/ccbQy9uG.s:1722:Invalid mnemonic 'lbarx'
/var/tmp/ccbQy9uG.s:1726:Invalid mnemonic 'stbcx.'

Heh. Thanks for testing.

Andres


#96Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#89)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Mar 29, 2016 at 10:43 PM, Andres Freund <andres@anarazel.de> wrote:

My gut feeling is that we should do both 1) and 2).

Dilip, could you test performance of reducing ppc's spinlock to 1 byte?
Cross-compiling suggests that doing so "just works". I.e. replace the
#if defined(__ppc__) typedef from an int to a char.

I set that, but after that it hangs; even initdb hangs.

int
s_lock(volatile slock_t *lock, const char *file, int line)
{
	SpinDelayStatus delayStatus;

	init_spin_delay(&delayStatus, (Pointer)lock, file, line);

	while (TAS_SPIN(lock))
	{
		make_spin_delay(&delayStatus);		/* <-- stuck here, never leaves the loop */
	}

I did not try to find the reason, but just built in debug mode and found that
it never comes out of this loop.

I did a clean build multiple times, but the problem persists.

Does it depend on some other variable defined under PPC somewhere else? I
don't know.

/* PowerPC */
#if defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)

#define HAS_TEST_AND_SET

typedef unsigned int slock_t;    --> changed like this

#define TAS(lock) tas(lock)

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#97Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#96)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-30 07:13:16 +0530, Dilip Kumar wrote:

On Tue, Mar 29, 2016 at 10:43 PM, Andres Freund <andres@anarazel.de> wrote:

My gut feeling is that we should do both 1) and 2).

Dilip, could you test performance of reducing ppc's spinlock to 1 byte?
Cross-compiling suggests that doing so "just works". I.e. replace the
#if defined(__ppc__) typedef from an int to a char.

I set that, but after that it hangs; even initdb hangs.

Yea, as Tom pointed out that's not going to work. I'll try to write a
patch for approach 1).

Andres


#98Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#97)
Re: Move PinBuffer and UnpinBuffer to atomics

On Wed, Mar 30, 2016 at 10:16 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-30 07:13:16 +0530, Dilip Kumar wrote:

On Tue, Mar 29, 2016 at 10:43 PM, Andres Freund <andres@anarazel.de> wrote:

My gut feeling is that we should do both 1) and 2).

Dilip, could you test performance of reducing ppc's spinlock to 1 byte?
Cross-compiling suggests that doing so "just works". I.e. replace the
#if defined(__ppc__) typedef from an int to a char.

I set that, but after that it hangs; even initdb hangs.

Yea, as Tom pointed out that's not going to work. I'll try to write a
patch for approach 1).

Great! Do you need any improvements for pinunpin-cas-7.patch from me?

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#99Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#97)
Re: Move PinBuffer and UnpinBuffer to atomics

On Wed, Mar 30, 2016 at 3:16 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-30 07:13:16 +0530, Dilip Kumar wrote:

On Tue, Mar 29, 2016 at 10:43 PM, Andres Freund <andres@anarazel.de> wrote:

My gut feeling is that we should do both 1) and 2).

Dilip, could you test performance of reducing ppc's spinlock to 1 byte?
Cross-compiling suggests that doing so "just works". I.e. replace the
#if defined(__ppc__) typedef from an int to a char.

I set that, but after that it hangs; even initdb hangs.

Yea, as Tom pointed out that's not going to work. I'll try to write a
patch for approach 1).

Does this mean that any platform that wants to perform well will now
need a sub-4-byte spinlock implementation? That has a somewhat
uncomfortable sound to it.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


#100Andres Freund
andres@anarazel.de
In reply to: Robert Haas (#99)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-31 06:54:02 -0400, Robert Haas wrote:

On Wed, Mar 30, 2016 at 3:16 AM, Andres Freund <andres@anarazel.de> wrote:

Yea, as Tom pointed out that's not going to work. I'll try to write a
patch for approach 1).

Does this mean that any platform that wants to perform well will now
need a sub-4-byte spinlock implementation? That has a somewhat
uncomfortable sound to it.

Oh. I confused my approaches. I was thinking about going for 2):

2) Replace the lwlock spinlock by a bit in LWLock->state. That'd avoid
embedding the spinlock, and actually might allow to avoid one atomic
op in a number of cases.

precisely because of that concern.


#101Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#100)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-31 12:58:55 +0200, Andres Freund wrote:

On 2016-03-31 06:54:02 -0400, Robert Haas wrote:

On Wed, Mar 30, 2016 at 3:16 AM, Andres Freund <andres@anarazel.de> wrote:

Yea, as Tom pointed out that's not going to work. I'll try to write a
patch for approach 1).

Does this mean that any platform that wants to perform well will now
need a sub-4-byte spinlock implementation? That has a somewhat
uncomfortable sound to it.

Oh. I confused my approaches. I was thinking about going for 2):

2) Replace the lwlock spinlock by a bit in LWLock->state. That'd avoid
embedding the spinlock, and actually might allow to avoid one atomic
op in a number of cases.

precisely because of that concern.

Here's a WIP patch to evaluate. Dilip/Ashutosh, could you perhaps run
some benchmarks, to see whether this addresses the performance issues?

I guess it'd both be interesting to compare master with master + patch,
and this thread's latest patch with the patch additionally applied.

Thanks!

Andres

Attachments:

0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect-.patchtext/x-patch; charset=us-asciiDownload
From 623581574409bdf5cbfbba005bb2e961cb689573 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 31 Mar 2016 14:14:20 +0200
Subject: [PATCH 1/4] WIP: Avoid the use of a separate spinlock to protect
 LWLock's wait queue.

Previously we used a spinlock, in addition to the atomically manipulated
->state field, to protect the wait queue. But it's pretty simple to
instead perform the locking using a flag in state.

This tries to address a performance regression on PPC due to
6150a1b0. Our PPC spinlocks are 4 bytes each, which makes BufferDesc
exceed 64 bytes, causing cacheline-sharing issues.

Discussion: CAA4eK1+ZeB8PMwwktf+3bRS0Pt4Ux6Rs6Aom0uip8c6shJWmyg@mail.gmail.com
    20160327121858.zrmrjegmji2ymnvr@alap3.anarazel.de
---
 src/backend/storage/lmgr/lwlock.c | 178 ++++++++++++++++++++------------------
 src/include/storage/lwlock.h      |   1 -
 2 files changed, 94 insertions(+), 85 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 53ae7d5..ec6baf6 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -97,6 +97,7 @@ extern slock_t *ShmemLock;
 
 #define LW_FLAG_HAS_WAITERS			((uint32) 1 << 30)
 #define LW_FLAG_RELEASE_OK			((uint32) 1 << 29)
+#define LW_FLAG_LOCKED				((uint32) 1 << 28)
 
 #define LW_VAL_EXCLUSIVE			((uint32) 1 << 24)
 #define LW_VAL_SHARED				1
@@ -711,7 +712,6 @@ RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)
 void
 LWLockInitialize(LWLock *lock, int tranche_id)
 {
-	SpinLockInit(&lock->mutex);
 	pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK);
 #ifdef LOCK_DEBUG
 	pg_atomic_init_u32(&lock->nwaiters, 0);
@@ -842,6 +842,56 @@ LWLockAttemptLock(LWLock *lock, LWLockMode mode)
 	pg_unreachable();
 }
 
+static void
+LWLockWaitListLock(LWLock *lock)
+{
+	uint32 old_state;
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+	uint32 delays = 0;
+
+	lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+	old_state = pg_atomic_read_u32(&lock->state);
+	while (true)
+	{
+		if (old_state & LW_FLAG_LOCKED)
+		{
+			/* FIXME: add exponential backoff */
+			pg_spin_delay();
+			old_state = pg_atomic_read_u32(&lock->state);
+#ifdef LWLOCK_STATS
+			delays++;
+#endif
+		}
+		else
+		{
+			old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED);
+			if (!(old_state & LW_FLAG_LOCKED))
+			{
+				/* got lock */
+				break;
+			}
+		}
+	}
+
+#ifdef LWLOCK_STATS
+	lwstats->spin_delay_count += delays;
+#endif
+
+}
+
+static void
+LWLockWaitListUnlock(LWLock *lock)
+{
+	uint32 old_state PG_USED_FOR_ASSERTS_ONLY;
+
+	old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED);
+
+	Assert(old_state & LW_FLAG_LOCKED);
+}
+
 /*
  * Wakeup all the lockers that currently have a chance to acquire the lock.
  */
@@ -852,22 +902,13 @@ LWLockWakeup(LWLock *lock)
 	bool		wokeup_somebody = false;
 	dlist_head	wakeup;
 	dlist_mutable_iter iter;
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-
-	lwstats = get_lwlock_stats_entry(lock);
-#endif
 
 	dlist_init(&wakeup);
 
 	new_release_ok = true;
 
 	/* Acquire mutex.  Time spent holding mutex should be short! */
-#ifdef LWLOCK_STATS
-	lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-	SpinLockAcquire(&lock->mutex);
-#endif
+	LWLockWaitListLock(lock);
 
 	dlist_foreach_modify(iter, &lock->waiters)
 	{
@@ -904,19 +945,34 @@ LWLockWakeup(LWLock *lock)
 
 	Assert(dlist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS);
 
-	/* Unset both flags at once if required */
-	if (!new_release_ok && dlist_is_empty(&wakeup))
-		pg_atomic_fetch_and_u32(&lock->state,
-								~(LW_FLAG_RELEASE_OK | LW_FLAG_HAS_WAITERS));
-	else if (!new_release_ok)
-		pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_RELEASE_OK);
-	else if (dlist_is_empty(&wakeup))
-		pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
-	else if (new_release_ok)
-		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+	/* unset required flags, and release lock, in one fell swoop */
+	{
+		uint32 old_state;
+		uint32 desired_state;
 
-	/* We are done updating the shared state of the lock queue. */
-	SpinLockRelease(&lock->mutex);
+		old_state = pg_atomic_read_u32(&lock->state);
+		while (true)
+		{
+			desired_state = old_state;
+
+			/* Unset both flags at once if required */
+			if (!new_release_ok && dlist_is_empty(&wakeup))
+				desired_state &= ~(LW_FLAG_RELEASE_OK | LW_FLAG_HAS_WAITERS);
+			else if (!new_release_ok)
+				desired_state &= ~LW_FLAG_RELEASE_OK;
+			else if (dlist_is_empty(&wakeup))
+				desired_state &= ~LW_FLAG_HAS_WAITERS;
+			else if (new_release_ok)
+				desired_state |= LW_FLAG_RELEASE_OK;
+
+			/* release lock */
+			desired_state &= ~LW_FLAG_LOCKED;
+
+			if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
+											   desired_state))
+				break;
+		}
+	}
 
 	/* Awaken any waiters I removed from the queue. */
 	dlist_foreach_modify(iter, &wakeup)
@@ -933,7 +989,7 @@ LWLockWakeup(LWLock *lock)
 		 * that happens before the list unlink happens, the list would end up
 		 * being corrupted.
 		 *
-		 * The barrier pairs with the SpinLockAcquire() when enqueing for
+		 * The barrier pairs with the LWLockWaitListLock() when enqueing for
 		 * another lock.
 		 */
 		pg_write_barrier();
@@ -950,12 +1006,6 @@ LWLockWakeup(LWLock *lock)
 static void
 LWLockQueueSelf(LWLock *lock, LWLockMode mode)
 {
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-
-	lwstats = get_lwlock_stats_entry(lock);
-#endif
-
 	/*
 	 * If we don't have a PGPROC structure, there's no way to wait. This
 	 * should never occur, since MyProc should only be null during shared
@@ -967,11 +1017,7 @@ LWLockQueueSelf(LWLock *lock, LWLockMode mode)
 	if (MyProc->lwWaiting)
 		elog(PANIC, "queueing for lock while waiting on another one");
 
-#ifdef LWLOCK_STATS
-	lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-	SpinLockAcquire(&lock->mutex);
-#endif
+	LWLockWaitListLock(lock);
 
 	/* setting the flag is protected by the spinlock */
 	pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
@@ -986,7 +1032,7 @@ LWLockQueueSelf(LWLock *lock, LWLockMode mode)
 		dlist_push_tail(&lock->waiters, &MyProc->lwWaitLink);
 
 	/* Can release the mutex now */
-	SpinLockRelease(&lock->mutex);
+	LWLockWaitListUnlock(lock);
 
 #ifdef LOCK_DEBUG
 	pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
@@ -1007,19 +1053,7 @@ LWLockDequeueSelf(LWLock *lock)
 	bool		found = false;
 	dlist_mutable_iter iter;
 
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-
-	lwstats = get_lwlock_stats_entry(lock);
-
-	lwstats->dequeue_self_count++;
-#endif
-
-#ifdef LWLOCK_STATS
-	lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-	SpinLockAcquire(&lock->mutex);
-#endif
+	LWLockWaitListLock(lock);
 
 	/*
 	 * Can't just remove ourselves from the list, but we need to iterate over
@@ -1043,7 +1077,8 @@ LWLockDequeueSelf(LWLock *lock)
 		pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
 	}
 
-	SpinLockRelease(&lock->mutex);
+	/* XXX: combine with fetch_and above? */
+	LWLockWaitListUnlock(lock);
 
 	/* clear waiting state again, nice for debugging */
 	if (found)
@@ -1460,11 +1495,6 @@ LWLockConflictsWithVar(LWLock *lock,
 {
 	bool		mustwait;
 	uint64		value;
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-
-	lwstats = get_lwlock_stats_entry(lock);
-#endif
 
 	/*
 	 * Test first to see if it the slot is free right now.
@@ -1484,17 +1514,13 @@ LWLockConflictsWithVar(LWLock *lock,
 	*result = false;
 
 	/*
-	 * Read value using spinlock as we can't rely on atomic 64 bit
-	 * reads/stores.  TODO: On platforms with a way to do atomic 64 bit
-	 * reads/writes the spinlock could be optimized away.
+	 * Read value using the lwlock's internal lock, as we can't generally rely
+	 * on atomic 64 bit reads/stores.  TODO: On platforms with a way to do
+	 * atomic 64 bit reads/writes the spinlock could be optimized away.
 	 */
-#ifdef LWLOCK_STATS
-	lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-	SpinLockAcquire(&lock->mutex);
-#endif
+	LWLockWaitListLock(lock);
 	value = *valptr;
-	SpinLockRelease(&lock->mutex);
+	LWLockWaitListUnlock(lock);
 
 	if (value != oldval)
 	{
@@ -1668,22 +1694,13 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
 {
 	dlist_head	wakeup;
 	dlist_mutable_iter iter;
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-
-	lwstats = get_lwlock_stats_entry(lock);
-#endif
 
 	PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);
 
 	dlist_init(&wakeup);
 
 	/* Acquire mutex.  Time spent holding mutex should be short! */
-#ifdef LWLOCK_STATS
-	lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-	SpinLockAcquire(&lock->mutex);
-#endif
+	LWLockWaitListLock(lock);
 
 	Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);
 
@@ -1706,7 +1723,7 @@ LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
 	}
 
 	/* We are done updating shared state of the lock itself. */
-	SpinLockRelease(&lock->mutex);
+	LWLockWaitListUnlock(lock);
 
 	/*
 	 * Awaken any waiters I removed from the queue.
@@ -1804,21 +1821,14 @@ LWLockRelease(LWLock *lock)
 void
 LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
 {
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-
-	lwstats = get_lwlock_stats_entry(lock);
-	lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
-#else
-	SpinLockAcquire(&lock->mutex);
-#endif
+	LWLockWaitListLock(lock);
 	/*
 	 * Set the variable's value before releasing the lock, that prevents race
 	 * a race condition wherein a new locker acquires the lock, but hasn't yet
 	 * set the variables value.
 	 */
 	*valptr = val;
-	SpinLockRelease(&lock->mutex);
+	LWLockWaitListUnlock(lock);
 
 	LWLockRelease(lock);
 }
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 5e6299a..f5b94e0 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -57,7 +57,6 @@ typedef struct LWLockTranche
  */
 typedef struct LWLock
 {
-	slock_t		mutex;			/* Protects LWLock and queue of PGPROCs */
 	uint16		tranche;		/* tranche ID */
 
 	pg_atomic_uint32 state;		/* state of exclusive/nonexclusive lockers */
-- 
2.7.0.229.g701fa7f.dirty

#102Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#90)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Mar 29, 2016 at 10:52 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

Hi, Andres!

Please, find next revision of patch in attachment.

Couple of minor comments:

+ * The following two macroses

is macroses right word to be used here?

+ * of this loop. It should be used as fullowing:

/fullowing/following

+ * For local buffers usage of these macros shouldn't be used.

isn't it better to write it as

For local buffers, these macros shouldn't be used.

static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);

-

Spurious line deletion.

+  * Since buffers are pinned/unpinned very frequently, this functions tries
+  * to pin buffer as cheap as possible.

/this functions tries

Which function are you referring to here? The comment seems slightly
unclear.

! if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))

Is there a reason that you have kept macros to read refcount and
usagecount, but not for flags?

Apart from this, I have verified that the patch compiles on Windows and
passes the regression tests (make check)!

Nice work!

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#103Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andres Freund (#100)
Re: Move PinBuffer and UnpinBuffer to atomics

Andres Freund <andres@anarazel.de> writes:

Oh. I confused my approaches. I was thinking about going for 2):

2) Replace the lwlock spinlock by a bit in LWLock->state. That'd avoid
embedding the spinlock, and actually might allow to avoid one atomic
op in a number of cases.

precisely because of that concern.

Oh, okay, ignore my comment just now in the other thread.

regards, tom lane


#104Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#102)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi!

On Thu, Mar 31, 2016 at 4:59 PM, Amit Kapila <amit.kapila16@gmail.com> wrote:

On Tue, Mar 29, 2016 at 10:52 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

Hi, Andres!

Please, find next revision of patch in attachment.

Couple of minor comments:

+ * The following two macroses

is macroses right word to be used here?

+ * of this loop. It should be used as fullowing:

/fullowing/following

+ * For local buffers usage of these macros shouldn't be used.

isn't it better to write it as

For local buffers, these macros shouldn't be used.

static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);

-

Spurious line deletion.

All of the above is fixed.

+ * Since buffers are pinned/unpinned very frequently, this functions tries

+ * to pin buffer as cheap as possible.

/this functions tries

Which function are you referring to here? The comment seems slightly
unclear.

I meant just PinBuffer() there. UnpinBuffer() has another comment in the
body. Fixed.

! if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))

Is there a reason that you have kept macros to read refcount and
usagecount, but not for flags?

We could change the flag handling to GET/SET macros as well. But I guess
such a change would make the review more complicated, because it would
touch some places which are otherwise unchanged. I think it could be done
in a separate refactoring patch.
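
For illustration, such accessors might look like this (hypothetical
names, building on the BUF_FLAG_MASK the patch already introduces; they
are not part of the patch itself):

    /* Hypothetical flag accessors, per the refactoring idea above. */
    #define BUF_STATE_GET_FLAGS(state)            ((state) & BUF_FLAG_MASK)
    #define BUF_STATE_SET_FLAGS(state, flags)     ((state) | (flags))
    #define BUF_STATE_CLEAR_FLAGS(state, flags)   ((state) & ~(flags))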

Apart from this, I have verified that the patch compiles on Windows and
passes the regression tests (make check)!

Thank you! I didn't manage to try this on Windows.

Nice work!

Thank you!

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-8.patchapplication/octet-stream; name=pinunpin-cas-8.patchDownload
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..b583049
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,180 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr);
  		}
  
  		/*
--- 161,181 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr, state);
  		}
  
  		/*
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index bfa37f1..a5cffc7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 135,146 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 135,143 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..c954fbc
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
*************** ForgetPrivateRefCountEntry(PrivateRefCou
*** 427,432 ****
--- 427,464 ----
  	(GetPrivateRefCount(bufnum) > 0) \
  )
  
+ /*
+  * The following two macros are aimed at simplifying buffer state modification
+  * in a CAS loop.  It's assumed that a variable "uint32 state" is defined
+  * outside of this loop.  They should be used as follows:
+  *
+  * BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
+  * modifications of state variable;
+  * END_BUFSTATE_CAS_LOOP(bufHdr);
+  *
+  * For local buffers, these macros shouldn't be used.  Since there is
+  * no concurrency, local buffer state can be changed directly by atomic
+  * read/write operations.
+  */
+ #define BEGIN_BUFSTATE_CAS_LOOP(bufHdr) \
+ 	do { \
+ 		SpinDelayStatus	delayStatus = init_spin_delay((Pointer)(bufHdr)); \
+ 		uint32			oldstate; \
+ 		oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 		for (;;) { \
+ 			while (oldstate & BM_LOCKED) \
+ 			{ \
+ 				make_spin_delay(&delayStatus); \
+ 				oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 			} \
+ 			state = oldstate
+ 
+ #define END_BUFSTATE_CAS_LOOP(bufHdr) \
+ 			if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state)) \
+ 				break; \
+ 		} \
+ 		finish_spin_delay(&delayStatus); \
+ 	} while (0)
  
  static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
  				  ForkNumber forkNum, BlockNumber blockNum,
*************** static int	SyncOneBuffer(int buf_id, boo
*** 440,446 ****
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  int set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 472,478 ----
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  uint32 set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 816,823 ****
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 848,857 ----
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 			Assert(state & BM_VALID);
! 			state &= ~BM_VALID;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  			 */
  			do
  			{
! 				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
--- 862,871 ----
  			 */
  			do
  			{
! 				uint32 state = LockBufHdr(bufHdr);
! 				Assert(state & BM_VALID);
! 				state &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr, state);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 848,854 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 882,888 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 933,939 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 967,975 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 		state |= BM_VALID;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 987,996 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
--- 1023,1033 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
+ 	uint32		state;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1059,1070 ****
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
--- 1096,1107 ----
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1106,1116 ****
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
  
  					/* Read the LSN while holding buffer header lock */
! 					LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
--- 1143,1154 ----
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
+ 					uint32		state;
  
  					/* Read the LSN while holding buffer header lock */
! 					state = LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf, state);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1254,1260 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1292,1298 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1262,1272 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
--- 1300,1310 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf, state);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1284,1297 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
! 	UnlockBufHdr(buf);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
--- 1322,1336 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 			   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
! 			   BUF_USAGECOUNT_MASK);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
  	else
! 		state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  
! 	UnlockBufHdr(buf, state);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
*************** InvalidateBuffer(BufferDesc *buf)
*** 1338,1349 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
--- 1377,1391 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
*************** retry:
*** 1362,1373 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
--- 1404,1415 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
*************** retry:
*** 1381,1389 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
--- 1423,1431 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
*************** retry:
*** 1396,1407 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
! 
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
--- 1438,1447 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
*************** retry:
*** 1432,1438 ****
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
--- 1472,1481 ----
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
! 	uint32		state;
! 	bool		wasDirty;
! 
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1443,1472 ****
  		return;
  	}
  
- 	bufHdr = GetBufferDescriptor(buffer - 1);
- 
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
- 
- 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- 
- 	UnlockBufHdr(bufHdr);
  }
  
  /*
--- 1486,1515 ----
  		return;
  	}
  
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 	wasDirty = (state & BM_DIRTY) ? true : false;
! 	state |= BM_DIRTY | BM_JUST_DIRTIED;
! 	if (state == oldstate)
! 		break;
! 
! 	END_BUFSTATE_CAS_LOOP(bufHdr);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!wasDirty)
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  }
  
  /*
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1531,1536 ****
--- 1574,1584 ----
   *
   * This should be applied only to shared buffers, never local ones.
   *
+  * Since buffers are pinned/unpinned very frequently, this function tries
+  * to pin the buffer as cheaply as possible.  This is why we don't take the
+  * buffer header lock here, but instead update the state variable in a loop
+  * of CAS operations.  Hopefully, it will be just a single CAS.
+  *
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1547,1569 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
! 		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
! 		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1595,1617 ----
  
  	if (ref == NULL)
  	{
+ 		/* loop of CAS operations */
+ 		uint32			state;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 
! 		/* increase refcount */
! 		state += BUF_REFCOUNT_ONE;
! 
! 		/* increase usagecount unless already max */
! 		if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 			state += BUF_USAGECOUNT_ONE;
! 
! 		END_BUFSTATE_CAS_LOOP(buf);
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1603,1608 ****
--- 1651,1657 ----
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
+ 	uint32		state;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1610,1617 ****
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
! 	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
  
--- 1659,1672 ----
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	/*
! 	 * Since we assume the buffer spinlock is held, we can update the buffer
! 	 * state in a single write operation.
! 	 */
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	state += BUF_REFCOUNT_ONE;
! 	UnlockBufHdr(buf, state);
  
  	b = BufferDescriptorGetBuffer(buf);
  
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1646,1675 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
  
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1701,1741 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32			state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Since the buffer spinlock holder can update the state with a plain
! 		 * write, it's not safe to use an atomic decrement here.  Instead we
! 		 * run a loop of CAS operations, as PinBuffer does.
! 		 */
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 		state -= BUF_REFCOUNT_ONE;
! 		END_BUFSTATE_CAS_LOOP(buf);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				state &= ~BM_PIN_COUNT_WAITER;
+ 				UnlockBufHdr(buf, state);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf, state);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1687,1692 ****
--- 1753,1759 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_spaces;
*************** BufferSync(int flags)
*** 1736,1748 ****
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
--- 1803,1815 ----
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			state |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
*************** BufferSync(int flags)
*** 1752,1758 ****
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr);
  	}
  
  	if (num_to_scan == 0)
--- 1819,1825 ----
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr, state);
  	}
  
  	if (num_to_scan == 0)
*************** BufferSync(int flags)
*** 1888,1894 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
--- 1955,1961 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2258,2263 ****
--- 2325,2331 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  	BufferTag	tag;
  
  	ReservePrivateRefCountEntry();
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2271,2291 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
--- 2339,2362 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2439,2444 ****
--- 2510,2516 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2456,2467 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2528,2540 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2573,2578 ****
--- 2646,2652 ----
  				io_time;
  	Block		bufBlock;
  	char	   *bufToWrite;
+ 	uint32		state;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2598,2604 ****
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
--- 2672,2678 ----
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	state = LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2607,2614 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
--- 2681,2688 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	state &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2627,2633 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2701,2707 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (state & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2716,2727 ****
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header flags while we're doing this, we assume that
! 	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
! 	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2790,2801 ----
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header state while we're doing this, changing of
! 	 * state is atomic, so we'll read the old value or the new value, but not
! 	 * random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2736,2741 ****
--- 2810,2816 ----
  	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
  	char	   *page = BufferGetPage(buffer);
  	XLogRecPtr	lsn;
+ 	uint32		state;
  
  	/*
  	 * If we don't need locking for correctness, fastpath out.
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2747,2755 ****
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr);
  
  	return lsn;
  }
--- 2822,2830 ----
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	state = LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr, state);
  
  	return lsn;
  }
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2797,2802 ****
--- 2872,2878 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * We can make this a tad faster by prechecking the buffer tag before
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2817,2829 ****
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 2893,2905 ----
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2887,2892 ****
--- 2963,2969 ----
  	{
  		RelFileNode *rnode = NULL;
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2917,2927 ****
  		if (rnode == NULL)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  
  	pfree(nodes);
--- 2994,3004 ----
  		if (rnode == NULL)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  
  	pfree(nodes);
*************** DropDatabaseBuffers(Oid dbid)
*** 2951,2956 ****
--- 3028,3034 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropDatabaseBuffers(Oid dbid)
*** 2959,2969 ****
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3037,3047 ----
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushRelationBuffers(Relation rel)
*** 3055,3063 ****
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 3133,3144 ----
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
+ 			uint32	state;
+ 
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				((state = pg_atomic_read_u32(&bufHdr->state)) &
! 					(BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 3078,3084 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 3159,3166 ----
  						  localpage,
  						  false);
  
! 				state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
! 				pg_atomic_write_u32(&bufHdr->state, state);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 3093,3098 ****
--- 3175,3182 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32 state;
+ 
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushRelationBuffers(Relation rel)
*** 3104,3112 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3188,3196 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(state & (BM_VALID | BM_DIRTY))	== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushRelationBuffers(Relation rel)
*** 3115,3121 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3199,3205 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushDatabaseBuffers(Oid dbid)
*** 3145,3150 ****
--- 3229,3235 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32	state;
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushDatabaseBuffers(Oid dbid)
*** 3156,3164 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3241,3249 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 3167,3173 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3252,3258 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3297,3308 ****
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3382,3394 ----
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3313,3319 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3399,3405 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3352,3360 ****
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3438,3448 ----
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3374,3381 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
--- 3462,3470 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		state |= BM_DIRTY | BM_JUST_DIRTIED;
! 		UnlockBufHdr(bufHdr, state);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
*************** UnlockBuffers(void)
*** 3406,3422 ****
  
  	if (buf)
  	{
! 		LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf);
  
  		PinCountWaitBuf = NULL;
  	}
--- 3495,3513 ----
  
  	if (buf)
  	{
! 		uint32	state;
! 
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf, state);
  
  		PinCountWaitBuf = NULL;
  	}
*************** LockBufferForCleanup(Buffer buffer)
*** 3509,3535 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
- 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
! 		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
--- 3600,3629 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr, state);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr, state);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
  		PinCountWaitBuf = bufHdr;
! 		state |= BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
*************** LockBufferForCleanup(Buffer buffer)
*** 3558,3568 ****
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
--- 3652,3662 ----
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		state = LockBufHdr(bufHdr);
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
*************** bool
*** 3603,3624 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3697,3722 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3626,3642 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
--- 3724,3742 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr, state);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr, state);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
*************** WaitIO(BufferDesc *buf)
*** 3666,3682 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
! 		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3766,3782 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
! 		UnlockBufHdr(buf, state);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3704,3709 ****
--- 3804,3811 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3714,3722 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3816,3824 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3725,3748 ****
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
! 
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
--- 3827,3849 ----
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	state |= BM_IO_IN_PROGRESS;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3768,3786 ****
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = NULL;
  
--- 3869,3890 ----
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
  
! 	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(state & BM_JUST_DIRTIED))
! 		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 
! 	state |= set_flag_bits;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = NULL;
  
*************** AbortBufferIO(void)
*** 3803,3808 ****
--- 3907,3913 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3811,3834 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
! 			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
! 			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
--- 3916,3937 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
! 			UnlockBufHdr(buf, state);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
! 			UnlockBufHdr(buf, state);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
*************** rnode_comparator(const void *p1, const v
*** 3912,3917 ****
--- 4015,4035 ----
  }
  
  /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(BufferDesc *desc)
+ {
+ 	uint32			state;
+ 
+ 	BEGIN_BUFSTATE_CAS_LOOP(desc);
+ 	state |= BM_LOCKED;
+ 	END_BUFSTATE_CAS_LOOP(desc);
+ 
+ 	return state;
+ }
+ 
+ /*
   * BufferTag comparator.
   */
  static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..5ee8ff3
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,292 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
! 			UnlockBufHdr(buf);
  
  		}
  	}
--- 282,297 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
! 			UnlockBufHdr(buf, state);
  
  		}
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 327,336 ****
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			UnlockBufHdr(buf);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		UnlockBufHdr(buf);
  	}
  }
  
--- 336,347 ----
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			state &= ~BM_LOCKED;
! 			pg_atomic_write_u32(&buf->state, state);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  	}
  }
  
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 596,606 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,628 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
--- 628,642 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index cc0bf5e..97ad67d
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
  	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
! 
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,139 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! make_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
  	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 144,167 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus = init_spin_delay((Pointer)lock);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		make_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index d04363b..e6099b5
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 21,49 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 21,69 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable where the following data is combined.
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such a layout allows us to perform some operations more efficiently.
+  * The definition of buffer state parts is below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 113,130 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 133,161 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
!  * the tag, state or wait_backend_pid fields.  In general, buffer header lock
!  * is a spinlock that is combined with flags, refcount and usagecount into a
!  * single atomic variable.  This layout allows us to do some operations in a
!  * single CAS without actually acquiring and releasing a spinlock; for
!  * instance, increasing or decreasing the refcount.  The buf_id field never changes after
!  * initialization, so does not need locking.  freeNext is protected by the
!  * buffer_strategy_lock not buffer header lock.  The LWLock can take care of
!  * itself.  The buffer header lock is *not* used to control access to the data
!  * in the buffer!
!  *
!  * It's assumed that nobody changes the state field while the buffer header
!  * lock is held.  Thanks to that, the lock holder can apply complex updates
!  * to the state variable in a single write, simultaneously with the lock
!  * release (clearing the BM_LOCKED flag).  Updating the state without holding
!  * the buffer header lock is restricted to CAS, which ensures that BM_LOCKED
!  * is not set; atomic increment/decrement, OR/AND etc. are not allowed.
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 142,153 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
--- 173,183 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 202,212 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /*
--- 232,246 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers!
   */
! extern uint32 LockBufHdr(BufferDesc *desc);
! #define UnlockBufHdr(desc, s)	\
! 	do {	\
! 		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
! 		pg_write_barrier(); \
! 	} while (0)
  
  
  /*
*************** extern void IssuePendingWritebacks(Write
*** 267,273 ****
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 301,308 ----
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 8b240cd..2f96740
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1013 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay which could be useful in other places where
+  * spinlock-like procedures take place.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ #define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+ 
+ void make_spin_delay(SpinDelayStatus *status);
+ void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
#105Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#104)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

+/*
+ * The following two macros are aimed at simplifying buffer state modification
+ * in a CAS loop.  It's assumed that a variable "uint32 state" is defined
+ * outside of this loop.  They should be used as follows:
+ *
+ * BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
+ * modifications of state variable;
+ * END_BUFSTATE_CAS_LOOP(bufHdr);
+ *
+ * For local buffers, these macros shouldn't be used.  Since there is
+ * no concurrency, local buffer state can be changed directly by atomic
+ * read/write operations.
+ */
+#define BEGIN_BUFSTATE_CAS_LOOP(bufHdr) \
+	do { \
+		SpinDelayStatus	delayStatus = init_spin_delay((Pointer)(bufHdr)); \
+		uint32			oldstate; \
+		oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+		for (;;) { \
+			while (oldstate & BM_LOCKED) \
+			{ \
+				make_spin_delay(&delayStatus); \
+				oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+			} \
+			state = oldstate
+
+#define END_BUFSTATE_CAS_LOOP(bufHdr) \
+			if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state)) \
+				break; \
+		} \
+		finish_spin_delay(&delayStatus); \
+	} while (0)

Hm. Not sure if that's not too much magic. Will think about it.
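
Spelled out, a call site of the pair would look roughly like the following,
a hypothetical pin operation for illustration (assuming the BUF_* constants
from the patch; not code taken from it):

static void
pin_buffer_sketch(BufferDesc *bufHdr)
{
	uint32		state;	/* the macros require this exact declaration */

	BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
	state += BUF_REFCOUNT_ONE;
	if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
		state += BUF_USAGECOUNT_ONE;
	END_BUFSTATE_CAS_LOOP(bufHdr);
}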

/*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(BufferDesc *desc)
+ {
+ 	uint32			state;
+ 
+ 	BEGIN_BUFSTATE_CAS_LOOP(desc);
+ 	state |= BM_LOCKED;
+ 	END_BUFSTATE_CAS_LOOP(desc);
+ 
+ 	return state;
+ }
+ 

Hm. It seems a bit over the top to do the full round here. How about

uint32 oldstate;
while (true)
{
	oldstate = pg_atomic_fetch_or(..., BM_LOCKED);
	if (!(oldstate & BM_LOCKED))
		break;
	perform_spin_delay();
}
return oldstate | BM_LOCKED;

Especially if we implement atomic xor on x86, that'd likely be a good
bit more efficient.
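
Whichever way the lock bit gets set, the caller-side pattern under the new
API stays the same: take the lock, edit the returned state word, publish it
on release. A condensed sketch of the MarkBufferDirtyHint shape from the
patch:

static bool
mark_dirty_sketch(BufferDesc *bufHdr)
{
	bool		dirtied;
	uint32		state;

	state = LockBufHdr(bufHdr);		/* returns the state word, BM_LOCKED set */
	dirtied = !(state & BM_DIRTY);	/* will this action dirty the page? */
	state |= BM_DIRTY | BM_JUST_DIRTIED;
	UnlockBufHdr(bufHdr, state);	/* a single write that also clears BM_LOCKED */
	return dirtied;
}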

typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
-	BufFlags	flags;			/* see bit definitions above */
-	uint8		usage_count;	/* usage counter for clock sweep code */
-	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
-	int			wait_backend_pid;	/* backend PID of pin-count waiter */

+	/* state of the tag, containing flags, refcount and usagecount */
+	pg_atomic_uint32 state;
+
+	int			wait_backend_pid;		/* backend PID of pin-count waiter */
	int			buf_id;			/* buffer's index number (from 0) */
	int			freeNext;		/* link in freelist chain */

Hm. Won't matter on most platforms, but it seems like a good idea to
move to an 8 byte aligned boundary. Move buf_id up?

+/*
+ * Support for spin delay which could be useful in other places where
+ * spinlock-like procedures take place.
+ */
+typedef struct
+{
+	int			spins;
+	int			delays;
+	int			cur_delay;
+	Pointer		ptr;
+	const char *file;
+	int			line;
+} SpinDelayStatus;
+
+#define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+
+void make_spin_delay(SpinDelayStatus *status);
+void finish_spin_delay(SpinDelayStatus *status);
+
#endif	 /* S_LOCK_H */

s/make_spin_delay/perform_spin_delay/? The former sounds like it's
allocating something or such.

Regards,

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#106Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#105)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Mar 31, 2016 at 7:14 PM, Andres Freund <andres@anarazel.de> wrote:

+/*
+ * The following two macros are aimed at simplifying buffer state modification
+ * in a CAS loop.  It's assumed that a variable "uint32 state" is defined
+ * outside of this loop.  They should be used as follows:
+ *
+ * BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
+ * modifications of state variable;
+ * END_BUFSTATE_CAS_LOOP(bufHdr);
+ *
+ * For local buffers, these macros shouldn't be used.  Since there is
+ * no concurrency, local buffer state can be changed directly by atomic
+ * read/write operations.
+ */
+#define BEGIN_BUFSTATE_CAS_LOOP(bufHdr) \
+     do { \
+             SpinDelayStatus delayStatus = init_spin_delay((Pointer)(bufHdr)); \
+             uint32                  oldstate; \
+             oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+             for (;;) { \
+                     while (oldstate & BM_LOCKED) \
+                     { \
+                             make_spin_delay(&delayStatus); \
+                             oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+                     } \
+                     state = oldstate
+
+#define END_BUFSTATE_CAS_LOOP(bufHdr) \
+                     if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state)) \
+                             break; \
+             } \
+             finish_spin_delay(&delayStatus); \
+     } while (0)

Hm. Not sure if that's not too much magic. Will think about it.

I'm not sure either...
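
For a concrete sense of what the macros hide, UnpinBuffer's refcount
decrement in the patch is written as just:

	uint32		state;

	BEGIN_BUFSTATE_CAS_LOOP(buf);
	state -= BUF_REFCOUNT_ONE;
	END_BUFSTATE_CAS_LOOP(buf);

which the preprocessor expands to roughly the following (a sketch, using the
renamed delay helpers from the v9 patch below; the real expansion is a single
statement):

	do {
		SpinDelayStatus delayStatus = init_spin_delay((Pointer) (buf));
		uint32		oldstate;

		oldstate = pg_atomic_read_u32(&buf->state);
		for (;;) {
			/* wait until the header lock bit is clear */
			while (oldstate & BM_LOCKED)
			{
				perform_spin_delay(&delayStatus);
				oldstate = pg_atomic_read_u32(&buf->state);
			}
			state = oldstate;
			state -= BUF_REFCOUNT_ONE;	/* the caller's modification */
			/* on failure, oldstate is refreshed and we retry */
			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
				break;
		}
		finish_spin_delay(&delayStatus);
	} while (0);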

/*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(BufferDesc *desc)
+ {
+     uint32                  state;
+
+     BEGIN_BUFSTATE_CAS_LOOP(desc);
+     state |= BM_LOCKED;
+     END_BUFSTATE_CAS_LOOP(desc);
+
+     return state;
+ }
+

Hm. It seems a bit over the top to do the full round here. How about

uint32 oldstate;
while (true)
{
	oldstate = pg_atomic_fetch_or(..., BM_LOCKED);
	if (!(oldstate & BM_LOCKED))
		break;
	perform_spin_delay();
}
return oldstate | BM_LOCKED;

Especially if we implement atomic xor on x86, that'd likely be a good
bit more efficient.

Nice idea. Done.

typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
-	BufFlags	flags;			/* see bit definitions above */
-	uint8		usage_count;	/* usage counter for clock sweep code */
-	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
-	int			wait_backend_pid;	/* backend PID of pin-count waiter */

+	/* state of the tag, containing flags, refcount and usagecount */
+	pg_atomic_uint32 state;
+
+	int			wait_backend_pid;		/* backend PID of pin-count waiter */
	int			buf_id;			/* buffer's index number (from 0) */
	int			freeNext;		/* link in freelist chain */

Hm. Won't matter on most platforms, but it seems like a good idea to
move to an 8 byte aligned boundary. Move buf_id up?

I think so. Done.

+/*
+ * Support for spin delay which could be useful in other places where
+ * spinlock-like procedures take place.
+ */
+typedef struct
+{
+     int                     spins;
+     int                     delays;
+     int                     cur_delay;
+     Pointer         ptr;
+     const char *file;
+     int                     line;
+} SpinDelayStatus;
+
+#define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+
+void make_spin_delay(SpinDelayStatus *status);
+void finish_spin_delay(SpinDelayStatus *status);
+
#endif        /* S_LOCK_H */

s/make_spin_delay/perform_spin_delay/? The former sounds like it's
allocating something or such.

Done.

I think these changes are worth running the benchmark again. I'm going to run
it on the 4x18 Intel server.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

pinunpin-cas-9.patchapplication/octet-stream; name=pinunpin-cas-9.patchDownload
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..b583049
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,180 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr);
  		}
  
  		/*
--- 161,181 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr, state);
  		}
  
  		/*
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index bfa37f1..a5cffc7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 135,146 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 135,143 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..4c85419
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
*************** ForgetPrivateRefCountEntry(PrivateRefCou
*** 427,432 ****
--- 427,464 ----
  	(GetPrivateRefCount(bufnum) > 0) \
  )
  
+ /*
+  * The following two macros are intended to simplify buffer state modification
+  * in a CAS loop.  It's assumed that variable "uint32 state" is defined outside
+  * of this loop.  It should be used as follows:
+  *
+  * BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
+  * modifications of state variable;
+  * END_BUFSTATE_CAS_LOOP(bufHdr);
+  *
+  * For local buffers, these macros shouldn't be used.  Since there is
+  * no concurrency, local buffer state can be changed directly by atomic
+  * read/write operations.
+  */
+ #define BEGIN_BUFSTATE_CAS_LOOP(bufHdr) \
+ 	do { \
+ 		SpinDelayStatus	delayStatus = init_spin_delay((Pointer)(bufHdr)); \
+ 		uint32			oldstate; \
+ 		oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 		for (;;) { \
+ 			while (oldstate & BM_LOCKED) \
+ 			{ \
+ 				perform_spin_delay(&delayStatus); \
+ 				oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 			} \
+ 			state = oldstate
+ 
+ #define END_BUFSTATE_CAS_LOOP(bufHdr) \
+ 			if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state)) \
+ 				break; \
+ 		} \
+ 		finish_spin_delay(&delayStatus); \
+ 	} while (0)
  
  static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
  				  ForkNumber forkNum, BlockNumber blockNum,
*************** static int	SyncOneBuffer(int buf_id, boo
*** 440,446 ****
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  int set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 472,478 ----
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  uint32 set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 816,823 ****
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 848,857 ----
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 			Assert(state & BM_VALID);
! 			state &= ~BM_VALID;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  			 */
  			do
  			{
! 				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
--- 862,871 ----
  			 */
  			do
  			{
! 				uint32 state = LockBufHdr(bufHdr);
! 				Assert(state & BM_VALID);
! 				state &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr, state);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 848,854 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 882,888 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 933,939 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 967,975 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 		state |= BM_VALID;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 987,996 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
--- 1023,1033 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
+ 	uint32		state;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1059,1070 ****
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
--- 1096,1107 ----
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1106,1116 ****
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
  
  					/* Read the LSN while holding buffer header lock */
! 					LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
--- 1143,1154 ----
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
+ 					uint32		state;
  
  					/* Read the LSN while holding buffer header lock */
! 					state = LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf, state);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1254,1260 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1292,1298 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1262,1272 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
--- 1300,1310 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf, state);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1284,1297 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
! 	UnlockBufHdr(buf);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
--- 1322,1336 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 			   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
! 			   BUF_USAGECOUNT_MASK);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
  	else
! 		state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  
! 	UnlockBufHdr(buf, state);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
*************** InvalidateBuffer(BufferDesc *buf)
*** 1338,1349 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
--- 1377,1391 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
*************** retry:
*** 1362,1373 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
--- 1404,1415 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
*************** retry:
*** 1381,1389 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
--- 1423,1431 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
*************** retry:
*** 1396,1407 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
! 
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
--- 1438,1447 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
*************** retry:
*** 1432,1438 ****
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
--- 1472,1481 ----
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
! 	uint32		state;
! 	bool		wasDirty;
! 
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1443,1472 ****
  		return;
  	}
  
- 	bufHdr = GetBufferDescriptor(buffer - 1);
- 
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
- 
- 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- 
- 	UnlockBufHdr(bufHdr);
  }
  
  /*
--- 1486,1515 ----
  		return;
  	}
  
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 	wasDirty = (state & BM_DIRTY) ? true : false;
! 	state |= BM_DIRTY | BM_JUST_DIRTIED;
! 	if (state == oldstate)
! 		break;
! 
! 	END_BUFSTATE_CAS_LOOP(bufHdr);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!wasDirty)
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  }
  
  /*
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1531,1536 ****
--- 1574,1584 ----
   *
   * This should be applied only to shared buffers, never local ones.
   *
+  * Since buffers are pinned/unpinned very frequently, this function tries
+  * to pin the buffer as cheaply as possible.  This is why we don't take the
+  * buffer header lock here, but update the state variable in a loop of CAS
+  * operations.  Hopefully, it would be just a single CAS.
+  *
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1547,1569 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
! 		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
! 		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1595,1617 ----
  
  	if (ref == NULL)
  	{
+ 		/* loop of CAS operations */
+ 		uint32			state;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 
! 		/* increase refcount */
! 		state += BUF_REFCOUNT_ONE;
! 
! 		/* increase usagecount unless already max */
! 		if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 			state += BUF_USAGECOUNT_ONE;
! 
! 		END_BUFSTATE_CAS_LOOP(buf);
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1603,1608 ****
--- 1651,1657 ----
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
+ 	uint32		state;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1610,1617 ****
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
! 	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
  
--- 1659,1672 ----
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	/*
! 	 * Since we assume the buffer spinlock is held, we can update the buffer
! 	 * state in a single write operation.
! 	 */
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	state += BUF_REFCOUNT_ONE;
! 	UnlockBufHdr(buf, state);
  
  	b = BufferDescriptorGetBuffer(buf);
  
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1646,1675 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
  
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1701,1741 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32			state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Since the buffer spinlock holder can update the state with a plain
! 		 * write, it's not safe to use an atomic decrement here.  We do a loop
! 		 * of CAS operations, as PinBuffer does.
! 		 */
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 		state -= BUF_REFCOUNT_ONE;
! 		END_BUFSTATE_CAS_LOOP(buf);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				state &= ~BM_PIN_COUNT_WAITER;
+ 				UnlockBufHdr(buf, state);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf, state);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1687,1692 ****
--- 1753,1759 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_spaces;
*************** BufferSync(int flags)
*** 1736,1748 ****
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
--- 1803,1815 ----
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			state |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
*************** BufferSync(int flags)
*** 1752,1758 ****
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr);
  	}
  
  	if (num_to_scan == 0)
--- 1819,1825 ----
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr, state);
  	}
  
  	if (num_to_scan == 0)
*************** BufferSync(int flags)
*** 1888,1894 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
--- 1955,1961 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2258,2263 ****
--- 2325,2331 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  	BufferTag	tag;
  
  	ReservePrivateRefCountEntry();
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2271,2291 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
--- 2339,2362 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2439,2444 ****
--- 2510,2516 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2456,2467 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2528,2540 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2573,2578 ****
--- 2646,2652 ----
  				io_time;
  	Block		bufBlock;
  	char	   *bufToWrite;
+ 	uint32		state;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2598,2604 ****
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
--- 2672,2678 ----
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	state = LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2607,2614 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
--- 2681,2688 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	state &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2627,2633 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2701,2707 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (state & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2716,2727 ****
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header flags while we're doing this, we assume that
! 	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
! 	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2790,2801 ----
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header state while we're doing this, the state
! 	 * changes atomically, so we'll read the old value or the new value, but
! 	 * not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2736,2741 ****
--- 2810,2816 ----
  	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
  	char	   *page = BufferGetPage(buffer);
  	XLogRecPtr	lsn;
+ 	uint32		state;
  
  	/*
  	 * If we don't need locking for correctness, fastpath out.
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2747,2755 ****
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr);
  
  	return lsn;
  }
--- 2822,2830 ----
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	state = LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr, state);
  
  	return lsn;
  }
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2797,2802 ****
--- 2872,2878 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * We can make this a tad faster by prechecking the buffer tag before
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2817,2829 ****
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 2893,2905 ----
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2887,2892 ****
--- 2963,2969 ----
  	{
  		RelFileNode *rnode = NULL;
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2917,2927 ****
  		if (rnode == NULL)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  
  	pfree(nodes);
--- 2994,3004 ----
  		if (rnode == NULL)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  
  	pfree(nodes);
*************** DropDatabaseBuffers(Oid dbid)
*** 2951,2956 ****
--- 3028,3034 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropDatabaseBuffers(Oid dbid)
*** 2959,2969 ****
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3037,3047 ----
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushRelationBuffers(Relation rel)
*** 3055,3063 ****
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 3133,3144 ----
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
+ 			uint32	state;
+ 
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				((state = pg_atomic_read_u32(&bufHdr->state)) &
! 					(BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 3078,3084 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 3159,3166 ----
  						  localpage,
  						  false);
  
! 				state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
! 				pg_atomic_write_u32(&bufHdr->state, state);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 3093,3098 ****
--- 3175,3182 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32 state;
+ 
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushRelationBuffers(Relation rel)
*** 3104,3112 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3188,3196 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(state & (BM_VALID | BM_DIRTY))	== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushRelationBuffers(Relation rel)
*** 3115,3121 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3199,3205 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushDatabaseBuffers(Oid dbid)
*** 3145,3150 ****
--- 3229,3235 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32	state;
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushDatabaseBuffers(Oid dbid)
*** 3156,3164 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3241,3249 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 3167,3173 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3252,3258 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3297,3308 ****
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3382,3394 ----
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3313,3319 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3399,3405 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3352,3360 ****
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3438,3448 ----
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3374,3381 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
--- 3462,3470 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		state |= BM_DIRTY | BM_JUST_DIRTIED;
! 		UnlockBufHdr(bufHdr, state);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
*************** UnlockBuffers(void)
*** 3406,3422 ****
  
  	if (buf)
  	{
! 		LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf);
  
  		PinCountWaitBuf = NULL;
  	}
--- 3495,3513 ----
  
  	if (buf)
  	{
! 		uint32	state;
! 
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf, state);
  
  		PinCountWaitBuf = NULL;
  	}
*************** LockBufferForCleanup(Buffer buffer)
*** 3509,3535 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
- 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
! 		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
--- 3600,3629 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr, state);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr, state);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
  		PinCountWaitBuf = bufHdr;
! 		state |= BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
*************** LockBufferForCleanup(Buffer buffer)
*** 3558,3568 ****
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
--- 3652,3662 ----
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		state = LockBufHdr(bufHdr);
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
*************** bool
*** 3603,3624 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3697,3722 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3626,3642 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
--- 3724,3742 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr, state);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr, state);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
*************** WaitIO(BufferDesc *buf)
*** 3666,3682 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
! 		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3766,3782 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
! 		UnlockBufHdr(buf, state);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3704,3709 ****
--- 3804,3811 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3714,3722 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3816,3824 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3725,3748 ****
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
! 
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
--- 3827,3849 ----
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	state |= BM_IO_IN_PROGRESS;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3768,3786 ****
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = NULL;
  
--- 3869,3890 ----
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
  
! 	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(state & BM_JUST_DIRTIED))
! 		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 
! 	state |= set_flag_bits;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = NULL;
  
*************** AbortBufferIO(void)
*** 3803,3808 ****
--- 3907,3913 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3811,3834 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
! 			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
! 			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
--- 3916,3937 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
! 			UnlockBufHdr(buf, state);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
! 			UnlockBufHdr(buf, state);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
*************** rnode_comparator(const void *p1, const v
*** 3912,3917 ****
--- 4015,4041 ----
  }
  
  /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(BufferDesc *desc)
+ {
+ 	SpinDelayStatus	delayStatus = init_spin_delay((Pointer)(desc));
+ 	uint32			oldstate;
+ 	while (true)
+ 	{
+ 		/* set BM_LOCKED flag */
+ 		oldstate = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
+ 		/* if it wasn't set before we're OK */
+ 		if (!(oldstate & BM_LOCKED))
+ 			break;
+ 		perform_spin_delay(&delayStatus);
+ 	}
+ 	finish_spin_delay(&delayStatus);
+ 	return oldstate | BM_LOCKED;
+ }
+ 
+ /*
   * BufferTag comparator.
   */
  static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..5ee8ff3
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,292 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
! 			UnlockBufHdr(buf);
  
  		}
  	}
--- 282,297 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
! 			UnlockBufHdr(buf, state);
  
  		}
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 327,336 ****
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			UnlockBufHdr(buf);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		UnlockBufHdr(buf);
  	}
  }
  
--- 336,347 ----
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			state &= ~BM_LOCKED;
! 			pg_atomic_write_u32(&buf->state, state);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  	}
  }
  
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 596,606 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,628 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
--- 628,642 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index cc0bf5e..5ebd4dc
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
  	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
! 
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,139 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! perform_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
  	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 144,167 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus = init_spin_delay((Pointer)lock);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		perform_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index d04363b..e967b84
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 21,49 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 21,69 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable where the following data is combined:
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Such a layout allows us to perform some operations more efficiently.
+  * The definitions of the buffer state parts are below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 113,130 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 133,161 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
!  * the tag, state or wait_backend_pid fields.  In general, buffer header lock
!  * is a spinlock combined with the flags, refcount and usagecount into a
!  * single atomic variable.  This layout allows us to do some operations in a
!  * single CAS without actually acquiring and releasing the spinlock; for instance,
!  * increase or decrease refcount.  buf_id field never changes after
!  * initialization, so does not need locking.  freeNext is protected by the
!  * buffer_strategy_lock not buffer header lock.  The LWLock can take care of
!  * itself.  The buffer header lock is *not* used to control access to the data
!  * in the buffer!
!  *
!  * It's assumed that nobody changes the state field while the buffer header
!  * lock is held.  Thanks to that, the lock holder can do complex updates
!  * of the state variable in a single write, simultaneously with the lock
!  * release (clearing the BM_LOCKED flag).  Updating the state without
!  * holding the buffer header lock is restricted to CAS, which ensures the
!  * BM_LOCKED flag is not set; atomic increment/decrement, OR/AND etc. are not allowed.
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 142,154 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
- 
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
  	LWLock		content_lock;	/* to lock access to buffer contents */
--- 173,184 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  	int			buf_id;			/* buffer's index number (from 0) */
+ 
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			freeNext;		/* link in freelist chain */
  
  	LWLock		content_lock;	/* to lock access to buffer contents */
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 202,212 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /*
--- 232,246 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers!
   */
! extern uint32 LockBufHdr(BufferDesc *desc);
! #define UnlockBufHdr(desc, s)	\
! 	do {	\
! 		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
! 		pg_write_barrier(); \
! 	} while (0)
  
  
  /*
*************** extern void IssuePendingWritebacks(Write
*** 267,273 ****
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 301,308 ----
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 8b240cd..fd49abf
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1013 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay which could be useful in other places where
+  * spinlock-like procedures take place.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ #define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+ 
+ void perform_spin_delay(SpinDelayStatus *status);
+ void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
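
To make the packed buffer-state layout above concrete, here is a minimal
standalone sketch of pinning a buffer with a single CAS under that scheme.
It uses plain C11 atomics rather than the pg_atomic_* API, and
pin_buffer_sketch() is an illustration only, not the patch's actual
PinBuffer():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* mirrors buf_internals.h above: the low 18 bits hold the refcount */
#define BM_LOCKED         (1U << 22)
#define BUF_REFCOUNT_ONE  1U

static bool
pin_buffer_sketch(_Atomic uint32_t *state)
{
	uint32_t	old = atomic_load(state);

	for (;;)
	{
		/* while the header spinlock (BM_LOCKED) is held, we may not CAS */
		if (old & BM_LOCKED)
			return false;		/* a real caller would spin-delay and retry */

		/* bump the refcount in the low bits with one CAS */
		if (atomic_compare_exchange_weak(state, &old, old + BUF_REFCOUNT_ONE))
			return true;
		/* the failed CAS reloaded 'old'; loop and retry */
	}
}
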
#107Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#106)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Mar 31, 2016 at 8:21 PM, Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:

I think these changes are worth running the benchmark again. I'm going to
run it on 4x18 Intel.

The results are as follows.

clients  master   v3        v5        v9
1        11671    12507     12679     12408
2        24650    26005     25010     25495
4        49631    48863     49811     50150
8        96790    96441     99946     98383
10       121275   119928    124100    124180
20       243066   243365    246432    248723
30       359616   342241    357310    378881
40       431375   415310    441619    425653
50       489991   489896    500590    502549
60       538057   636473    554069    685010
70       588659   714426    738535    719383
80       405008   923039    902632    909126
90       295443   1181247   1155918   1179163
100      258695   1323125   1325019   1351578
110      238842   1393767   1410274   1421531
120      226018   1432504   1474982   1497122
130      215102   1465459   1503241   1521619
140      206415   1470454   1505380   1541816
150      197850   1475479   1519908   1515017
160      190935   1420915   1484868   1523150
170      185835   1438965   1453128   1499913
180      182519   1416252   1453098   1472945
It appears that atomic OR for LockBufHdr() gives a small but measurable
effect. Great idea, Andres!
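
For reference, the atomic-OR idea boils down to roughly the following
standalone C11 sketch; the real LockBufHdr() uses pg_atomic_fetch_or_u32()
together with the SpinDelayStatus machinery from the patch:

#include <stdatomic.h>
#include <stdint.h>

#define BM_LOCKED (1U << 22)

static uint32_t
lock_buf_hdr_sketch(_Atomic uint32_t *state)
{
	for (;;)
	{
		/* one atomic OR both attempts the lock and reports who held it */
		uint32_t	old = atomic_fetch_or(state, BM_LOCKED);

		if (!(old & BM_LOCKED))
			return old | BM_LOCKED;	/* the bit was clear: lock is ours */
		/* the bit was already set; a real caller would spin-delay here */
	}
}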

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

graph.pngimage/png; name=graph.pngDownload
#108Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#101)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Mar 31, 2016 at 5:52 PM, Andres Freund <andres@anarazel.de> wrote:

Here's a WIP patch to evaluate. Dilip/Ashutosh, could you perhaps run
some benchmarks, to see whether this addresses the performance issues?

I guess it'd both be interesting to compare master with master + patch,
and this thread's latest patch with the patch additionally applied.

I tested it on Power and saw a lot of fluctuation in the readings; from
these readings I could not reach any conclusion.
We can only say that with (patch + pinunpin) we can reach more than
600,000 TPS.

I think it needs more runs. After seeing these results I did not
run head+pinunpin.

Head             64 Client   128 Client
----------------------------------------
Run1             434860      356945
Run2             275815      *275815*
Run3             437872      366560

Patch            64 Client   128 Client
----------------------------------------
Run1             429520      372958
Run2             446249      *167189*
Run3             431066      381592

Patch+Pinunpin   64 Client   128 Client
----------------------------------------
Run1             338298      642535
Run2             406240      644187
Run3             595439      *285420*

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#109Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#108)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-01 13:50:10 +0530, Dilip Kumar wrote:

I think it needs more runs. After seeing these results I did not
run head+pinunpin.

Head             64 Client   128 Client
----------------------------------------
Run1             434860      356945
Run2             275815      *275815*
Run3             437872      366560

Patch            64 Client   128 Client
----------------------------------------
Run1             429520      372958
Run2             446249      *167189*
Run3             431066      381592

Patch+Pinunpin   64 Client   128 Client
----------------------------------------
Run1             338298      642535
Run2             406240      644187
Run3             595439      *285420*

Could you describe the exact setup a bit more? Postgres settings,
pgbench parameters, etc.

What's the size of BufferDesc after applying the patch?

Thanks,

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#110Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#109)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-01 10:35:18 +0200, Andres Freund wrote:

On 2016-04-01 13:50:10 +0530, Dilip Kumar wrote:

I think it needs more runs. After seeing these results I did not
run head+pinunpin.

Head             64 Client   128 Client
----------------------------------------
Run1             434860      356945
Run2             275815      *275815*
Run3             437872      366560

Patch            64 Client   128 Client
----------------------------------------
Run1             429520      372958
Run2             446249      *167189*
Run3             431066      381592

Patch+Pinunpin   64 Client   128 Client
----------------------------------------
Run1             338298      642535
Run2             406240      644187
Run3             595439      *285420*

Could you describe the exact setup a bit more? Postgres settings,
pgbench parameters, etc.

What's the size of BufferDesc after applying the patch?

One interesting thing to do would be to use -P1 during the test and see
how much the performance varies over time.

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#111Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#110)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Apr 1, 2016 at 2:09 PM, Andres Freund <andres@anarazel.de> wrote:

One interesting thing to do would be to use -P1 during the test and see
how much the performance varies over time.

I have run with the -P option: I ran for 1200 seconds with -P set to 30
seconds, and what I observed is that when TPS is low it is low throughout
the run, and when it is high, it is high for the complete run.

Non-default parameters:
--------------------------------
shared_buffers 8GB
max_connections 150

./pgbench -c $threads -j $threads -T 1200 -M prepared -S -P 30 postgres

Test results are attached in result.tar file.

File details:
------------------
1. head_run1.txt           --> 20 min run on head, reading 1
2. head_run2.txt           --> 20 min run on head, reading 2
3. head_patch_run1.txt     --> 20 min run on head + patch, reading 1
4. head_patch_run2.txt     --> 20 min run on head + patch, reading 2
5. head_pinunpin.txt       --> 20 min run on head + pinunpin-cas-8.patch
6. head_pinunpin_patch.txt --> 20 min run on head + pinunpin-cas-8.patch + patch

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

Attachments:

result.tarapplication/x-tar; name=result.tarDownload
#112Amit Kapila
amit.kapila16@gmail.com
In reply to: Dilip Kumar (#111)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 3, 2016 at 9:55 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Fri, Apr 1, 2016 at 2:09 PM, Andres Freund <andres@anarazel.de> wrote:

One interesting thing to do would be to use -P1 during the test and see
how much the performance varies over time.

I have run with the -P option: I ran for 1200 seconds with -P set to 30
seconds, and what I observed is that when TPS is low it is low throughout
the run, and when it is high, it is high for the complete run.

What is the conclusion of this test? As far as I can see, with the patch
(0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect) the performance
degradation is not fixed, but with the pin-unpin patch the performance seems
to be better in most of the runs, though you still see lower performance in
some of the runs. Is that right? Can you answer some of the questions
asked by Andres upthread [1]?

[1] /messages/by-id/20160401083518.GE9074@awork2.anarazel.de

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#113Dilip Kumar
dilipbalaut@gmail.com
In reply to: Amit Kapila (#112)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 3, 2016 at 2:28 PM, Amit Kapila <amit.kapila16@gmail.com> wrote:

What is the conclusion of this test? As far as I can see, with the patch
(0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect) the performance
degradation is not fixed, but with the pin-unpin patch the performance seems
to be better in most of the runs, though you still see lower performance in
some of the runs. Is that right?

Summary of the run:
-----------------------------
1. Within a single run, if we observe TPS every 30 seconds, it is stable
throughout that run.
2. With head, 64-client runs vary between ~250,000 and ~450,000 TPS, as
the results below show.

run1: 434860 (5min)
run2: 275815 (5min)
run3: 437872 (5min)
run4: 237033 (5min)
run5: 347611 (10min)
run6: 435933 (20min)

3. With Head + 0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect,
with 64 clients I always saw ~450,000 TPS.

run1: 429520 (5min)
run2: 446249 (5min)
run3: 431066 (5min)
run4: 441280 (10min)
run5: 429844 (20 mins)

4. With Head + 0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect,
with 128 clients performance is sometimes as low as ~150,000, which is never
observed with head (with head it is consistently ~350,000 TPS).

run1: 372958 (5min)
run2: 167189 (5min)
run3: 381592 (5min)
run4: 441280 (10min)
run5: 362742 (20 min)

5. With Head + pinunpin-cas-8, with 64 clients it is ~550,000 TPS and with
128 clients ~650,000 TPS.

6. With Head + pinunpin-cas-8 +
0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect, performance is
almost the same as with Head + pinunpin-cas-8, only sometimes performance
at 128 clients is low (~250,000 instead of 650,000).

It seems like Head + pinunpin-cas-8 gives the best performance, without
much fluctuation.

Can you answer some of the questions asked by Andres upthread [1]?

[1] -
/messages/by-id/20160401083518.GE9074@awork2.anarazel.de

Non-default parameters:
--------------------------------
shared_buffers 8GB
max_connections 150

./pgbench -c $threads -j $threads -T 1200 -M prepared -S -P 30 postgres

BufferDesc size:
------------------------
Head: 80 bytes
Head + 0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect: 72 bytes
Head + 0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect
  + pinunpin-cas-8: 64 bytes
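
(For anyone reproducing these measurements: the sizes are easy to
double-check, e.g. with pahole on a build carrying debug info, or with a
throwaway line like the one below dropped into InitBufferPool() -- a
suggestion, not something used in this thread. BufferDescPadded is the
cacheline-padded union from buf_internals.h.)

elog(LOG, "sizeof(BufferDesc) = %zu, sizeof(BufferDescPadded) = %zu",
	 sizeof(BufferDesc), sizeof(BufferDescPadded));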

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#114Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#113)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 2016-04-03 16:47:49 +0530, Dilip Kumar wrote:

6. With Head + pinunpin-cas-8 +
0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect, performance is
almost the same as with Head + pinunpin-cas-8, only sometimes performance
at 128 clients is low (~250,000 instead of 650,000).

Hm, interesting. I suspect that's because of the missing backoff in my
experimental patch. If you apply the attached patch on top of that
(requires infrastructure from pinunpin), how does performance develop?

Regards,

Andres

Attachments:

backoff.patchtext/x-patch; charset=us-asciiDownload
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index ec6baf6..4216be5 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -858,11 +858,15 @@ LWLockWaitListLock(LWLock *lock)
 	{
 		if (old_state & LW_FLAG_LOCKED)
 		{
-			/* FIXME: add exponential backoff */
-			pg_spin_delay();
-			old_state = pg_atomic_read_u32(&lock->state);
+			SpinDelayStatus delayStatus = init_spin_delay((void*)&lock->state);
+			while (old_state & LW_FLAG_LOCKED)
+			{
+				perform_spin_delay(&delayStatus);
+				old_state = pg_atomic_read_u32(&lock->state);
+			}
+			finish_spin_delay(&delayStatus);
 #ifdef LWLOCK_STATS
-			delays++;
+			delays += delayStatus.delays;
 #endif
 		}
 		else
#115Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#114)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Apr 4, 2016 at 2:28 PM, Andres Freund <andres@anarazel.de> wrote:

Hm, interesting. I suspect that's because of the missing backoff in my
experimental patch. If you apply the attached patch on top of that
(requires infrastructure from pinunpin), how does performance develop?

I have applied this patch as well, but the results are still the same: around
550,000 TPS with 64 clients and 650,000 with 128 clients, with a lot of
fluctuation.

128 client (head + 0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect
+ pinunpin-cas-9 + backoff)

run1 645769
run2 643161
run3 *285546*
run4 *289421*
run5 630772
run6 *284363*

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

#116Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Dilip Kumar (#115)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 5, 2016 at 10:26 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Mon, Apr 4, 2016 at 2:28 PM, Andres Freund <andres@anarazel.de> wrote:

Hm, interesting. I suspect that's because of the missing backoff in my
experimental patch. If you apply the attached patch on top of that
(requires infrastructure from pinunpin), how does performance develop?

I have applied this patch as well, but the results are still the same: around
550,000 TPS with 64 clients and 650,000 with 128 clients, with a lot of
fluctuation.

128 client (head + 0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect
+ pinunpin-cas-9 + backoff)

run1 645769
run2 643161
run3 *285546*
run4 *289421*
run5 630772
run6 *284363*

Could the reason be that we're increasing contention on the LWLock state
atomic variable by placing the queue spinlock there?
But I wonder why this could happen during "pgbench -S", because it doesn't
seem to generate much exclusive LWLock traffic.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#117Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#116)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-05 17:36:49 +0300, Alexander Korotkov wrote:

Could the reason be that we're increasing contention on the LWLock state
atomic variable by placing the queue spinlock there?

Don't think so, it's the same cache-line either way.

But I wonder why this could happen during "pgbench -S", because it doesn't
seem to generate much exclusive LWLock traffic.

Yea, that confuses me too. I suspect there are some misaligned
data structures somewhere. It's hard to investigate such things without
access to hardware.

(FWIW, I'm working on getting pinunpin committed)

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#118Amit Kapila
amit.kapila16@gmail.com
In reply to: Andres Freund (#117)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 5, 2016 at 8:15 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-05 17:36:49 +0300, Alexander Korotkov wrote:

Could the reason be that we're increasing concurrency for LWLock state
atomic variable by placing queue spinlock there?

Don't think so, it's the same cache-line either way.

But I wonder why this could happen during "pgbench -S", because it doesn't
seem to generate much exclusive LWLock traffic.

Yea, that confuses me too. I suspect there's some mis-aligned
datastructures somewhere. It's hard to investigate such things without
access to hardware.

This fluctuation started appearing after commit 6150a1b0, which we have
discussed in another thread [1], and a colleague of mine is working on a
patch to revert it on current HEAD so we can then see the results.

(FWIW, I'm working on getting pinunpin committed)

Good to know, but I am slightly worried that it will make the problem
harder to detect, as it will reduce reproducibility. I understand that
we are running short of time and committing this patch is important, so we
should proceed with it, as this is not a problem with this patch. After this
patch gets committed, we will always need to revert it locally to narrow down
the problem caused by commit 6150a1b0.

[1] /messages/by-id/CAA4eK1+ZeB8PMwwktf+3bRS0Pt4Ux6Rs6Aom0uip8c6shJWmyg@mail.gmail.com

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#119Andres Freund
andres@anarazel.de
In reply to: Amit Kapila (#118)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-05 20:56:31 +0530, Amit Kapila wrote:

This fluctuation started appearing after commit 6150a1b0, which we have
discussed in another thread [1], and a colleague of mine is working on a
patch to revert it on current HEAD so we can then see the results.

I don't see what that buys us. That commit is a good win on x86...

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#120Amit Kapila
amit.kapila16@gmail.com
In reply to: Andres Freund (#119)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 5, 2016 at 9:00 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-05 20:56:31 +0530, Amit Kapila wrote:

This fluctuation started appearing after commit 6150a1b0, which we have
discussed in another thread [1], and a colleague of mine is working on a
patch to revert it on current HEAD so we can then see the results.

I don't see what that buys us. That commit is a good win on x86...

At least, that way we can see the results of pin-unpin without
fluctuation. I agree that we need to narrow down why even after reducing
BufferDesc size, we are seeing performance fluctuation.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#121Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#119)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 5, 2016 at 11:30 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-05 20:56:31 +0530, Amit Kapila wrote:

This fluctuation started appearing after commit 6150a1b0, which we have
discussed in another thread [1], and a colleague of mine is working on a
patch to revert it on current HEAD so we can then see the results.

I don't see what that buys us. That commit is a good win on x86...

Maybe. But I wouldn't be surprised to find out that that is an
overgeneralization. Based on some results Mithun Cy showed me this
morning, I think that some of this enormous run-to-run fluctuation
that we're seeing is due to NUMA effects. So some runs we get two
things that are frequently accessed together on the same NUMA node and
other times they get placed on different NUMA nodes and then
everything sucks. I don't think we fully understand what's going on
here yet - and I think we're committing changes in this area awfully
quickly - but I see no reason to believe that x86 is immune to such
effects. They may just happen in different scenarios than what we see
on POWER.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#122Andres Freund
andres@anarazel.de
In reply to: Robert Haas (#121)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-05 12:14:35 -0400, Robert Haas wrote:

On Tue, Apr 5, 2016 at 11:30 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-05 20:56:31 +0530, Amit Kapila wrote:

This fluctuation started appearing after commit 6150a1b0, which we have
discussed in another thread [1], and a colleague of mine is working on a
patch to revert it on current HEAD so we can then see the results.

I don't see what that buys us. That commit is a good win on x86...

Maybe. But I wouldn't be surprised to find out that that is an
overgeneralization. Based on some results Mithun Cy showed me this
morning, I think that some of this enormous run-to-run fluctuation
that we're seeing is due to NUMA effects. So some runs we get two
things that are frequently accessed together on the same NUMA node and
other times they get placed on different NUMA nodes and then
everything sucks. I don't think we fully understand what's going on
here yet - and I think we're committing changes in this area awfully
quickly - but I see no reason to believe that x86 is immune to such
effects. They may just happen in different scenarios than what we see
on POWER.

I'm not really following - we were talking about 6150a1b0 ("Move buffer
I/O and content LWLocks out of the main tranche."), made four months
ago. AFAICS the atomic buffer pin patch is a pretty clear win on both
ppc and x86?

I agree that there are NUMA effects we don't understand. I think
re-considering Kevin's patch that did explicit NUMA hinting on Linux
would be rather worthwhile. It makes a lot of sense to force shmem to be
interleaved, but keep backend-local memory local.
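
Concretely, the kind of hinting meant here would look something like the
sketch below (assuming Linux libnuma; an illustration, not Kevin's actual
patch):

#include <stddef.h>
#include <numa.h>

static void
numa_hint_shmem(void *shmem_base, size_t shmem_size)
{
	if (numa_available() < 0)
		return;					/* no kernel/libnuma support */

	/* spread the shared segment evenly over all NUMA nodes */
	numa_interleave_memory(shmem_base, shmem_size, numa_all_nodes_ptr);

	/* let backend-private allocations default to the local node */
	numa_set_localalloc();
}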

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#123Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#122)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 5, 2016 at 1:04 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-05 12:14:35 -0400, Robert Haas wrote:

On Tue, Apr 5, 2016 at 11:30 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-05 20:56:31 +0530, Amit Kapila wrote:

This fluctuation started appearing after commit 6150a1b0, which we have
discussed in another thread [1], and a colleague of mine is working on a
patch to revert it on current HEAD so we can then see the results.

I don't see what that buys us. That commit is a good win on x86...

Maybe. But I wouldn't be surprised to find out that that is an
overgeneralization. Based on some results Mithun Cy showed me this
morning, I think that some of this enormous run-to-run fluctuation
that we're seeing is due to NUMA effects. So some runs we get two
things that are frequently accessed together on the same NUMA node and
other times they get placed on different NUMA nodes and then
everything sucks. I don't think we fully understand what's going on
here yet - and I think we're committing changes in this area awfully
quickly - but I see no reason to believe that x86 is immune to such
effects. They may just happen in different scenarios than what we see
on POWER.

I'm not really following - we were talking about 6150a1b0 ("Move buffer
I/O and content LWLocks out of the main tranche."), made four months
ago. AFAICS the atomic buffer pin patch is a pretty clear win on both
ppc and x86?

The point is that the testing Amit's team is doing can't answer that
question one way or the other. 6150a1b0 completely
destabilized performance on our test systems to the point where
testing subsequent patches is extremely difficult.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#124Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#117)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 5, 2016 at 5:45 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-05 17:36:49 +0300, Alexander Korotkov wrote:

Could the reason be that we're increasing contention on the LWLock state
atomic variable by placing the queue spinlock there?

Don't think so, it's the same cache-line either way.

Yes, it's very unlikely.

But I wonder why this could happen during "pgbench -S", because it doesn't
seem to generate much exclusive LWLock traffic.

Yea, that confuses me too. I suspect there are some misaligned
data structures somewhere. It's hard to investigate such things without
access to hardware.

But it's quite easy to check whether it is an alignment issue. We can try
your patch but without removing the mutex from the LWLock struct. If it's
an alignment issue, then TPS should become stable again.

(FWIW, I'm working on getting pinunpin committed)

Sounds good.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#125Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#115)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-05 12:56:46 +0530, Dilip Kumar wrote:

On Mon, Apr 4, 2016 at 2:28 PM, Andres Freund <andres@anarazel.de> wrote:

Hm, interesting. I suspect that's because of the missing backoff in my
experimental patch. If you apply the attached patch ontop of that
(requires infrastructure from pinunpin), how does performance develop?

I have applied this patch as well, but the results are still the same: around
550,000 TPS with 64 clients and 650,000 with 128 clients, with a lot of
fluctuation.

128 client (head + 0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect
+ pinunpin-cas-9 + backoff)

run1 645769
run2 643161
run3 *285546*
run4 *289421*
run5 630772
run6 *284363*

I wonder what http://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=09adc9a8c09c9640de05c7023b27fb83c761e91c
does to all these numbers. It seems entirely possible that "this" is
mainly changing the alignment of some common data structures...

- Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#126Andres Freund
andres@anarazel.de
In reply to: Dilip Kumar (#113)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 2016-04-03 16:47:49 +0530, Dilip Kumar wrote:

Summary of the run:
-----------------------------
1. Within a single run, if we observe TPS every 30 seconds, it is stable
throughout that run.
2. With head, 64-client runs vary between ~250,000 and ~450,000 TPS, as
the results below show.

run1: 434860 (5min)
run2: 275815 (5min)
run3: 437872 (5min)
run4: 237033 (5min)
run5: 347611 (10min)
run6: 435933 (20min)

[1] -
/messages/by-id/20160401083518.GE9074@awork2.anarazel.de

Non-default parameters:
--------------------------------
shared_buffers 8GB
max_connections 150

./pgbench -c $threads -j $threads -T 1200 -M prepared -S -P 30 postgres

Which scale did you initialize with? I'm trying to reproduce the
workload on hydra as precisely as possible...

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#127Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#126)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-06 11:52:28 +0200, Andres Freund wrote:

Hi,

On 2016-04-03 16:47:49 +0530, Dilip Kumar wrote:

Summary of the run:
-----------------------------
1. Within a single run, if we observe TPS every 30 seconds, it is stable
throughout that run.
2. With head, 64-client runs vary between ~250,000 and ~450,000 TPS, as
the results below show.

run1: 434860 (5min)
run2: 275815 (5min)
run3: 437872 (5min)
run4: 237033 (5min)
run5: 347611 (10min)
run6: 435933 (20min)

[1] -
/messages/by-id/20160401083518.GE9074@awork2.anarazel.de

Non Default Parameter:
--------------------------------
shared_buffer 8GB
Max Connections 150

./pgbench -c $threads -j $threads -T 1200 -M prepared -S -P 30 postgres

Which scale did you initialize with? I'm trying to reproduce the
workload on hydra as precisely as possible...

On hydra, even after a fair amount of tinkering, I cannot reproduce such
variability.

Pin/Unpin itself has only a minor effect; reducing the size of the lwlock on
top of that improves performance (378829 to 409914 TPS) in intentionally
short 15s tests (warm cache) with 128 clients, and to a lesser degree in
120s tests (415921 to 420487).

It appears, over 5 runs, that the alignment fix shortens the rampup phase
from about 100s to about 8s; interesting.

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#128Dilip Kumar
dilipbalaut@gmail.com
In reply to: Andres Freund (#126)
2 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Wed, Apr 6, 2016 at 3:22 PM, Andres Freund <andres@anarazel.de> wrote:

Which scale did you initialize with? I'm trying to reproduce the
workload on hydra as precisely as possible...

I tested with scale factor 300, shared_buffers 8GB.

My test script is attached to the mail (perf_pgbench_ro.sh).

I have done some more tests on power (same machine).

Test1:
--------
head + pinunpin-cas-9.patch + BufferDesc content lock converted to a pointer
(patch attached: buffer_content_lock_ptr_rebased_head_temp.patch)

Ashutosh helped me generate this patch (this is just a temporary patch to see
the pin/unpin behaviour when the content lock is a pointer).

64 clients
run1 497684
run2 543366
run3 476988
128 clients
run1 740301
run2 482676
run3 474530
run4 480971
run5 757779

Summary:
1. With 64 clients, whether we apply only pinunpin-cas-9.patch or
pinunpin-cas-9.patch + buffer_content_lock_ptr_rebased_head_temp.patch,
max TPS is ~550,000, with some fluctuation.

2. With 128 clients, we saw in an earlier post that with pinunpin the max
TPS was 650,000 (even after converting BufferDesc to 64 bytes it was
650,000). Now, after converting the content lock to a pointer on top of
pinunpin, I get a max of ~750,000.

- One more point to note: earlier it varied from 250,000 to 650,000, but
after converting the content lock to a pointer it varies from 450,000 to
750,000.

Test2:

Head + buffer_content_lock_ptr_rebased_head_temp.patch

1. With this test the readings are the same as head, and we can see the same
variance in performance.

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com

Attachments:

perf_pgbench_ro.shapplication/x-sh; name=perf_pgbench_ro.shDownload
buffer_content_lock_ptr_rebased_head_temp.patchapplication/octet-stream; name=buffer_content_lock_ptr_rebased_head_temp.patchDownload
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index bfa37f1..5c5d91d 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -73,6 +73,7 @@ InitBufferPool(void)
 				foundDescs,
 				foundIOLocks,
 				foundBufCkpt;
+	LWLockPadded	*content_locks;
 
 	/* Align descriptors to a cacheline boundary. */
 	BufferDescriptors = (BufferDescPadded *)
@@ -98,11 +99,12 @@ InitBufferPool(void)
 	BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
 	LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
 						  &BufferIOLWLockTranche);
-
+#if 1
+	content_locks = (LWLockPadded *) ShmemAlloc(sizeof(LWLockPadded) * NBuffers);
+#endif
 	BufferContentLWLockTranche.name = "buffer_content";
-	BufferContentLWLockTranche.array_base =
-		((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
-	BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+	BufferContentLWLockTranche.array_base = content_locks;
+	BufferContentLWLockTranche.array_stride = sizeof(LWLockPadded);
 	LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
 						  &BufferContentLWLockTranche);
 
@@ -149,9 +151,12 @@ InitBufferPool(void)
 			 * management of this list is done by freelist.c.
 			 */
 			buf->freeNext = i + 1;
+//			elog(LOG, "Ashu_before_init=%d", i);
 
-			LWLockInitialize(BufferDescriptorGetContentLock(buf),
+			LWLockInitialize(&content_locks[i].lock,
 							 LWTRANCHE_BUFFER_CONTENT);
+//			elog(LOG, "Ashu_after_init=%d", i);
+			buf->content_lock = &content_locks[i].lock;
 
 			LWLockInitialize(BufferDescriptorGetIOLock(buf),
 							 LWTRANCHE_BUFFER_IO_IN_PROGRESS);
@@ -205,6 +210,7 @@ BufferShmemSize(void)
 
 	/* size of checkpoint sort array in bufmgr.c */
 	size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
+	size = add_size(size, (sizeof(LWLockPadded) * NBuffers));
 
 	return size;
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 42aa2f9..0273bbc 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -778,7 +778,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			if (!isLocalBuf)
 			{
 				if (mode == RBM_ZERO_AND_LOCK)
-					LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+					LWLockAcquire(bufHdr->content_lock,
 								  LW_EXCLUSIVE);
 				else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
 					LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
@@ -927,7 +927,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
 		!isLocalBuf)
 	{
-		LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+		LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
 	}
 
 	if (isLocalBuf)
@@ -1093,7 +1093,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			 * happens to be trying to split the page the first one got from
 			 * StrategyGetBuffer.)
 			 */
-			if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+			if (LWLockConditionalAcquire(buf->content_lock,
 										 LW_SHARED))
 			{
 				/*
@@ -1116,7 +1116,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 						StrategyRejectBuffer(strategy, buf))
 					{
 						/* Drop lock/pin and loop around for another buffer */
-						LWLockRelease(BufferDescriptorGetContentLock(buf));
+						LWLockRelease(buf->content_lock);
 						UnpinBuffer(buf, true);
 						continue;
 					}
@@ -1129,7 +1129,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 											  smgr->smgr_rnode.node.relNode);
 
 				FlushBuffer(buf, NULL);
-				LWLockRelease(BufferDescriptorGetContentLock(buf));
+				LWLockRelease(buf->content_lock);
 
 				ScheduleBufferTagForWriteback(&BackendWritebackContext,
 											  &buf->tag);
@@ -1447,7 +1447,7 @@ MarkBufferDirty(Buffer buffer)
 
 	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
-	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
+	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
 	LockBufHdr(bufHdr);
 
@@ -1647,7 +1647,7 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
-		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+		Assert(!LWLockHeldByMe(buf->content_lock));
 		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
 		LockBufHdr(buf);
@@ -2294,11 +2294,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 	 * buffer is clean by the time we've locked it.)
 	 */
 	PinBuffer_Locked(bufHdr);
-	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+	LWLockAcquire(bufHdr->content_lock, LW_SHARED);
 
 	FlushBuffer(bufHdr, NULL);
 
-	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+	LWLockRelease(bufHdr->content_lock);
 
 	tag = bufHdr->tag;
 
@@ -3109,9 +3109,9 @@ FlushRelationBuffers(Relation rel)
 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
-			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
 			FlushBuffer(bufHdr, rel->rd_smgr);
-			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+			LWLockRelease(bufHdr->content_lock);
 			UnpinBuffer(bufHdr, true);
 		}
 		else
@@ -3161,9 +3161,9 @@ FlushDatabaseBuffers(Oid dbid)
 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
-			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+			LWLockAcquire(bufHdr->content_lock, LW_SHARED);
 			FlushBuffer(bufHdr, NULL);
-			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+			LWLockRelease(bufHdr->content_lock);
 			UnpinBuffer(bufHdr, true);
 		}
 		else
@@ -3187,7 +3187,7 @@ FlushOneBuffer(Buffer buffer)
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
 
-	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
+	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
 	FlushBuffer(bufHdr, NULL);
 }
@@ -3284,7 +3284,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
-	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
+	Assert(LWLockHeldByMe(bufHdr->content_lock));
 
 	/*
 	 * This routine might get called many times on the same page, if we are
@@ -3437,11 +3437,11 @@ LockBuffer(Buffer buffer, int mode)
 	buf = GetBufferDescriptor(buffer - 1);
 
 	if (mode == BUFFER_LOCK_UNLOCK)
-		LWLockRelease(BufferDescriptorGetContentLock(buf));
+		LWLockRelease(buf->content_lock);
 	else if (mode == BUFFER_LOCK_SHARE)
-		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+		LWLockAcquire(buf->content_lock, LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
-		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+		LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -3462,7 +3462,7 @@ ConditionalLockBuffer(Buffer buffer)
 
 	buf = GetBufferDescriptor(buffer - 1);
 
-	return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+	return LWLockConditionalAcquire(buf->content_lock,
 									LW_EXCLUSIVE);
 }
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index d04363b..e7d06f3 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -151,7 +151,7 @@ typedef struct BufferDesc
 	int			buf_id;			/* buffer's index number (from 0) */
 	int			freeNext;		/* link in freelist chain */
 
-	LWLock		content_lock;	/* to lock access to buffer contents */
+	LWLock		*content_lock;	/* to lock access to buffer contents */
 } BufferDesc;
 
 /*
@@ -189,8 +189,6 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetIOLock(bdesc) \
 	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
-#define BufferDescriptorGetContentLock(bdesc) \
-	((LWLock*) (&(bdesc)->content_lock))
 
 extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
 
#129Robert Haas
robertmhaas@gmail.com
In reply to: Dilip Kumar (#128)
Re: Move PinBuffer and UnpinBuffer to atomics

On Wed, Apr 6, 2016 at 10:04 AM, Dilip Kumar <dilipbalaut@gmail.com> wrote:

On Wed, Apr 6, 2016 at 3:22 PM, Andres Freund <andres@anarazel.de> wrote:

Which scale did you initialize with? I'm trying to reproduce the
workload on hydra as precisely as possible...

I tested with scale factor 300, shared_buffers 8GB.

My test script is attached to the mail (perf_pgbench_ro.sh).

I have done some more tests on power (same machine).

I spent a lot of time testing things on power2 today and also
discussed with Andres via IM. Andres came up with a few ideas to
reduce the variability, which I tried. One of those was to run the
server under numactl --interleave=all (that is, numactl
--interleave=all pg_ctl start etc.) and another was to set
kernel.numa_balancing = 0 (it was set to 1). Neither of those things
seemed to prevent the problem of run-to-run variability. Andres also
suggested running pgbench with "-P 1", which revealed that it was
generally possible to tell what the overall performance of a run was
going to be like within 10-20 seconds. Runs that started out fast
stayed fast, and those that started out slower remained slower.
Therefore, long runs didn't seem to be necessary for testing, so I
switched to using 2-minute test runs launched via pgbench -T 120 -c 64
-j 64 -n -M prepared -S -P1.

After quite a bit of experimentation, Andres hit on an idea that did
succeed in drastically reducing the run-to-run variation: prewarming
all of the relations in a deterministic order before starting the
test. I used this query:

psql -c "select sum(x.x) from (select pg_prewarm(oid) as x from
pg_class where relkind in ('i', 'r') order by oid) x;"

With that change to my test script, the results became much more
stable. I tested four different builds of the server: commit 3fed4174
(that is, the one just before the commit where you reported the
variability to have begun), commit 6150a1b0 (the one at which you reported
the variability actually began), master as of this morning
(actually commit cac0e366), and master + pinunpin-cas-9.patch +
0001-WIP-Avoid-the-use-of-a-separate-spinlock-to-protect-.patch +
backoff.patch (herein called "andres"). The first and last of these
have 64-byte BufferDescs, and the others have 80-byte BufferDescs.
Without prewarming, I see high and low results on *all* of these
builds, even 3fed4174. I did nine test runs with each configuration
with and without prewarming, and here are the results. With each
result I have reported the raw numbers, plus the median and the range
(highest result - lowest result).

-- without prewarming --
3fed4174
tps by run: 249165.928992 300958.039880 501281.083247
488073.289603 251033.038275 272086.063197
522287.023374 528855.922328 266057.502255
median: 300958.039880, range: 206687.132100

6150a1b0
tps by run: 455853.061092 438628.888369 353643.017507
419850.232715 424353.870515 440219.581180
431805.001465 237528.175877 431789.666417
median: 431789.666417, range: 218324.885215

master
tps by run: 427808.559919 366625.640433 376165.188508
441349.141152 363381.095198 352925.252345
348975.712841 446626.284308 444633.921009
median: 376165.188508, range: 97650.571467

andres
tps by run: 391123.866928 423525.415037 496103.017599
346707.246825 531194.321999 466100.337795
517708.470146 355392.837942 510817.921728
median: 466100.337795, range: 184487.075174

-- with prewarming --
3fed4174
tps by run: 413239.471751 428996.202541 488206.103941
493788.533491 497953.728875 498074.092686
501719.406720 508880.505416 509868.266778
median: 497953.728875, range: 96628.795027

6150a1b0
tps by run: 421589.299889 438765.851782 440601.270742
440649.818900 443033.460295 447317.269583
452831.480337 456387.316178 460140.159903
median: 443033.460295, range: 38550.860014

master
tps by run: 427211.917303 427796.174209 435601.396857
436581.431219 442329.269335 446438.814270
449970.003595 450085.123059 456231.229966
median: 442329.269335, range: 29019.312663

andres
tps by run: 425513.771259 429219.907952 433838.084721
451354.257738 494735.045808 495301.319716
517166.054466 531655.887669 546984.476602
median: 494735.045808, range: 121470.705343

My belief is that the first set of numbers has so much jitter that you
can't really draw any meaningful conclusions. For two of the four
branches, the range is more than 50% of the median, which is enormous.
You could probably draw some conclusions if you took enough
measurements, but it's pretty hard. Notice that that set of numbers
makes 6150a1b0 look like a performance improvement, whereas the second
set makes it pretty clear that 6150a1b0 was a regression.

Also, the patch set is clearly winning here. It picks up 90k TPS
median on the first set of numbers and 50k TPS median on the second
set.

It's fairly mysterious to me why there is so much jitter in the
results on this machine. By doing prewarming in a consistent fashion,
we make sure that every run puts the same disk blocks in the same
buffers. Andres guessed that maybe the degree of associativity of the
CPU caches comes into play here: depending on where the hot data is we
either get the important cache lines in places where they can all be
cached at once, or we get them in places where they can't all be
cached at once. But if that's really what is going on here, it's
shocking that it makes this much difference.
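
To put a number on that hypothesis: assuming, purely for illustration, a
32 KB, 8-way set-associative L1 with 64-byte lines, there are 64 sets, so
addresses 4 KB apart map to the same set and at most 8 such hot lines can be
resident at once; a ninth hot line at that stride evicts another no matter
how small the total working set is. Whether power2's caches have exactly
those parameters is an assumption, but it shows how the mere placement of a
handful of hot cache lines could move throughput this much.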

However, my conclusion based on these results is that (1) the patch is
a win and (2) the variability on this machine didn't begin with
6150a1b0. YMMV, of course, but that's what I think.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#130Andres Freund
andres@anarazel.de
In reply to: Robert Haas (#129)
Re: Move PinBuffer and UnpinBuffer to atomics

Hi,

On 2016-04-06 21:58:50 -0400, Robert Haas wrote:

I spent a lot of time testing things on power2 today

Thanks for that!

It's fairly mysterious to me why there is so much jitter in the
results on this machine. By doing prewarming in a consistent fashion,
we make sure that every run puts the same disk blocks in the same
buffers. Andres guessed that maybe the degree of associativity of the
CPU caches comes into play here: depending on where the hot data is we
either get the important cache lines in places where they can all be
cached at once, or we get them in places where they can't all be
cached at once. But if that's really what is going on here, it's
shocking that it makes this much difference.

I'm not sure at all that that is indeed the reason. I think it'd be
quite worthwhile to
a) collect perf stat -ddd of a slow and a fast run, compare
b) collect a perf profile of a slow and fast run, run perf diff
c) add perf probes for lwlocks and spinlocks where we ended up sleeping,
check whether the two runs have different distribution of which
locks ended up sleeping.

I'm also wondering if the idea from
http://archives.postgresql.org/message-id/20160407082711.q7iq3ykffqxcszkv%40alap3.anarazel.de
both with/without preloading, changes variability.

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#131Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#106)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-03-31 20:21:02 +0300, Alexander Korotkov wrote:

! BEGIN_BUFSTATE_CAS_LOOP(bufHdr);

! Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! wasDirty = (state & BM_DIRTY) ? true : false;
! state |= BM_DIRTY | BM_JUST_DIRTIED;
! if (state == oldstate)
! break;

I'm doubtful that this early exit is entirely safe. None of the
preceding operations imply a memory barrier. The buffer could previously
have been marked dirty, but cleaned since. It's pretty critical that we
re-set the dirty bit (there's no danger of losing it with a barrier,
because we hold an exclusive content lock).

Practically the risk seems fairly low, because acquiring the exclusive
content lock will have implied a barrier. But it seems unlikely to have
a measurable performance effect to me, so I'd rather not add the early
exit.
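
Spelled out (a sketch with the BEGIN_BUFSTATE_CAS_LOOP macro expanded by
hand and the BM_LOCKED handling omitted; not committed code), the hazard is
this:

uint32	oldstate = pg_atomic_read_u32(&bufHdr->state);

for (;;)
{
	uint32	state = oldstate;

	state |= BM_DIRTY | BM_JUST_DIRTIED;

	/*
	 * The early exit under discussion: both bits already appear set, so
	 * skip the CAS.  Unsafe without a barrier, because the plain read
	 * above can return a stale value from before the buffer was cleaned,
	 * and we would then fail to re-set BM_DIRTY.
	 */
	if (state == oldstate)
		break;

	/*
	 * The CAS path has no such problem: if the state changed under us,
	 * the CAS fails, oldstate is reloaded, and we retry with a current
	 * value.
	 */
	if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state))
		break;
}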

Andres

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#132Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#131)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Apr 7, 2016 at 4:41 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-31 20:21:02 +0300, Alexander Korotkov wrote:

! BEGIN_BUFSTATE_CAS_LOOP(bufHdr);

! Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! wasDirty = (state & BM_DIRTY) ? true : false;
! state |= BM_DIRTY | BM_JUST_DIRTIED;
! if (state == oldstate)
! break;

I'm doubtful that this early exit is entirely safe. None of the
preceding operations imply a memory barrier. The buffer could previously
have been marked dirty, but cleaned since. It's pretty critical that we
re-set the dirty bit (there's no danger of losing it with a barrier,
because we hold an exclusive content lock).

Oh, I get it.

Practically the risk seems fairly low, because acquiring the exclusive
content lock will have implied a barrier. But it seems unlikely to have
a measurable performance effect to me, so I'd rather not add the early
exit.

Ok, let's just remove it.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#133Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#132)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-07 16:50:44 +0300, Alexander Korotkov wrote:

On Thu, Apr 7, 2016 at 4:41 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-31 20:21:02 +0300, Alexander Korotkov wrote:

! BEGIN_BUFSTATE_CAS_LOOP(bufHdr);

! Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! wasDirty = (state & BM_DIRTY) ? true : false;
! state |= BM_DIRTY | BM_JUST_DIRTIED;
! if (state == oldstate)
! break;

I'm doubtful that this early exit is entirely safe. None of the
preceding operations imply a memory barrier. The buffer could previously
have been marked dirty, but cleaned since. It's pretty critical that we
re-set the dirty bit (there's no danger of losing it with a barrier,
because we hold an exclusive content lock).

Oh, I get it.

Practically the risk seems fairly low, because acquiring the exclusive
content lock will have implied a barrier. But it seems unlikely to have
a measurable performance effect to me, so I'd rather not add the early
exit.

Ok, let's just remove it.

Here's my updated version of the patch. I've updated this on an
intercontinental flight, after an otherwise hectic week (moving from SF
to Berlin); so I'm planning to look over this once more before pushing.

I've decided that the cas-loop macros are too obfuscating for my
taste. To avoid duplicating the wait part I've introduced
WaitBufHdrUnlocked().
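
For readers who haven't opened the patch: the helper is essentially a
spin-with-backoff on the header lock bit, along these lines (a sketch; the
SpinDelayStatus/perform_spin_delay names come from the delay logic
abstracted out of s_lock.c, as described in the commit message below):

static uint32
WaitBufHdrUnlocked(BufferDesc *buf)
{
	SpinDelayStatus delayStatus;
	uint32		buf_state;

	init_local_spin_delay(&delayStatus);

	buf_state = pg_atomic_read_u32(&buf->state);

	/* spin, with escalating delay, until the header lock bit is clear */
	while (buf_state & BM_LOCKED)
	{
		perform_spin_delay(&delayStatus);
		buf_state = pg_atomic_read_u32(&buf->state);
	}

	finish_spin_delay(&delayStatus);

	return buf_state;
}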

As you can see in
http://archives.postgresql.org/message-id/CA%2BTgmoaeRbN%3DZ4oWENLvgGLeHEvGZ_S_Z3KGrdScyKiSvNt3oA%40mail.gmail.com
I'm planning to apply this sometime this weekend, after running some
tests and going over the patch again.

Any chance you could have a look over this?

Regards,

Andres

Attachments:

0001-Allow-Pin-UnpinBuffer-to-operate-in-a-lockfree-manne.patchtext/x-patch; charset=us-asciiDownload
From e89e99acc5b0854f918f0c7f685efcd50e6ffcae Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 7 Apr 2016 10:29:41 +0200
Subject: [PATCH] Allow Pin/UnpinBuffer to operate in a lockfree manner.

Pinning/Unpinning a buffer is a very frequent operation; especially in
read-mostly cache resident workloads. Benchmarking shows that in various
scenarios the spinlock protecting a buffer header's state becomes a
significant bottleneck. The problem can be reproduced with pgbench -S on
larger machines, but can be considerably worse for queries which touch
the same buffers over and over at a high frequency (e.g. nested loops
over a small inner table).

To allow atomic operations to be used, cram BufferDesc's flags,
usage_count, buf_hdr_lock, refcount into a single 32bit atomic variable;
that allows them to be manipulated together using 32bit compare-and-swap
operations. This requires reducing MAX_BACKENDS to 2^18-1 (which could
be lifted by using a 64bit field, but it's not a realistic configuration
atm).

As not all operations can easily be implemented in a lockfree manner,
implement the previous buf_hdr_lock via a flag bit in the atomic
variable. That way we can continue to lock the header in places where
it's needed, but can get away without acquiring it in the more frequent
hot-paths.  There are some additional operations which could be done
without the lock but aren't in this patch; the most important places are
covered.

As bufmgr.c now essentially re-implements spinlocks, abstract the delay
logic from s_lock.c into something more generic. It now has already two
users, and it seems likely that more are coming up; there's pending
patches for lwlock.c at least.

This patch is based on a proof-of-concept written by me, which Alexander
Korotkov made into a fully working patch; the committed version is again
revised by me.  Benchmarking and testing has, amongst others, been
provided by Dilip Kumar, Alexander Korotkov, Robert Haas.

Author: Alexander Korotkov and Andres Freund
Discussion: 2400449.GjM57CE0Yg@dinodell
---
 contrib/pg_buffercache/pg_buffercache_pages.c |  15 +-
 src/backend/storage/buffer/buf_init.c         |  18 +-
 src/backend/storage/buffer/bufmgr.c           | 491 +++++++++++++++++---------
 src/backend/storage/buffer/freelist.c         |  44 ++-
 src/backend/storage/buffer/localbuf.c         |  64 ++--
 src/backend/storage/lmgr/s_lock.c             | 218 ++++++------
 src/include/postmaster/postmaster.h           |  15 +-
 src/include/storage/buf_internals.h           | 101 ++++--
 src/include/storage/s_lock.h                  |  18 +
 src/tools/pgindent/typedefs.list              |   1 +
 10 files changed, 621 insertions(+), 364 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 6622d22..17b4b6f 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -148,11 +148,12 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < NBuffers; i++)
 		{
-			volatile BufferDesc *bufHdr;
+			BufferDesc *bufHdr;
+			uint32		buf_state;
 
 			bufHdr = GetBufferDescriptor(i);
 			/* Lock each buffer header before inspecting. */
-			LockBufHdr(bufHdr);
+			buf_state = LockBufHdr(bufHdr);
 
 			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
 			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
@@ -160,21 +161,21 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
 			fctx->record[i].forknum = bufHdr->tag.forkNum;
 			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = bufHdr->usage_count;
-			fctx->record[i].pinning_backends = bufHdr->refcount;
+			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
 
-			if (bufHdr->flags & BM_DIRTY)
+			if (buf_state & BM_DIRTY)
 				fctx->record[i].isdirty = true;
 			else
 				fctx->record[i].isdirty = false;
 
 			/* Note if the buffer is valid, and has storage created */
-			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
+			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
 				fctx->record[i].isvalid = true;
 			else
 				fctx->record[i].isvalid = false;
 
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, buf_state);
 		}
 
 		/*
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index bfa37f1..702f50b 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -135,12 +135,9 @@ InitBufferPool(void)
 			BufferDesc *buf = GetBufferDescriptor(i);
 
 			CLEAR_BUFFERTAG(buf->tag);
-			buf->flags = 0;
-			buf->usage_count = 0;
-			buf->refcount = 0;
-			buf->wait_backend_pid = 0;
 
-			SpinLockInit(&buf->buf_hdr_lock);
+			pg_atomic_init_u32(&buf->state, 0);
+			buf->wait_backend_pid = 0;
 
 			buf->buf_id = i;
 
@@ -193,11 +190,12 @@ BufferShmemSize(void)
 
 	/*
 	 * It would be nice to include the I/O locks in the BufferDesc, but that
-	 * would increase the size of a BufferDesc to more than one cache line, and
-	 * benchmarking has shown that keeping every BufferDesc aligned on a cache
-	 * line boundary is important for performance.  So, instead, the array of
-	 * I/O locks is allocated in a separate tranche.  Because those locks are
-	 * not highly contentended, we lay out the array with minimal padding.
+	 * would increase the size of a BufferDesc to more than one cache line,
+	 * and benchmarking has shown that keeping every BufferDesc aligned on a
+	 * cache line boundary is important for performance.  So, instead, the
+	 * array of I/O locks is allocated in a separate tranche.  Because those
+	 * locks are not highly contentended, we lay out the array with minimal
+	 * padding.
 	 */
 	size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
 	/* to allow aligning the above */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 42aa2f9..5ca19b2 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -427,7 +427,6 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
 	(GetPrivateRefCount(bufnum) > 0) \
 )
 
-
 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
 				  ForkNumber forkNum, BlockNumber blockNum,
 				  ReadBufferMode mode, BufferAccessStrategy strategy,
@@ -436,11 +435,12 @@ static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
 static void BufferSync(int flags);
+static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
 static int	SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *flush_context);
 static void WaitIO(BufferDesc *buf);
 static bool StartBufferIO(BufferDesc *buf, bool forInput);
 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
-				  int set_flag_bits);
+				  uint32 set_flag_bits);
 static void shared_buffer_write_error_callback(void *arg);
 static void local_buffer_write_error_callback(void *arg);
 static BufferDesc *BufferAlloc(SMgrRelation smgr,
@@ -816,8 +816,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		if (isLocalBuf)
 		{
 			/* Only need to adjust flags */
-			Assert(bufHdr->flags & BM_VALID);
-			bufHdr->flags &= ~BM_VALID;
+			uint32		state = pg_atomic_read_u32(&bufHdr->state);
+
+			Assert(state & BM_VALID);
+			state &= ~BM_VALID;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 		else
 		{
@@ -828,10 +831,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			 */
 			do
 			{
-				LockBufHdr(bufHdr);
-				Assert(bufHdr->flags & BM_VALID);
-				bufHdr->flags &= ~BM_VALID;
-				UnlockBufHdr(bufHdr);
+				uint32		state = LockBufHdr(bufHdr);
+
+				Assert(state & BM_VALID);
+				state &= ~BM_VALID;
+				UnlockBufHdr(bufHdr, state);
 			} while (!StartBufferIO(bufHdr, true));
 		}
 	}
@@ -848,7 +852,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * it's not been recycled) but come right back here to try smgrextend
 	 * again.
 	 */
-	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
+	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));	/* spinlock not needed */
 
 	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
@@ -933,7 +937,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	if (isLocalBuf)
 	{
 		/* Only need to adjust flags */
-		bufHdr->flags |= BM_VALID;
+		uint32		state = pg_atomic_read_u32(&bufHdr->state);
+
+		state |= BM_VALID;
+		pg_atomic_write_u32(&bufHdr->state, state);
 	}
 	else
 	{
@@ -987,10 +994,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	BufferTag	oldTag;			/* previous identity of selected buffer */
 	uint32		oldHash;		/* hash value for oldTag */
 	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
-	BufFlags	oldFlags;
+	uint32		oldFlags;
 	int			buf_id;
 	BufferDesc *buf;
 	bool		valid;
+	uint32		buf_state;
 
 	/* create a tag so we can lookup the buffer */
 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
@@ -1059,12 +1067,12 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * Select a victim buffer.  The buffer is returned with its header
 		 * spinlock still held!
 		 */
-		buf = StrategyGetBuffer(strategy);
+		buf = StrategyGetBuffer(strategy, &buf_state);
 
-		Assert(buf->refcount == 0);
+		Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
 
 		/* Must copy buffer flags while we still hold the spinlock */
-		oldFlags = buf->flags;
+		oldFlags = buf_state & BUF_FLAG_MASK;
 
 		/* Pin the buffer and then release the buffer spinlock */
 		PinBuffer_Locked(buf);
@@ -1106,11 +1114,12 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				if (strategy != NULL)
 				{
 					XLogRecPtr	lsn;
+					uint32		state;
 
 					/* Read the LSN while holding buffer header lock */
-					LockBufHdr(buf);
+					state = LockBufHdr(buf);
 					lsn = BufferGetLSN(buf);
-					UnlockBufHdr(buf);
+					UnlockBufHdr(buf, state);
 
 					if (XLogNeedsFlush(lsn) &&
 						StrategyRejectBuffer(strategy, buf))
@@ -1254,7 +1263,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		/*
 		 * Need to lock the buffer header too in order to change its tag.
 		 */
-		LockBufHdr(buf);
+		buf_state = LockBufHdr(buf);
 
 		/*
 		 * Somebody could have pinned or re-dirtied the buffer while we were
@@ -1262,11 +1271,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * recycle this buffer; we must undo everything we've done and start
 		 * over with a new victim buffer.
 		 */
-		oldFlags = buf->flags;
-		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
+		oldFlags = buf_state & BUF_FLAG_MASK;
+		if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
 			break;
 
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, buf_state);
 		BufTableDelete(&newTag, newHash);
 		if ((oldFlags & BM_TAG_VALID) &&
 			oldPartitionLock != newPartitionLock)
@@ -1284,14 +1293,15 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * 1 so that the buffer can survive one clock-sweep pass.)
 	 */
 	buf->tag = newTag;
-	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
+	buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
+				   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
+				   BUF_USAGECOUNT_MASK);
 	if (relpersistence == RELPERSISTENCE_PERMANENT)
-		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
+		buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
 	else
-		buf->flags |= BM_TAG_VALID;
-	buf->usage_count = 1;
+		buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
 
-	UnlockBufHdr(buf);
+	UnlockBufHdr(buf, buf_state);
 
 	if (oldFlags & BM_TAG_VALID)
 	{
@@ -1338,12 +1348,15 @@ InvalidateBuffer(BufferDesc *buf)
 	BufferTag	oldTag;
 	uint32		oldHash;		/* hash value for oldTag */
 	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
-	BufFlags	oldFlags;
+	uint32		oldFlags;
+	uint32		state;
 
 	/* Save the original buffer tag before dropping the spinlock */
 	oldTag = buf->tag;
 
-	UnlockBufHdr(buf);
+	state = pg_atomic_read_u32(&buf->state);
+	Assert(state & BM_LOCKED);
+	UnlockBufHdr(buf, state);
 
 	/*
 	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
@@ -1362,12 +1375,12 @@ retry:
 	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
 
 	/* Re-lock the buffer header */
-	LockBufHdr(buf);
+	state = LockBufHdr(buf);
 
 	/* If it's changed while we were waiting for lock, do nothing */
 	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
 	{
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, state);
 		LWLockRelease(oldPartitionLock);
 		return;
 	}
@@ -1381,9 +1394,9 @@ retry:
 	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
 	 * be busy-looping here.)
 	 */
-	if (buf->refcount != 0)
+	if (BUF_STATE_GET_REFCOUNT(state) != 0)
 	{
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, state);
 		LWLockRelease(oldPartitionLock);
 		/* safety check: should definitely not be our *own* pin */
 		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
@@ -1396,12 +1409,10 @@ retry:
 	 * Clear out the buffer's tag and flags.  We must do this to ensure that
 	 * linear scans of the buffer array don't think the buffer is valid.
 	 */
-	oldFlags = buf->flags;
+	oldFlags = state & BUF_FLAG_MASK;
 	CLEAR_BUFFERTAG(buf->tag);
-	buf->flags = 0;
-	buf->usage_count = 0;
-
-	UnlockBufHdr(buf);
+	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+	UnlockBufHdr(buf, state);
 
 	/*
 	 * Remove the buffer from the lookup hashtable, if it was in there.
@@ -1433,6 +1444,8 @@ void
 MarkBufferDirty(Buffer buffer)
 {
 	BufferDesc *bufHdr;
+	uint32		state;
+	uint32		oldstate;
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -1449,24 +1462,31 @@ MarkBufferDirty(Buffer buffer)
 	/* unfortunately we can't check if the lock is held exclusively */
 	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
-	LockBufHdr(bufHdr);
+	oldstate = pg_atomic_read_u32(&bufHdr->state);
+	for (;;)
+	{
+		if (oldstate & BM_LOCKED)
+			oldstate = WaitBufHdrUnlocked(bufHdr);
 
-	Assert(bufHdr->refcount > 0);
+		state = oldstate;
+
+		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+		state |= BM_DIRTY | BM_JUST_DIRTIED;
+
+		if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state))
+			break;
+	}
 
 	/*
 	 * If the buffer was not dirty already, do vacuum accounting.
 	 */
-	if (!(bufHdr->flags & BM_DIRTY))
+	if (!(oldstate & BM_DIRTY))
 	{
 		VacuumPageDirty++;
 		pgBufferUsage.shared_blks_dirtied++;
 		if (VacuumCostActive)
 			VacuumCostBalance += VacuumCostPageDirty;
 	}
-
-	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-
-	UnlockBufHdr(bufHdr);
 }
 
 /*
@@ -1531,6 +1551,10 @@ ReleaseAndReadBuffer(Buffer buffer,
  *
  * This should be applied only to shared buffers, never local ones.
  *
+ * Since buffers are pinned/unpinned very frequently, pin buffers without
+ * taking the buffer header lock; instead update the state variable in a loop of
+ * CAS operations. Hopefully it's just a single CAS.
+ *
  * Note that ResourceOwnerEnlargeBuffers must have been done already.
  *
  * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
@@ -1547,23 +1571,34 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 
 	if (ref == NULL)
 	{
+		/* loop of CAS operations */
+		uint32		state;
+		uint32		oldstate;
+
 		ReservePrivateRefCountEntry();
 		ref = NewPrivateRefCountEntry(b);
 
-		LockBufHdr(buf);
-		buf->refcount++;
-		if (strategy == NULL)
+		oldstate = pg_atomic_read_u32(&buf->state);
+		for (;;)
 		{
-			if (buf->usage_count < BM_MAX_USAGE_COUNT)
-				buf->usage_count++;
+			if (oldstate & BM_LOCKED)
+				oldstate = WaitBufHdrUnlocked(buf);
+
+			state = oldstate;
+
+			/* increase refcount */
+			state += BUF_REFCOUNT_ONE;
+
+			/* increase usagecount unless already max */
+			if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
+				state += BUF_USAGECOUNT_ONE;
+
+			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
+			{
+				result = (state & BM_VALID) != 0;
+				break;
+			}
 		}
-		else
-		{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
-		}
-		result = (buf->flags & BM_VALID) != 0;
-		UnlockBufHdr(buf);
 	}
 	else
 	{
@@ -1603,6 +1638,7 @@ PinBuffer_Locked(BufferDesc *buf)
 {
 	Buffer		b;
 	PrivateRefCountEntry *ref;
+	uint32		state;
 
 	/*
 	 * As explained, We don't expect any preexisting pins. That allows us to
@@ -1610,8 +1646,14 @@ PinBuffer_Locked(BufferDesc *buf)
 	 */
 	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
 
-	buf->refcount++;
-	UnlockBufHdr(buf);
+	/*
+	 * Since we hold the buffer spinlock, we can update the buffer state and
+	 * release the lock in a single operation.
+	 */
+	state = pg_atomic_read_u32(&buf->state);
+	Assert(state & BM_LOCKED);
+	state += BUF_REFCOUNT_ONE;
+	UnlockBufHdr(buf, state);
 
 	b = BufferDescriptorGetBuffer(buf);
 
@@ -1646,30 +1688,50 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 	ref->refcount--;
 	if (ref->refcount == 0)
 	{
+		uint32		state;
+		uint32		oldstate;
+
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
 		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
-		LockBufHdr(buf);
+		/*
+		 * Decrement the shared reference count.
+		 *
+		 * Since the buffer spinlock holder can update the status with a plain
+		 * write, it's not safe to use an atomic decrement; thus use a CAS loop.
+		 */
+		oldstate = pg_atomic_read_u32(&buf->state);
+		for (;;)
+		{
+			if (oldstate & BM_LOCKED)
+				oldstate = WaitBufHdrUnlocked(buf);
 
-		/* Decrement the shared reference count */
-		Assert(buf->refcount > 0);
-		buf->refcount--;
+			state = oldstate;
+
+			state -= BUF_REFCOUNT_ONE;
+
+			if (pg_atomic_compare_exchange_u32(&buf->state, &oldstate, state))
+				break;
+		}
 
 		/* Support LockBufferForCleanup() */
-		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
-			buf->refcount == 1)
+		if (state & BM_PIN_COUNT_WAITER)
 		{
-			/* we just released the last pin other than the waiter's */
-			int			wait_backend_pid = buf->wait_backend_pid;
+			state = LockBufHdr(buf);
 
-			buf->flags &= ~BM_PIN_COUNT_WAITER;
-			UnlockBufHdr(buf);
-			ProcSendSignal(wait_backend_pid);
+			if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
+			{
+				/* we just released the last pin other than the waiter's */
+				int			wait_backend_pid = buf->wait_backend_pid;
+
+				state &= ~BM_PIN_COUNT_WAITER;
+				UnlockBufHdr(buf, state);
+				ProcSendSignal(wait_backend_pid);
+			}
+			else
+				UnlockBufHdr(buf, state);
 		}
-		else
-			UnlockBufHdr(buf);
-
 		ForgetPrivateRefCountEntry(ref);
 	}
 }
@@ -1687,6 +1749,7 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 static void
 BufferSync(int flags)
 {
+	uint32		state;
 	int			buf_id;
 	int			num_to_scan;
 	int			num_spaces;
@@ -1736,13 +1799,13 @@ BufferSync(int flags)
 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
 		 * SyncOneBuffer.
 		 */
-		LockBufHdr(bufHdr);
+		state = LockBufHdr(bufHdr);
 
-		if ((bufHdr->flags & mask) == mask)
+		if ((state & mask) == mask)
 		{
 			CkptSortItem *item;
 
-			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+			state |= BM_CHECKPOINT_NEEDED;
 
 			item = &CkptBufferIds[num_to_scan++];
 			item->buf_id = buf_id;
@@ -1752,7 +1815,7 @@ BufferSync(int flags)
 			item->blockNum = bufHdr->tag.blockNum;
 		}
 
-		UnlockBufHdr(bufHdr);
+		UnlockBufHdr(bufHdr, state);
 	}
 
 	if (num_to_scan == 0)
@@ -1888,7 +1951,7 @@ BufferSync(int flags)
 		 * write the buffer though we didn't need to.  It doesn't seem worth
 		 * guarding against this, though.
 		 */
-		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
+		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
 		{
 			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
 			{
@@ -2258,6 +2321,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 {
 	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 	int			result = 0;
+	uint32		state;
 	BufferTag	tag;
 
 	ReservePrivateRefCountEntry();
@@ -2271,21 +2335,24 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 	 * don't worry because our checkpoint.redo points before log record for
 	 * upcoming changes and so we are not required to write such dirty buffer.
 	 */
-	LockBufHdr(bufHdr);
+	state = LockBufHdr(bufHdr);
 
-	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
+		BUF_STATE_GET_USAGECOUNT(state) == 0)
+	{
 		result |= BUF_REUSABLE;
+	}
 	else if (skip_recently_used)
 	{
 		/* Caller told us not to write recently-used buffers */
-		UnlockBufHdr(bufHdr);
+		UnlockBufHdr(bufHdr, state);
 		return result;
 	}
 
-	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
+	if (!(state & BM_VALID) || !(state & BM_DIRTY))
 	{
 		/* It's clean, so nothing to do */
-		UnlockBufHdr(bufHdr);
+		UnlockBufHdr(bufHdr, state);
 		return result;
 	}
 
@@ -2439,6 +2506,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	int32		loccount;
 	char	   *path;
 	BackendId	backend;
+	uint32		state;
 
 	Assert(BufferIsValid(buffer));
 	if (BufferIsLocal(buffer))
@@ -2456,12 +2524,13 @@ PrintBufferLeakWarning(Buffer buffer)
 
 	/* theoretically we should lock the bufhdr here */
 	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+	state = pg_atomic_read_u32(&buf->state);
 	elog(WARNING,
 		 "buffer refcount leak: [%03d] "
 		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
 		 buffer, path,
-		 buf->tag.blockNum, buf->flags,
-		 buf->refcount, loccount);
+		 buf->tag.blockNum, state & BUF_FLAG_MASK,
+		 BUF_STATE_GET_REFCOUNT(state), loccount);
 	pfree(path);
 }
 
@@ -2573,6 +2642,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 				io_time;
 	Block		bufBlock;
 	char	   *bufToWrite;
+	uint32		state;
 
 	/*
 	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
@@ -2598,7 +2668,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 										reln->smgr_rnode.node.dbNode,
 										reln->smgr_rnode.node.relNode);
 
-	LockBufHdr(buf);
+	state = LockBufHdr(buf);
 
 	/*
 	 * Run PageGetLSN while holding header lock, since we don't have the
@@ -2607,8 +2677,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	recptr = BufferGetLSN(buf);
 
 	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	buf->flags &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf);
+	state &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(buf, state);
 
 	/*
 	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
@@ -2627,7 +2697,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	 * disastrous system-wide consequences.  To make sure that can't happen,
 	 * skip the flush if the buffer isn't permanent.
 	 */
-	if (buf->flags & BM_PERMANENT)
+	if (state & BM_PERMANENT)
 		XLogFlush(recptr);
 
 	/*
@@ -2716,12 +2786,12 @@ BufferIsPermanent(Buffer buffer)
 	/*
 	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
 	 * need not bother with the buffer header spinlock.  Even if someone else
-	 * changes the buffer header flags while we're doing this, we assume that
-	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
-	 * old value or the new value, but not random garbage.
+	 * changes the buffer header state while we're doing this, changing of
+	 * state is atomic, so we'll read the old value or the new value, but not
+	 * random garbage.
 	 */
 	bufHdr = GetBufferDescriptor(buffer - 1);
-	return (bufHdr->flags & BM_PERMANENT) != 0;
+	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
 }
 
 /*
@@ -2736,6 +2806,7 @@ BufferGetLSNAtomic(Buffer buffer)
 	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
 	char	   *page = BufferGetPage(buffer);
 	XLogRecPtr	lsn;
+	uint32		state;
 
 	/*
 	 * If we don't need locking for correctness, fastpath out.
@@ -2747,9 +2818,9 @@ BufferGetLSNAtomic(Buffer buffer)
 	Assert(BufferIsValid(buffer));
 	Assert(BufferIsPinned(buffer));
 
-	LockBufHdr(bufHdr);
+	state = LockBufHdr(bufHdr);
 	lsn = PageGetLSN(page);
-	UnlockBufHdr(bufHdr);
+	UnlockBufHdr(bufHdr, state);
 
 	return lsn;
 }
@@ -2797,6 +2868,7 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
 	for (i = 0; i < NBuffers; i++)
 	{
 		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		state;
 
 		/*
 		 * We can make this a tad faster by prechecking the buffer tag before
@@ -2817,13 +2889,13 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
 		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
 			continue;
 
-		LockBufHdr(bufHdr);
+		state = LockBufHdr(bufHdr);
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
 			bufHdr->tag.forkNum == forkNum &&
 			bufHdr->tag.blockNum >= firstDelBlock)
 			InvalidateBuffer(bufHdr);	/* releases spinlock */
 		else
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, state);
 	}
 }
 
@@ -2887,6 +2959,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 	{
 		RelFileNode *rnode = NULL;
 		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		state;
 
 		/*
 		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
@@ -2917,11 +2990,11 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 		if (rnode == NULL)
 			continue;
 
-		LockBufHdr(bufHdr);
+		state = LockBufHdr(bufHdr);
 		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
 			InvalidateBuffer(bufHdr);	/* releases spinlock */
 		else
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, state);
 	}
 
 	pfree(nodes);
@@ -2951,6 +3024,7 @@ DropDatabaseBuffers(Oid dbid)
 	for (i = 0; i < NBuffers; i++)
 	{
 		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		state;
 
 		/*
 		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
@@ -2959,11 +3033,11 @@ DropDatabaseBuffers(Oid dbid)
 		if (bufHdr->tag.rnode.dbNode != dbid)
 			continue;
 
-		LockBufHdr(bufHdr);
+		state = LockBufHdr(bufHdr);
 		if (bufHdr->tag.rnode.dbNode == dbid)
 			InvalidateBuffer(bufHdr);	/* releases spinlock */
 		else
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, state);
 	}
 }
 
@@ -3055,9 +3129,12 @@ FlushRelationBuffers(Relation rel)
 	{
 		for (i = 0; i < NLocBuffer; i++)
 		{
+			uint32		state;
+
 			bufHdr = GetLocalBufferDescriptor(i);
 			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+				((state = pg_atomic_read_u32(&bufHdr->state)) &
+				 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
 			{
 				ErrorContextCallback errcallback;
 				Page		localpage;
@@ -3078,7 +3155,8 @@ FlushRelationBuffers(Relation rel)
 						  localpage,
 						  false);
 
-				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+				state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+				pg_atomic_write_u32(&bufHdr->state, state);
 
 				/* Pop the error context stack */
 				error_context_stack = errcallback.previous;
@@ -3093,6 +3171,8 @@ FlushRelationBuffers(Relation rel)
 
 	for (i = 0; i < NBuffers; i++)
 	{
+		uint32		state;
+
 		bufHdr = GetBufferDescriptor(i);
 
 		/*
@@ -3104,9 +3184,9 @@ FlushRelationBuffers(Relation rel)
 
 		ReservePrivateRefCountEntry();
 
-		LockBufHdr(bufHdr);
+		state = LockBufHdr(bufHdr);
 		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
@@ -3115,7 +3195,7 @@ FlushRelationBuffers(Relation rel)
 			UnpinBuffer(bufHdr, true);
 		}
 		else
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, state);
 	}
 }
 
@@ -3145,6 +3225,8 @@ FlushDatabaseBuffers(Oid dbid)
 
 	for (i = 0; i < NBuffers; i++)
 	{
+		uint32		state;
+
 		bufHdr = GetBufferDescriptor(i);
 
 		/*
@@ -3156,9 +3238,9 @@ FlushDatabaseBuffers(Oid dbid)
 
 		ReservePrivateRefCountEntry();
 
-		LockBufHdr(bufHdr);
+		state = LockBufHdr(bufHdr);
 		if (bufHdr->tag.rnode.dbNode == dbid &&
-			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
+			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
 		{
 			PinBuffer_Locked(bufHdr);
 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
@@ -3167,7 +3249,7 @@ FlushDatabaseBuffers(Oid dbid)
 			UnpinBuffer(bufHdr, true);
 		}
 		else
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, state);
 	}
 }
 
@@ -3297,12 +3379,13 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 	 * is only intended to be used in cases where failing to write out the
 	 * data would be harmless anyway, it doesn't really matter.
 	 */
-	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
+	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
 		(BM_DIRTY | BM_JUST_DIRTIED))
 	{
 		XLogRecPtr	lsn = InvalidXLogRecPtr;
 		bool		dirtied = false;
 		bool		delayChkpt = false;
+		uint32		state;
 
 		/*
 		 * If we need to protect hint bit updates from torn writes, WAL-log a
@@ -3313,7 +3396,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		 * We don't check full_page_writes here because that logic is included
 		 * when we call XLogInsert() since the value changes dynamically.
 		 */
-		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
+		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
 		{
 			/*
 			 * If we're in recovery we cannot dirty a page because of a hint.
@@ -3352,9 +3435,11 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 			lsn = XLogSaveBufferForHint(buffer, buffer_std);
 		}
 
-		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (!(bufHdr->flags & BM_DIRTY))
+		state = LockBufHdr(bufHdr);
+
+		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+
+		if (!(state & BM_DIRTY))
 		{
 			dirtied = true;		/* Means "will be dirtied by this action" */
 
@@ -3374,8 +3459,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 			if (!XLogRecPtrIsInvalid(lsn))
 				PageSetLSN(page, lsn);
 		}
-		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-		UnlockBufHdr(bufHdr);
+
+		state |= BM_DIRTY | BM_JUST_DIRTIED;
+		UnlockBufHdr(bufHdr, state);
 
 		if (delayChkpt)
 			MyPgXact->delayChkpt = false;
@@ -3406,17 +3492,19 @@ UnlockBuffers(void)
 
 	if (buf)
 	{
-		LockBufHdr(buf);
+		uint32		state;
+
+		state = LockBufHdr(buf);
 
 		/*
 		 * Don't complain if flag bit not set; it could have been reset but we
 		 * got a cancel/die interrupt before getting the signal.
 		 */
-		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
+		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
 			buf->wait_backend_pid == MyProcPid)
-			buf->flags &= ~BM_PIN_COUNT_WAITER;
+			state &= ~BM_PIN_COUNT_WAITER;
 
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, state);
 
 		PinCountWaitBuf = NULL;
 	}
@@ -3509,27 +3597,30 @@ LockBufferForCleanup(Buffer buffer)
 
 	for (;;)
 	{
+		uint32		state;
+
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (bufHdr->refcount == 1)
+		state = LockBufHdr(bufHdr);
+
+		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
+		if (BUF_STATE_GET_REFCOUNT(state) == 1)
 		{
 			/* Successfully acquired exclusive lock with pincount 1 */
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, state);
 			return;
 		}
 		/* Failed, so mark myself as waiting for pincount 1 */
-		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
+		if (state & BM_PIN_COUNT_WAITER)
 		{
-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, state);
 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 			elog(ERROR, "multiple backends attempting to wait for pincount 1");
 		}
 		bufHdr->wait_backend_pid = MyProcPid;
-		bufHdr->flags |= BM_PIN_COUNT_WAITER;
 		PinCountWaitBuf = bufHdr;
-		UnlockBufHdr(bufHdr);
+		state |= BM_PIN_COUNT_WAITER;
+		UnlockBufHdr(bufHdr, state);
 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
 		/* Report the wait */
@@ -3558,11 +3649,11 @@ LockBufferForCleanup(Buffer buffer)
 		 * impossible with the current usages due to table level locking, but
 		 * better be safe.
 		 */
-		LockBufHdr(bufHdr);
-		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
+		state = LockBufHdr(bufHdr);
+		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
 			bufHdr->wait_backend_pid == MyProcPid)
-			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
-		UnlockBufHdr(bufHdr);
+			state &= ~BM_PIN_COUNT_WAITER;
+		UnlockBufHdr(bufHdr, state);
 
 		PinCountWaitBuf = NULL;
 		/* Loop back and try again */
@@ -3603,22 +3694,26 @@ bool
 ConditionalLockBufferForCleanup(Buffer buffer)
 {
 	BufferDesc *bufHdr;
+	uint32		state,
+				refcount;
 
 	Assert(BufferIsValid(buffer));
 
 	if (BufferIsLocal(buffer))
 	{
+		refcount = LocalRefCount[-buffer - 1];
 		/* There should be exactly one pin */
-		Assert(LocalRefCount[-buffer - 1] > 0);
-		if (LocalRefCount[-buffer - 1] != 1)
+		Assert(refcount > 0);
+		if (refcount != 1)
 			return false;
 		/* Nobody else to wait for */
 		return true;
 	}
 
 	/* There should be exactly one local pin */
-	Assert(GetPrivateRefCount(buffer) > 0);
-	if (GetPrivateRefCount(buffer) != 1)
+	refcount = GetPrivateRefCount(buffer);
+	Assert(refcount);
+	if (refcount != 1)
 		return false;
 
 	/* Try to acquire lock */
@@ -3626,17 +3721,19 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 		return false;
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
-	LockBufHdr(bufHdr);
-	Assert(bufHdr->refcount > 0);
-	if (bufHdr->refcount == 1)
+	state = LockBufHdr(bufHdr);
+	refcount = BUF_STATE_GET_REFCOUNT(state);
+
+	Assert(refcount > 0);
+	if (refcount == 1)
 	{
 		/* Successfully acquired exclusive lock with pincount 1 */
-		UnlockBufHdr(bufHdr);
+		UnlockBufHdr(bufHdr, state);
 		return true;
 	}
 
 	/* Failed, so release the lock */
-	UnlockBufHdr(bufHdr);
+	UnlockBufHdr(bufHdr, state);
 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 	return false;
 }
@@ -3666,17 +3763,17 @@ WaitIO(BufferDesc *buf)
 	 */
 	for (;;)
 	{
-		BufFlags	sv_flags;
+		uint32		state;
 
 		/*
 		 * It may not be necessary to acquire the spinlock to check the flag
 		 * here, but since this test is essential for correctness, we'd better
 		 * play it safe.
 		 */
-		LockBufHdr(buf);
-		sv_flags = buf->flags;
-		UnlockBufHdr(buf);
-		if (!(sv_flags & BM_IO_IN_PROGRESS))
+		state = LockBufHdr(buf);
+		UnlockBufHdr(buf, state);
+
+		if (!(state & BM_IO_IN_PROGRESS))
 			break;
 		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
 		LWLockRelease(BufferDescriptorGetIOLock(buf));
@@ -3704,6 +3801,8 @@ WaitIO(BufferDesc *buf)
 static bool
 StartBufferIO(BufferDesc *buf, bool forInput)
 {
+	uint32		state;
+
 	Assert(!InProgressBuf);
 
 	for (;;)
@@ -3714,9 +3813,9 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 		 */
 		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
-		LockBufHdr(buf);
+		state = LockBufHdr(buf);
 
-		if (!(buf->flags & BM_IO_IN_PROGRESS))
+		if (!(state & BM_IO_IN_PROGRESS))
 			break;
 
 		/*
@@ -3725,24 +3824,23 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 		 * an error (see AbortBufferIO).  If that's the case, we must wait for
 		 * him to get unwedged.
 		 */
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, state);
 		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		WaitIO(buf);
 	}
 
 	/* Once we get here, there is definitely no I/O active on this buffer */
 
-	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
+	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
 	{
 		/* someone else already did the I/O */
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, state);
 		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		return false;
 	}
 
-	buf->flags |= BM_IO_IN_PROGRESS;
-
-	UnlockBufHdr(buf);
+	state |= BM_IO_IN_PROGRESS;
+	UnlockBufHdr(buf, state);
 
 	InProgressBuf = buf;
 	IsForInput = forInput;
@@ -3768,19 +3866,22 @@ StartBufferIO(BufferDesc *buf, bool forInput)
  * be 0, or BM_VALID if we just finished reading in the page.
  */
 static void
-TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
+TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 {
+	uint32		state;
+
 	Assert(buf == InProgressBuf);
 
-	LockBufHdr(buf);
+	state = LockBufHdr(buf);
 
-	Assert(buf->flags & BM_IO_IN_PROGRESS);
-	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
-	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
-		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
-	buf->flags |= set_flag_bits;
+	Assert(state & BM_IO_IN_PROGRESS);
 
-	UnlockBufHdr(buf);
+	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
+	if (clear_dirty && !(state & BM_JUST_DIRTIED))
+		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+
+	state |= set_flag_bits;
+	UnlockBufHdr(buf, state);
 
 	InProgressBuf = NULL;
 
@@ -3803,6 +3904,8 @@ AbortBufferIO(void)
 
 	if (buf)
 	{
+		uint32		state;
+
 		/*
 		 * Since LWLockReleaseAll has already been called, we're not holding
 		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
@@ -3811,24 +3914,22 @@ AbortBufferIO(void)
 		 */
 		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
-		LockBufHdr(buf);
-		Assert(buf->flags & BM_IO_IN_PROGRESS);
+		state = LockBufHdr(buf);
+		Assert(state & BM_IO_IN_PROGRESS);
 		if (IsForInput)
 		{
-			Assert(!(buf->flags & BM_DIRTY));
+			Assert(!(state & BM_DIRTY));
+
 			/* We'd better not think buffer is valid yet */
-			Assert(!(buf->flags & BM_VALID));
-			UnlockBufHdr(buf);
+			Assert(!(state & BM_VALID));
+			UnlockBufHdr(buf, state);
 		}
 		else
 		{
-			BufFlags	sv_flags;
-
-			sv_flags = buf->flags;
-			Assert(sv_flags & BM_DIRTY);
-			UnlockBufHdr(buf);
+			Assert(state & BM_DIRTY);
+			UnlockBufHdr(buf, state);
 			/* Issue notice if this is not the first failure... */
-			if (sv_flags & BM_IO_ERROR)
+			if (state & BM_IO_ERROR)
 			{
 				/* Buffer is pinned, so we can read tag without spinlock */
 				char	   *path;
@@ -3912,6 +4013,54 @@ rnode_comparator(const void *p1, const void *p2)
 }
 
 /*
+ * Lock buffer header - set BM_LOCKED in buffer state.
+ */
+uint32
+LockBufHdr(BufferDesc *desc)
+{
+	SpinDelayStatus delayStatus = init_spin_delay(desc);
+	uint32		oldstate;
+
+	while (true)
+	{
+		/* set BM_LOCKED flag */
+		oldstate = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
+		/* if it wasn't set before we're OK */
+		if (!(oldstate & BM_LOCKED))
+			break;
+		perform_spin_delay(&delayStatus);
+	}
+	finish_spin_delay(&delayStatus);
+	return oldstate | BM_LOCKED;
+}
+
+/*
+ * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
+ * state at that point.
+ *
+ * Obviously the buffer could be locked by the time the value is returned, so
+ * this is primarily useful in CAS style loops.
+ */
+static uint32
+WaitBufHdrUnlocked(BufferDesc *buf)
+{
+	SpinDelayStatus delayStatus = init_spin_delay(buf);
+	uint32		state;
+
+	state = pg_atomic_read_u32(&buf->state);
+
+	while (state & BM_LOCKED)
+	{
+		perform_spin_delay(&delayStatus);
+		state = pg_atomic_read_u32(&buf->state);
+	}
+
+	finish_spin_delay(&delayStatus);
+
+	return state;
+}
+
+/*
  * BufferTag comparator.
  */
 static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 551d152..137eb2f 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -98,7 +98,8 @@ typedef struct BufferAccessStrategyData
 
 
 /* Prototypes for internal functions */
-static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
+static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
+				  uint32 *buf_state);
 static void AddBufferToRing(BufferAccessStrategy strategy,
 				BufferDesc *buf);
 
@@ -180,7 +181,7 @@ ClockSweepTick(void)
  *	return the buffer with the buffer header spinlock still held.
  */
 BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy)
+StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
 {
 	BufferDesc *buf;
 	int			bgwprocno;
@@ -192,7 +193,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	 */
 	if (strategy != NULL)
 	{
-		buf = GetBufferFromRing(strategy);
+		buf = GetBufferFromRing(strategy, buf_state);
 		if (buf != NULL)
 			return buf;
 	}
@@ -250,6 +251,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	{
 		while (true)
 		{
+			uint32		state;
+
 			/* Acquire the spinlock to remove element from the freelist */
 			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
@@ -279,14 +282,16 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * it before we got to it.  It's probably impossible altogether as
 			 * of 8.3, but we'd better check anyway.)
 			 */
-			LockBufHdr(buf);
-			if (buf->refcount == 0 && buf->usage_count == 0)
+			state = LockBufHdr(buf);
+			if (BUF_STATE_GET_REFCOUNT(state) == 0
+				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
 			{
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
+				*buf_state = state;
 				return buf;
 			}
-			UnlockBufHdr(buf);
+			UnlockBufHdr(buf, state);
 
 		}
 	}
@@ -295,6 +300,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	trycounter = NBuffers;
 	for (;;)
 	{
+		uint32		state;
 
 		buf = GetBufferDescriptor(ClockSweepTick());
 
@@ -302,12 +308,14 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0)
+		state = LockBufHdr(buf);
+
+		if (BUF_STATE_GET_REFCOUNT(state) == 0)
 		{
-			if (buf->usage_count > 0)
+			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
 			{
-				buf->usage_count--;
+				state -= BUF_USAGECOUNT_ONE;
+
 				trycounter = NBuffers;
 			}
 			else
@@ -315,6 +323,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 				/* Found a usable buffer */
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
+				*buf_state = state;
 				return buf;
 			}
 		}
@@ -327,10 +336,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * probably better to fail than to risk getting stuck in an
 			 * infinite loop.
 			 */
-			UnlockBufHdr(buf);
+			UnlockBufHdr(buf, state);
 			elog(ERROR, "no unpinned buffers available");
 		}
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, state);
 	}
 }
 
@@ -585,10 +594,11 @@ FreeAccessStrategy(BufferAccessStrategy strategy)
  * The bufhdr spin lock is held on the returned buffer.
  */
 static BufferDesc *
-GetBufferFromRing(BufferAccessStrategy strategy)
+GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 {
 	BufferDesc *buf;
 	Buffer		bufnum;
+	uint32		state;
 
 	/* Advance to next ring slot */
 	if (++strategy->current >= strategy->ring_size)
@@ -616,13 +626,15 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	 * shouldn't re-use it.
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
-	LockBufHdr(buf);
-	if (buf->refcount == 0 && buf->usage_count <= 1)
+	state = LockBufHdr(buf);
+	if (BUF_STATE_GET_REFCOUNT(state) == 0
+		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
 	{
 		strategy->current_was_in_ring = true;
+		*buf_state = state;
 		return buf;
 	}
-	UnlockBufHdr(buf);
+	UnlockBufHdr(buf, state);
 
 	/*
 	 * Tell caller to allocate a new buffer with the normal allocation
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 17640cf..edc0ada 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -108,6 +108,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	int			b;
 	int			trycounter;
 	bool		found;
+	uint32		state;
 
 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
@@ -128,16 +129,21 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
 				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
 #endif
+		state = pg_atomic_read_u32(&bufHdr->state);
+
 		/* this part is equivalent to PinBuffer for a shared buffer */
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-				bufHdr->usage_count++;
+			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
+			{
+				state += BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, state);
+			}
 		}
 		LocalRefCount[b]++;
 		ResourceOwnerRememberBuffer(CurrentResourceOwner,
 									BufferDescriptorGetBuffer(bufHdr));
-		if (bufHdr->flags & BM_VALID)
+		if (state & BM_VALID)
 			*foundPtr = TRUE;
 		else
 		{
@@ -169,9 +175,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count > 0)
+			state = pg_atomic_read_u32(&bufHdr->state);
+
+			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
 			{
-				bufHdr->usage_count--;
+				state -= BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, state);
 				trycounter = NLocBuffer;
 			}
 			else
@@ -193,7 +202,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * this buffer is not referenced but it might still be dirty. if that's
 	 * the case, write it out before reusing it!
 	 */
-	if (bufHdr->flags & BM_DIRTY)
+	if (state & BM_DIRTY)
 	{
 		SMgrRelation oreln;
 		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
@@ -211,7 +220,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 				  false);
 
 		/* Mark not-dirty now in case we error out below */
-		bufHdr->flags &= ~BM_DIRTY;
+		state &= ~BM_DIRTY;
+		pg_atomic_write_u32(&bufHdr->state, state);
 
 		pgBufferUsage.local_blks_written++;
 	}
@@ -228,7 +238,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	/*
 	 * Update the hash table: remove old entry, if any, and make new one.
 	 */
-	if (bufHdr->flags & BM_TAG_VALID)
+	if (state & BM_TAG_VALID)
 	{
 		hresult = (LocalBufferLookupEnt *)
 			hash_search(LocalBufHash, (void *) &bufHdr->tag,
@@ -237,7 +247,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			elog(ERROR, "local buffer hash table corrupted");
 		/* mark buffer invalid just in case hash insert fails */
 		CLEAR_BUFFERTAG(bufHdr->tag);
-		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
+		state &= ~(BM_VALID | BM_TAG_VALID);
+		pg_atomic_write_u32(&bufHdr->state, state);
 	}
 
 	hresult = (LocalBufferLookupEnt *)
@@ -250,9 +261,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * it's all ours now.
 	 */
 	bufHdr->tag = newTag;
-	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
-	bufHdr->flags |= BM_TAG_VALID;
-	bufHdr->usage_count = 1;
+	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+	state |= BM_TAG_VALID;
+	state &= ~BUF_USAGECOUNT_MASK;
+	state += BUF_USAGECOUNT_ONE;
+	pg_atomic_write_u32(&bufHdr->state, state);
 
 	*foundPtr = FALSE;
 	return bufHdr;
@@ -267,6 +280,7 @@ MarkLocalBufferDirty(Buffer buffer)
 {
 	int			bufid;
 	BufferDesc *bufHdr;
+	uint32		state;
 
 	Assert(BufferIsLocal(buffer));
 
@@ -280,10 +294,10 @@ MarkLocalBufferDirty(Buffer buffer)
 
 	bufHdr = GetLocalBufferDescriptor(bufid);
 
-	if (!(bufHdr->flags & BM_DIRTY))
-		pgBufferUsage.local_blks_dirtied++;
+	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
 
-	bufHdr->flags |= BM_DIRTY;
+	if (!(state & BM_DIRTY))
+		pgBufferUsage.local_blks_dirtied++;
 }
 
 /*
@@ -307,8 +321,11 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		state;
 
-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		state = pg_atomic_read_u32(&bufHdr->state);
+
+		if ((state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
 			bufHdr->tag.forkNum == forkNum &&
 			bufHdr->tag.blockNum >= firstDelBlock)
@@ -327,8 +344,9 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			state &= ~BUF_FLAG_MASK;
+			state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 	}
 }
@@ -349,8 +367,11 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		state;
 
-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		state = pg_atomic_read_u32(&bufHdr->state);
+
+		if ((state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
 		{
 			if (LocalRefCount[i] != 0)
@@ -367,8 +388,9 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			state &= ~BUF_FLAG_MASK;
+			state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, state);
 		}
 	}
 }
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index cc0bf5e..4a6ffb4 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -3,6 +3,38 @@
  * s_lock.c
  *	   Hardware-dependent implementation of spinlocks.
  *
+ * When waiting for a contended spinlock we loop tightly for awhile, then
+ * delay using pg_usleep() and try again.  Preferably, "awhile" should be a
+ * small multiple of the maximum time we expect a spinlock to be held.  100
+ * iterations seems about right as an initial guess.  However, on a
+ * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
+ * it's usually better to spin a bit longer than to call the kernel, so we try
+ * to adapt the spin loop count depending on whether we seem to be in a
+ * uniprocessor or multiprocessor.
+ *
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure.  This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
+ * Once we do decide to block, we use randomly increasing pg_usleep()
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again.  The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock.  (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.)  But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
+ *
+ * We time out and declare error after NUM_DELAYS delays (thus, exactly
+ * that many tries).  With the given settings, this will usually take 2 or
+ * so minutes.  It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
  *
  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -21,6 +53,14 @@
 #include "storage/s_lock.h"
 #include "storage/barrier.h"
 
+
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
+#define NUM_DELAYS			1000
+#define MIN_DELAY_USEC		1000L
+#define MAX_DELAY_USEC		1000000L
+
+
 slock_t		dummy_spinlock;
 
 static int	spins_per_delay = DEFAULT_SPINS_PER_DELAY;
@@ -30,128 +70,35 @@ static int	spins_per_delay = DEFAULT_SPINS_PER_DELAY;
  * s_lock_stuck() - complain about a stuck spinlock
  */
 static void
-s_lock_stuck(volatile slock_t *lock, const char *file, int line)
+s_lock_stuck(void *p, const char *file, int line)
 {
 #if defined(S_LOCK_TEST)
 	fprintf(stderr,
 			"\nStuck spinlock (%p) detected at %s:%d.\n",
-			lock, file, line);
+			p, file, line);
 	exit(1);
 #else
 	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
-		 lock, file, line);
+		 p, file, line);
 #endif
 }
 
-
 /*
  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
  */
 int
 s_lock(volatile slock_t *lock, const char *file, int line)
 {
-	/*
-	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
-	 * Preferably, "awhile" should be a small multiple of the maximum time we
-	 * expect a spinlock to be held.  100 iterations seems about right as an
-	 * initial guess.  However, on a uniprocessor the loop is a waste of
-	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
-	 * longer than to call the kernel, so we try to adapt the spin loop count
-	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
-	 *
-	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
-	 * be wrong; there are platforms where that can result in a "stuck
-	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
-	 * that the first TAS after returning from kernel space will always fail
-	 * on that hardware.
-	 *
-	 * Once we do decide to block, we use randomly increasing pg_usleep()
-	 * delays. The first delay is 1 msec, then the delay randomly increases to
-	 * about one second, after which we reset to 1 msec and start again.  The
-	 * idea here is that in the presence of heavy contention we need to
-	 * increase the delay, else the spinlock holder may never get to run and
-	 * release the lock.  (Consider situation where spinlock holder has been
-	 * nice'd down in priority by the scheduler --- it will not get scheduled
-	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
-	 * sleep, there is a real possibility of starvation.)  But we can't just
-	 * clamp the delay to an upper bound, else it would take a long time to
-	 * make a reasonable number of tries.
-	 *
-	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
-	 * that many tries).  With the given settings, this will usually take 2 or
-	 * so minutes.  It seems better to fix the total number of tries (and thus
-	 * the probability of unintended failure) than to fix the total time
-	 * spent.
-	 */
-#define MIN_SPINS_PER_DELAY 10
-#define MAX_SPINS_PER_DELAY 1000
-#define NUM_DELAYS			1000
-#define MIN_DELAY_USEC		1000L
-#define MAX_DELAY_USEC		1000000L
-
-	int			spins = 0;
-	int			delays = 0;
-	int			cur_delay = 0;
+	SpinDelayStatus delayStatus = init_spin_delay((void *) lock);
 
 	while (TAS_SPIN(lock))
 	{
-		/* CPU-specific delay each time through the loop */
-		SPIN_DELAY();
-
-		/* Block the process every spins_per_delay tries */
-		if (++spins >= spins_per_delay)
-		{
-			if (++delays > NUM_DELAYS)
-				s_lock_stuck(lock, file, line);
-
-			if (cur_delay == 0) /* first time to delay? */
-				cur_delay = MIN_DELAY_USEC;
-
-			pg_usleep(cur_delay);
-
-#if defined(S_LOCK_TEST)
-			fprintf(stdout, "*");
-			fflush(stdout);
-#endif
-
-			/* increase delay by a random fraction between 1X and 2X */
-			cur_delay += (int) (cur_delay *
-					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
-			/* wrap back to minimum delay when max is exceeded */
-			if (cur_delay > MAX_DELAY_USEC)
-				cur_delay = MIN_DELAY_USEC;
-
-			spins = 0;
-		}
+		perform_spin_delay(&delayStatus);
 	}
 
-	/*
-	 * If we were able to acquire the lock without delaying, it's a good
-	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
-	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
-	 * decrement spins_per_delay slowly when we had to delay, and increase it
-	 * rapidly when we didn't.  It's expected that spins_per_delay will
-	 * converge to the minimum value on a uniprocessor and to the maximum
-	 * value on a multiprocessor.
-	 *
-	 * Note: spins_per_delay is local within our current process. We want to
-	 * average these observations across multiple backends, since it's
-	 * relatively rare for this function to even get entered, and so a single
-	 * backend might not live long enough to converge on a good value.  That
-	 * is handled by the two routines below.
-	 */
-	if (cur_delay == 0)
-	{
-		/* we never had to delay */
-		if (spins_per_delay < MAX_SPINS_PER_DELAY)
-			spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
-	}
-	else
-	{
-		if (spins_per_delay > MIN_SPINS_PER_DELAY)
-			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
-	}
-	return delays;
+	finish_spin_delay(&delayStatus);
+
+	return delayStatus.delays;
 }
 
 #ifdef USE_DEFAULT_S_UNLOCK
@@ -168,6 +115,75 @@ s_unlock(volatile slock_t *lock)
 #endif
 
 /*
+ * Wait while spinning on a contended spinlock.
+ */
+void
+perform_spin_delay(SpinDelayStatus *status)
+{
+	/* CPU-specific delay each time through the loop */
+	SPIN_DELAY();
+
+	/* Block the process every spins_per_delay tries */
+	if (++(status->spins) >= spins_per_delay)
+	{
+		if (++(status->delays) > NUM_DELAYS)
+			s_lock_stuck(status->ptr, status->file, status->line);
+
+		if (status->cur_delay == 0)		/* first time to delay? */
+			status->cur_delay = MIN_DELAY_USEC;
+
+		pg_usleep(status->cur_delay);
+
+#if defined(S_LOCK_TEST)
+		fprintf(stdout, "*");
+		fflush(stdout);
+#endif
+
+		/* increase delay by a random fraction between 1X and 2X */
+		status->cur_delay += (int) (status->cur_delay *
+					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
+		/* wrap back to minimum delay when max is exceeded */
+		if (status->cur_delay > MAX_DELAY_USEC)
+			status->cur_delay = MIN_DELAY_USEC;
+
+		status->spins = 0;
+	}
+}
+
+/*
+ * After acquiring a spinlock, update estimates about how long to loop.
+ *
+ * If we were able to acquire the lock without delaying, it's a good
+ * indication we are in a multiprocessor.  If we had to delay, it's a sign
+ * (but not a sure thing) that we are in a uniprocessor. Hence, we
+ * decrement spins_per_delay slowly when we had to delay, and increase it
+ * rapidly when we didn't.  It's expected that spins_per_delay will
+ * converge to the minimum value on a uniprocessor and to the maximum
+ * value on a multiprocessor.
+ *
+ * Note: spins_per_delay is local within our current process. We want to
+ * average these observations across multiple backends, since it's
+ * relatively rare for this function to even get entered, and so a single
+ * backend might not live long enough to converge on a good value.  That
+ * is handled by the two routines below.
+ */
+void
+finish_spin_delay(SpinDelayStatus *status)
+{
+	if (status->cur_delay == 0)
+	{
+		/* we never had to delay */
+		if (spins_per_delay < MAX_SPINS_PER_DELAY)
+			spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+	}
+	else
+	{
+		if (spins_per_delay > MIN_SPINS_PER_DELAY)
+			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+	}
+}
+
+/*
  * Set local copy of spins_per_delay during backend startup.
  *
  * NB: this has to be pretty fast as it is called while holding a spinlock
diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h
index 61e3060..b2d7776 100644
--- a/src/include/postmaster/postmaster.h
+++ b/src/include/postmaster/postmaster.h
@@ -63,12 +63,15 @@ extern void ShmemBackendArrayAllocation(void);
 #endif
 
 /*
- * Note: MAX_BACKENDS is limited to 2^23-1 because inval.c stores the
- * backend ID as a 3-byte signed integer.  Even if that limitation were
- * removed, we still could not exceed INT_MAX/4 because some places compute
- * 4*MaxBackends without any overflow check.  This is rechecked in the relevant
- * GUC check hooks and in RegisterBackgroundWorker().
+ * Note: MAX_BACKENDS is limited to 2^18-1 because that's the width reserved
+ * for buffer references in buf_internals.h.  This limitation could be lifted
+ * by using a 64bit state; but it's unlikely to be worthwhile as 2^18-1
+ * backends exceed currently realistic configurations. Even if that limitation
+ * were removed, we still could not a) exceed 2^23-1 because inval.c stores
+ * the backend ID as a 3-byte signed integer, b) INT_MAX/4 because some places
+ * compute 4*MaxBackends without any overflow check.  This is rechecked in the
+ * relevant GUC check hooks and in RegisterBackgroundWorker().
  */
-#define MAX_BACKENDS	0x7fffff
+#define MAX_BACKENDS	0x3FFFF
 
 #endif   /* _POSTMASTER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index d04363b..625a389 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -21,29 +21,51 @@
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
 #include "storage/smgr.h"
+#include "port/atomics.h"
 #include "storage/spin.h"
 #include "utils/relcache.h"
 
 
 /*
+ * Buffer state is a single 32-bit variable where the following data is combined.
+ *
+ * - 18 bits refcount
+ * - 4 bits usage count
+ * - 10 bits of flags
+ *
+ * Combining these values allows us to perform some operations without locking
+ * the buffer header, by modifying them together with a CAS loop.
+ *
+ * The definition of buffer state components is below.
+ */
+#define BUF_REFCOUNT_ONE 1
+#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+#define BUF_USAGECOUNT_MASK 0x003C0000U
+#define BUF_USAGECOUNT_ONE (1U << 18)
+#define BUF_USAGECOUNT_SHIFT 18
+#define BUF_FLAG_MASK 0xFFC00000U
+
+/* Get refcount and usagecount from buffer state */
+#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+
+/*
  * Flags for buffer descriptors
  *
  * Note: TAG_VALID essentially means that there is a buffer hashtable
  * entry associated with the buffer's tag.
  */
-#define BM_DIRTY				(1 << 0)		/* data needs writing */
-#define BM_VALID				(1 << 1)		/* data is valid */
-#define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
-#define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
-#define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
-#define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
-#define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
-#define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
-#define BM_PERMANENT			(1 << 8)		/* permanent relation (not
+#define BM_LOCKED				(1U << 22)		/* buffer header is locked */
+#define BM_DIRTY				(1U << 23)		/* data needs writing */
+#define BM_VALID				(1U << 24)		/* data is valid */
+#define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
+#define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
+#define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
+#define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
+#define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
+#define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
+#define BM_PERMANENT			(1U << 31)		/* permanent relation (not
 												 * unlogged) */
-
-typedef bits16 BufFlags;
-
 /*
  * The maximum allowed value of usage_count represents a tradeoff between
  * accuracy and speed of the clock-sweep buffer management algorithm.  A
@@ -113,18 +135,29 @@ typedef struct buftag
 /*
  *	BufferDesc -- shared descriptor/state data for a single shared buffer.
  *
- * Note: buf_hdr_lock must be held to examine or change the tag, flags,
- * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
- * changes after initialization, so does not need locking.  freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
- * take care of itself.  The buf_hdr_lock is *not* used to control access to
- * the data in the buffer!
+ * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
+ * the tag, state or wait_backend_pid fields.  In general, buffer header lock
+ * is a spinlock which is combined with flags, refcount and usagecount into
+ * a single atomic variable.  This layout allows us to do some operations in
+ * a single atomic operation, without actually acquiring and releasing the spinlock;
+ * for instance, increase or decrease refcount.  buf_id field never changes
+ * after initialization, so does not need locking.  freeNext is protected by
+ * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
+ * of itself.  The buffer header lock is *not* used to control access to the
+ * data in the buffer!
+ *
+ * It's assumed that nobody changes the state field while buffer header lock
+ * is held.  Thus the buffer header lock holder can do complex updates of the
+ * state variable in a single write, simultaneously with the lock release
+ * (clearing the BM_LOCKED flag).  On the other hand, updating the state
+ * without holding the buffer header lock is restricted to CAS, which ensures
+ * that the BM_LOCKED flag is not set.  Atomic increment/decrement, OR/AND
+ * etc. are not allowed.
  *
  * An exception is that if we have the buffer pinned, its tag can't change
- * underneath us, so we can examine the tag without locking the spinlock.
+ * underneath us, so we can examine the tag without locking the buffer header.
  * Also, in places we do one-time reads of the flags without bothering to
- * lock the spinlock; this is generally for situations where we don't expect
- * the flag bit being tested to be changing.
+ * lock the buffer header; this is generally for situations where we don't
+ * expect the flag bit being tested to be changing.
  *
  * We can't physically remove items from a disk page if another backend has
  * the buffer pinned.  Hence, a backend may need to wait for all other pins
@@ -142,13 +175,12 @@ typedef struct buftag
 typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
-	BufFlags	flags;			/* see bit definitions above */
-	uint8		usage_count;	/* usage counter for clock sweep code */
-	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
-	int			wait_backend_pid;		/* backend PID of pin-count waiter */
-
 	int			buf_id;			/* buffer's index number (from 0) */
+
+	/* state of the tag, containing flags, refcount and usagecount */
+	pg_atomic_uint32 state;
+
+	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 	int			freeNext;		/* link in freelist chain */
 
 	LWLock		content_lock;	/* to lock access to buffer contents */
@@ -202,11 +234,15 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
 #define FREENEXT_NOT_IN_LIST	(-2)
 
 /*
- * Macros for acquiring/releasing a shared buffer header's spinlock.
- * Do not apply these to local buffers!
+ * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
+ * not apply these to local buffers!
  */
-#define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
-#define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
+extern uint32 LockBufHdr(BufferDesc *desc);
+#define UnlockBufHdr(desc, s)	\
+	do {	\
+		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
+		pg_write_barrier(); \
+	} while (0)
 
 
 /*
@@ -267,7 +303,8 @@ extern void IssuePendingWritebacks(WritebackContext *context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
 
 /* freelist.c */
-extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
+extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+											  uint32 *state);
 extern void StrategyFreeBuffer(BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 BufferDesc *buf);
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 8b240cd..11410e2 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -991,4 +991,22 @@ extern int s_lock(volatile slock_t *lock, const char *file, int line);
 extern void set_spins_per_delay(int shared_spins_per_delay);
 extern int	update_spins_per_delay(int shared_spins_per_delay);
 
+/*
+ * Support for spin delay which is useful in various places where
+ * spinlock-like procedures take place.
+ */
+typedef struct
+{
+	int			spins;
+	int			delays;
+	int			cur_delay;
+	void	   *ptr;
+	const char *file;
+	int			line;
+} SpinDelayStatus;
+
+#define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+void perform_spin_delay(SpinDelayStatus *status);
+void finish_spin_delay(SpinDelayStatus *status);
+
 #endif	 /* S_LOCK_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e293fc0..cdaad20 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1859,6 +1859,7 @@ SpGistScanOpaqueData
 SpGistState
 SpGistTypeDesc
 SpecialJoinInfo
+SpinDelayStatus
 SplitInterval
 SplitLR
 SplitVar
-- 
2.7.0.229.g701fa7f
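
To make the packed state layout above concrete: with the refcount in the
low 18 bits, the usage count just above it, and the flags at the top, a
state word can be assembled and picked apart with the masks the patch
defines. A minimal sketch (the function name is illustrative only):

static void
StateLayoutExample(void)
{
	/* refcount 3, usage count 2, tag and contents valid */
	uint32		state = 3
		| (2U << BUF_USAGECOUNT_SHIFT)
		| BM_VALID | BM_TAG_VALID;

	Assert(BUF_STATE_GET_REFCOUNT(state) == 3);
	Assert(BUF_STATE_GET_USAGECOUNT(state) == 2);
	Assert((state & BUF_FLAG_MASK) == (BM_VALID | BM_TAG_VALID));
}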

#134Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#133)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Apr 8, 2016 at 7:39 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-07 16:50:44 +0300, Alexander Korotkov wrote:

On Thu, Apr 7, 2016 at 4:41 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-03-31 20:21:02 +0300, Alexander Korotkov wrote:

!	BEGIN_BUFSTATE_CAS_LOOP(bufHdr);

!	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
!	wasDirty = (state & BM_DIRTY) ? true : false;
!	state |= BM_DIRTY | BM_JUST_DIRTIED;
!	if (state == oldstate)
!		break;

I'm doubtful that this early exit is entirely safe. None of the
preceding operations imply a memory barrier. The buffer could
previously have been marked dirty, but cleaned since. It's pretty
critical that we re-set the dirty bit (there's no danger of losing it
with a barrier, because we hold an exclusive content lock).

Oh, I get it.

Practically the risk seems fairly low, because acquiring the exclusive
content lock will have implied a barrier. But it seems unlikely to have
a measurable performance effect to me, so I'd rather not add the early
exit.

Ok, let's just remove it.
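
To make the agreed fix concrete, the loop without the early exit ends up
shaped roughly like the sketch below. This assumes the BUF_STATE_GET_REFCOUNT
accessor and the WaitBufHdrUnlocked() helper from the patch upthread; it is
an illustration of the discussion, not the exact committed code:

static void
MarkDirtySketch(BufferDesc *bufHdr)
{
	uint32		old_buf_state = pg_atomic_read_u32(&bufHdr->state);
	uint32		buf_state;

	for (;;)
	{
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(bufHdr);

		buf_state = old_buf_state;

		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
		buf_state |= BM_DIRTY | BM_JUST_DIRTIED;

		/*
		 * No "if (buf_state == old_buf_state) break;" here: the initial
		 * read was not preceded by a memory barrier and could be stale,
		 * so always attempt the CAS and re-set the dirty bit.
		 */
		if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
										   buf_state))
			break;
	}
}

On failure the CAS refreshes old_buf_state with the current value, so each
iteration re-applies the flags against fresh state.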

Here's my updated version of the patch. I've updated this on an
intercontinental flight, after an otherwise hectic week (moving from SF
to Berlin); so I'm planning to look over this once more before pushing.

Ok.

I've decided that the cas-loop macros are too obfuscating for my
taste. To avoid duplicating the wait part I've introduced
WaitBufHdrUnlocked().

That's OK for me. The CAS-loop macros look cute, but they're too magic.
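
For readers following along, here is roughly how WaitBufHdrUnlocked() slots
into the open-coded pin loop that replaces the macros. The BUF_* accessors
are from the patch upthread, and the function name is illustrative; the
committed PinBuffer() may differ in detail:

static bool
PinSketch(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;
	bool		result;

	for (;;)
	{
		/* spin-wait until the header lock is released, instead of taking it */
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state;

		/* increase refcount */
		buf_state += BUF_REFCOUNT_ONE;

		/* increase usagecount unless it is already at the maximum */
		if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
			buf_state += BUF_USAGECOUNT_ONE;

		/* on failure the CAS refreshes old_buf_state, so just loop */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
		{
			result = (buf_state & BM_VALID) != 0;
			break;
		}
	}
	return result;
}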

As you can see in

http://archives.postgresql.org/message-id/CA%2BTgmoaeRbN%3DZ4oWENLvgGLeHEvGZ_S_Z3KGrdScyKiSvNt3oA%40mail.gmail.com
I'm planning to apply this sometime this weekend, after running some
tests and going over the patch again.

Any chance you could have a look over this?

I took a look at this. The changes you made look good to me.
I also ran a test on a 4x18 Intel server.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#135Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#134)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Apr 8, 2016 at 10:19 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Fri, Apr 8, 2016 at 7:39 PM, Andres Freund <andres@anarazel.de> wrote:

As you can see in

http://archives.postgresql.org/message-id/CA%2BTgmoaeRbN%3DZ4oWENLvgGLeHEvGZ_S_Z3KGrdScyKiSvNt3oA%40mail.gmail.com
I'm planning to apply this sometime this weekend, after running some
tests and going over the patch again.

Any chance you could have a look over this?

I took a look at this. The changes you made look good to me.
I also ran a test on a 4x18 Intel server.

On top of the current master, the results are as follows:

clients TPS
1 12562
2 25604
4 52661
8 103209
10 128599
20 256872
30 365718
40 432749
50 513528
60 684943
70 696050
80 923350
90 1119776
100 1208027
110 1229429
120 1163356
130 1107924
140 1084344
150 1014064
160 961730
170 980743
180 968419

The results are quite discouraging because previously we had about 1.5M TPS
at the peak while we have only about 1.2M now. I found that it's not
related to the changes you made in the patch, but to 5364b357
"Increase maximum number of clog buffers". I'm running the same benchmark
with 5364b357 reverted.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#136Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#135)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Apr 9, 2016 at 11:24 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Fri, Apr 8, 2016 at 10:19 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Fri, Apr 8, 2016 at 7:39 PM, Andres Freund <andres@anarazel.de> wrote:

As you can see in

http://archives.postgresql.org/message-id/CA%2BTgmoaeRbN%3DZ4oWENLvgGLeHEvGZ_S_Z3KGrdScyKiSvNt3oA%40mail.gmail.com
I'm planning to apply this sometime this weekend, after running some
tests and going over the patch again.

Any chance you could have a look over this?

I took a look at this. The changes you made look good to me.
I also ran a test on a 4x18 Intel server.

On top of the current master, the results are as follows:

clients TPS
1 12562
2 25604
4 52661
8 103209
10 128599
20 256872
30 365718
40 432749
50 513528
60 684943
70 696050
80 923350
90 1119776
100 1208027
110 1229429
120 1163356
130 1107924
140 1084344
150 1014064
160 961730
170 980743
180 968419

The results are quite discouraging because previously we had about 1.5M
TPS at the peak while we have only about 1.2M now. I found that it's not
related to the changes you made in the patch, but to 5364b357
"Increase maximum number of clog buffers". I'm running the same benchmark
with 5364b357 reverted.

There are results with 5364b357 reverted.

clients TPS
1 12980
2 27105
4 51969
8 105507
10 132811
20 256888
30 368573
40 467605
50 544231
60 590898
70 799094
80 967569
90 1211662
100 1352427
110 1432561
120 1480324
130 1486624
140 1492092
150 1461681
160 1426733
170 1409081
180 1366199

It's much closer to what we had before.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#137Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#136)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

It's much closer to what we had before.

I'm going to apply this later then. If there's some micro-optimization
for large x86, we can look into that later.

Greetings,

Andres Freund


#138Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#137)
Re: Move PinBuffer and UnpinBuffer to atomics

On April 9, 2016 12:43:03 PM PDT, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

How sure are you about these measurements? Because there really shouldn't be clog lookups once a steady state is reached...

Andres
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.


#139Amit Kapila
amit.kapila16@gmail.com
In reply to: Andres Freund (#137)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 10, 2016 at 1:13 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

What exactly is this test?
I am assuming it is a read-only -M prepared pgbench run where the data fits
in shared buffers. However, if you can share the exact details, then I can
try a similar test.

Crazy that this has such a negative impact. Amit, can you reproduce
that?

I will try it.

Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

I also think so. Alexander, if you try a read-write workload with unlogged
tables, then we should see an improvement.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#140Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#138)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sat, Apr 9, 2016 at 10:49 PM, Andres Freund <andres@anarazel.de> wrote:

On April 9, 2016 12:43:03 PM PDT, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

How sure are you about these measurements?

I'm pretty sure. I've retried it multiple times by hand before re-running
the script.

Because there really shouldn't be clog lookups once a steady state is
reached...

Hm... I'm also surprised. There shouldn't be clog lookups once hint bits
are set.
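
For context, the reason a steady state stops touching the clog: once a
tuple's xmin has been checked once, the visibility code stamps a hint bit
on the tuple so later readers can answer without a clog lookup. A rough
sketch in the spirit of tqual.c (the helper names follow that file, but
this is an illustration; aborted/in-progress handling is elided):

static bool
XminCommittedSketch(HeapTupleHeader tuple, Buffer buffer)
{
	/* hint bit already set: answer without touching the clog */
	if (tuple->t_infomask & HEAP_XMIN_COMMITTED)
		return true;

	/* otherwise consult the clog once, then cache the answer as a hint bit */
	if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
	{
		SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
					HeapTupleHeaderGetXmin(tuple));
		return true;
	}
	return false;
}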

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#141Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#139)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 10, 2016 at 7:26 AM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Sun, Apr 10, 2016 at 1:13 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

What exactly is this test?
I am assuming it is a read-only -M prepared pgbench run where the data fits
in shared buffers. However, if you can share the exact details, then I can
try a similar test.

Yes, the test is:

pgbench -s 1000 -c $clients -j 100 -M prepared -S -T 300
(shared_buffers=24GB)

Crazy that this has such a negative impact. Amit, can you reproduce
that?

I will try it.

Good.

Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

I also think so. Alexander, if you try a read-write workload with unlogged
tables, then we should see an improvement.

I'll try a read-write workload.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#142Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#140)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 10, 2016 at 8:36 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sat, Apr 9, 2016 at 10:49 PM, Andres Freund <andres@anarazel.de> wrote:

On April 9, 2016 12:43:03 PM PDT, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

How sure are you about these measurements?

I'm pretty sure. I've retried it multiple times by hand before re-running
the script.

Because there really shouldn't be clog lookups once a steady state is
reached...

Hm... I'm also surprised. There shouldn't be clog lookups once hint bits
are set.

I also tried to run perf top during pgbench and got some interesting
results.

Without 5364b357:
5,69% postgres [.] GetSnapshotData
4,47% postgres [.] LWLockAttemptLock
3,81% postgres [.] _bt_compare
3,42% postgres [.] hash_search_with_hash_value
3,08% postgres [.] LWLockRelease
2,49% postgres [.] PinBuffer.isra.3
1,58% postgres [.] AllocSetAlloc
1,17% [kernel] [k] __schedule
1,15% postgres [.] PostgresMain
1,13% libc-2.17.so [.] vfprintf
1,01% libc-2.17.so [.] __memcpy_ssse3_back

With 5364b357:
18,54% postgres [.] GetSnapshotData
3,45% postgres [.] LWLockRelease
3,27% postgres [.] LWLockAttemptLock
3,21% postgres [.] _bt_compare
2,93% postgres [.] hash_search_with_hash_value
2,00% postgres [.] PinBuffer.isra.3
1,32% postgres [.] AllocSetAlloc
1,10% libc-2.17.so [.] vfprintf

Very surprising. It appears that after 5364b357, GetSnapshotData consumes
more time. But I can't see anything depending on clog buffers
in the GetSnapshotData code...

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#143Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#142)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 10, 2016 at 11:33 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sun, Apr 10, 2016 at 8:36 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sat, Apr 9, 2016 at 10:49 PM, Andres Freund <andres@anarazel.de>
wrote:

On April 9, 2016 12:43:03 PM PDT, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

How sure are you about these measurements?

I'm pretty sure. I've retried it multiple times by hand before re-running
the script.

Because there really shouldn't be clog lookups once a steady state is
reached...

Hm... I'm also surprised. There shouldn't be clog lookups once hint bits
are set.

I also tried to run perf top during pgbench and got some interesting
results.

Without 5364b357:
5,69% postgres [.] GetSnapshotData
4,47% postgres [.] LWLockAttemptLock
3,81% postgres [.] _bt_compare
3,42% postgres [.] hash_search_with_hash_value
3,08% postgres [.] LWLockRelease
2,49% postgres [.] PinBuffer.isra.3
1,58% postgres [.] AllocSetAlloc
1,17% [kernel] [k] __schedule
1,15% postgres [.] PostgresMain
1,13% libc-2.17.so [.] vfprintf
1,01% libc-2.17.so [.] __memcpy_ssse3_back

With 5364b357:
18,54% postgres [.] GetSnapshotData
3,45% postgres [.] LWLockRelease
3,27% postgres [.] LWLockAttemptLock
3,21% postgres [.] _bt_compare
2,93% postgres [.] hash_search_with_hash_value
2,00% postgres [.] PinBuffer.isra.3
1,32% postgres [.] AllocSetAlloc
1,10% libc-2.17.so [.] vfprintf

Very surprising. It appears that after 5364b357, GetSnapshotData consumes
more time. But I can't see anything depending on clog buffers
in the GetSnapshotData code...

There is a related fact presented by Mithun C Y as well [1] which suggests
that Andres's idea of reducing the cost of snapshots shows a noticeable gain
after increasing the clog buffers. If you read that thread you will notice
that initially we didn't see much gain from that idea, but with increased
clog buffers it started showing a noticeable gain. If by any chance you can
apply that patch, please see the results (the latest patch is at [2]).

[1]: /messages/by-id/CAD__Ouic1Tvnwqm6Wf6j7Cz1Kk1DQgmy0isC7=OgX+3JtfGk9g@mail.gmail.com

[2]: /messages/by-id/CAD__OuiwEi5sHe2wwQCK36Ac9QMhvJuqG3CfPN+OFCMb7rdruQ@mail.gmail.com

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#144Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#141)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 10, 2016 at 11:10 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sun, Apr 10, 2016 at 7:26 AM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Sun, Apr 10, 2016 at 1:13 AM, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

What exactly is this test?
I am assuming it is a read-only -M prepared pgbench run where the data
fits in shared buffers. However, if you can share the exact details, then I
can try a similar test.

Yes, the test is:

pgbench -s 1000 -c $clients -j 100 -M prepared -S -T 300
(shared_buffers=24GB)

Crazy that this has such a negative impact. Amit, can you reproduce
that?

I will try it.

Good.

Okay, I have done some performance testing of read-only tests with
configuration suggested by you to see the impact

pin_unpin - latest version of pin unpin patch on top of HEAD.
pin_unpin_clog_32 - pin_unpin + change clog buffers to 32

Client_Count/Patch_ver 64 128
pin_unpin 330280 133586
pin_unpin_clog_32 388244 132388

This shows that at 64 clients, the performance is better with 32 clog
buffers. However, I think this is mostly because contention seems to have
shifted to ProcArrayLock, as indicated to an extent in Alexander's mail.
I will also try with the cache-the-snapshot patch and with 64 clog buffers.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#145Amit Kapila
amit.kapila16@gmail.com
In reply to: Amit Kapila (#144)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 10, 2016 at 6:15 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Sun, Apr 10, 2016 at 11:10 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sun, Apr 10, 2016 at 7:26 AM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Sun, Apr 10, 2016 at 1:13 AM, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

What exactly is this test?
I am assuming it is a read-only -M prepared pgbench run where the data
fits in shared buffers. However, if you can share the exact details, then I
can try a similar test.

Yes, the test is:

pgbench -s 1000 -c $clients -j 100 -M prepared -S -T 300
(shared_buffers=24GB)

Crazy that this has such a negative impact. Amit, can you reproduce
that?

I will try it.

Good.

Okay, I have done some performance testing of the read-only test with
the configuration you suggested, to see the impact.

pin_unpin - latest version of pin unpin patch on top of HEAD.
pin_unpin_clog_32 - pin_unpin + change clog buffers to 32

Client_Count/Patch_ver 64 128
pin_unpin 330280 133586
pin_unpin_clog_32 388244 132388

This shows that at 64 clients, the performance is better with 32 clog
buffers. However, I think this is mostly because contention seems to have
shifted to ProcArrayLock, as indicated to an extent in Alexander's mail.
I will also try with the cache-the-snapshot patch and with 64 clog buffers.

I went ahead and tried the cache-the-snapshot patch and 64 clog buffers;
the performance data is below:

Description of patches

pin_unpin - latest version of pin unpin patch on top of HEAD.
pin_unpin_clog_32 - pin_unpin + change clog buffers to 32
pin_unpin_cache_snapshot - pin_unpin + Cache the snapshot
pin_unpin_clog_64 - pin_unpin + change clog buffers to 64

Client_Count/Patch_ver 64 128
pin_unpin 330280 133586
pin_unpin_clog_32 388244 132388
pin_unpin_cache_snapshot 412149 144799
pin_unpin_clog_64 391472 132951

The above data seems to indicate that the cache-the-snapshot patch will push
performance further up with clog buffers at 128 (HEAD). I will also take
performance data with pin_unpin + 32 clog buffers + cache the snapshot, but
the above seems a good enough indication that keeping clog buffers at 128 is
a good move, considering we will one day improve GetSnapshotData either by
the cache-the-snapshot technique or some other way. Setting clog buffers to
64 instead of 128 also seems to address the regression (at the very least in
the above tests), but for read-write performance 128 clog buffers has better
numbers, though the difference between 64 and 128 is not very high.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#146Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#142)
3 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-10 09:03:37 +0300, Alexander Korotkov wrote:

On Sun, Apr 10, 2016 at 8:36 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sat, Apr 9, 2016 at 10:49 PM, Andres Freund <andres@anarazel.de> wrote:

On April 9, 2016 12:43:03 PM PDT, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

How sure are you about these measurements?

I'm pretty sure. I've retried it multiple times by hand before re-running
the script.

Because there really shouldn't be clog lookups once a steady state is
reached...

Hm... I'm also surprised. There shouldn't be clog lookups once hint bits
are set.

I also tried to run perf top during pgbench and got some interesting
results.

Without 5364b357:
5,69% postgres [.] GetSnapshotData
4,47% postgres [.] LWLockAttemptLock
3,81% postgres [.] _bt_compare
3,42% postgres [.] hash_search_with_hash_value
3,08% postgres [.] LWLockRelease
2,49% postgres [.] PinBuffer.isra.3
1,58% postgres [.] AllocSetAlloc
1,17% [kernel] [k] __schedule
1,15% postgres [.] PostgresMain
1,13% libc-2.17.so [.] vfprintf
1,01% libc-2.17.so [.] __memcpy_ssse3_back

With 5364b357:
18,54% postgres [.] GetSnapshotData
3,45% postgres [.] LWLockRelease
3,27% postgres [.] LWLockAttemptLock
3,21% postgres [.] _bt_compare
2,93% postgres [.] hash_search_with_hash_value
2,00% postgres [.] PinBuffer.isra.3
1,32% postgres [.] AllocSetAlloc
1,10% libc-2.17.so [.] vfprintf

Very surprising. It appears that after 5364b357, GetSnapshotData consumes
more time. But I can't see anything depending on clog buffers
in the GetSnapshotData code...

Could you retry after applying the attached series of patches?

- Andres

Attachments:

0001-Finish-09adc9a8c09c9640de05c7023b27fb83c761e91c.patchtext/x-patch; charset=us-asciiDownload
From e8ad791c97004c64a1f27a500ba100b69fdc8d87 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sun, 10 Apr 2016 21:47:04 -0700
Subject: [PATCH 1/3] Finish 09adc9a8c09c9640de05c7023b27fb83c761e91c.

---
 src/backend/access/transam/parallel.c |  2 +-
 src/backend/port/sysv_shmem.c         |  2 +-
 src/backend/port/win32_shmem.c        |  2 +-
 src/backend/storage/buffer/buf_init.c | 19 ++++++-------------
 src/backend/storage/ipc/shm_toc.c     |  6 +++---
 src/backend/storage/ipc/shmem.c       | 17 ++++++++++-------
 src/include/c.h                       |  2 --
 src/include/pg_config_manual.h        |  8 --------
 src/include/storage/shm_toc.h         |  2 +-
 9 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
index 0bba9a7..f8ea89c 100644
--- a/src/backend/access/transam/parallel.c
+++ b/src/backend/access/transam/parallel.c
@@ -237,7 +237,7 @@ InitializeParallelDSM(ParallelContext *pcxt)
 		shm_toc_estimate_keys(&pcxt->estimator, 6);
 
 		/* Estimate space need for error queues. */
-		StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) ==
+		StaticAssertStmt(CACHELINEALIGN(PARALLEL_ERROR_QUEUE_SIZE) ==
 						 PARALLEL_ERROR_QUEUE_SIZE,
 						 "parallel error queue size not buffer-aligned");
 		shm_toc_estimate_chunk(&pcxt->estimator,
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 6c442b9..084bc31 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -559,7 +559,7 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port,
 	 * Initialize space allocation status for segment.
 	 */
 	hdr->totalsize = size;
-	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+	hdr->freeoffset = CACHELINEALIGN(sizeof(PGShmemHeader));
 	*shim = hdr;
 
 	/* Save info for possible future use */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 0ff2c7e..81705fc 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -241,7 +241,7 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port,
 	 * Initialize space allocation status for segment.
 	 */
 	hdr->totalsize = size;
-	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+	hdr->freeoffset = CACHELINEALIGN(sizeof(PGShmemHeader));
 	hdr->dsm_control = 0;
 
 	/* Save info for possible future use */
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index a5cffc7..61f9c34 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -76,11 +76,9 @@ InitBufferPool(void)
 
 	/* Align descriptors to a cacheline boundary. */
 	BufferDescriptors = (BufferDescPadded *)
-		CACHELINEALIGN(
-					   ShmemInitStruct("Buffer Descriptors",
-									   NBuffers * sizeof(BufferDescPadded)
-									   + PG_CACHE_LINE_SIZE,
-									   &foundDescs));
+		ShmemInitStruct("Buffer Descriptors",
+						NBuffers * sizeof(BufferDescPadded),
+						&foundDescs);
 
 	BufferBlocks = (char *)
 		ShmemInitStruct("Buffer Blocks",
@@ -88,10 +86,9 @@ InitBufferPool(void)
 
 	/* Align lwlocks to cacheline boundary */
 	BufferIOLWLockArray = (LWLockMinimallyPadded *)
-		CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
-							  NBuffers * (Size) sizeof(LWLockMinimallyPadded)
-									   + PG_CACHE_LINE_SIZE,
-									   &foundIOLocks));
+		ShmemInitStruct("Buffer IO Locks",
+						NBuffers * (Size) sizeof(LWLockMinimallyPadded),
+						&foundIOLocks);
 
 	BufferIOLWLockTranche.name = "buffer_io";
 	BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
@@ -179,8 +176,6 @@ BufferShmemSize(void)
 
 	/* size of buffer descriptors */
 	size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded)));
-	/* to allow aligning buffer descriptors */
-	size = add_size(size, PG_CACHE_LINE_SIZE);
 
 	/* size of data pages */
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
@@ -197,8 +192,6 @@ BufferShmemSize(void)
 	 * not highly contentended, we lay out the array with minimal padding.
 	 */
 	size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
-	/* to allow aligning the above */
-	size = add_size(size, PG_CACHE_LINE_SIZE);
 
 	/* size of checkpoint sort array in bufmgr.c */
 	size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c
index 55248c2..683abc4 100644
--- a/src/backend/storage/ipc/shm_toc.c
+++ b/src/backend/storage/ipc/shm_toc.c
@@ -89,7 +89,7 @@ shm_toc_allocate(shm_toc *toc, Size nbytes)
 	Size		toc_bytes;
 
 	/* Make sure request is well-aligned. */
-	nbytes = BUFFERALIGN(nbytes);
+	nbytes = CACHELINEALIGN(nbytes);
 
 	SpinLockAcquire(&toc->toc_mutex);
 
@@ -133,8 +133,8 @@ shm_toc_freespace(shm_toc *toc)
 	SpinLockRelease(&toc->toc_mutex);
 
 	toc_bytes = offsetof(shm_toc, toc_entry) +nentry * sizeof(shm_toc_entry);
-	Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
-	return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
+	Assert(allocated_bytes + CACHELINEALIGN(toc_bytes) <= total_bytes);
+	return total_bytes - (allocated_bytes + CACHELINEALIGN(toc_bytes));
 }
 
 /*
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 1ad68cd..ae26d4b 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -125,7 +125,7 @@ InitShmemAllocation(void)
 		PGSemaphore spinsemas;
 
 		spinsemas = (PGSemaphore) (((char *) shmhdr) + shmhdr->freeoffset);
-		shmhdr->freeoffset += MAXALIGN(SpinlockSemaSize());
+		shmhdr->freeoffset += CACHELINEALIGN(SpinlockSemaSize());
 		SpinlockSemaInit(spinsemas);
 		Assert(shmhdr->freeoffset <= shmhdr->totalsize);
 	}
@@ -136,7 +136,7 @@ InitShmemAllocation(void)
 	 * way, too, for the same reasons as above.
 	 */
 	ShmemLock = (slock_t *) (((char *) shmhdr) + shmhdr->freeoffset);
-	shmhdr->freeoffset += MAXALIGN(sizeof(slock_t));
+	shmhdr->freeoffset += CACHELINEALIGN(sizeof(slock_t));
 	Assert(shmhdr->freeoffset <= shmhdr->totalsize);
 
 	SpinLockInit(ShmemLock);
@@ -179,7 +179,9 @@ ShmemAlloc(Size size)
 	 * boundary.  The calling code will still need to be careful about how it
 	 * uses the allocated space - e.g. by padding each element in an array of
 	 * structures out to a power-of-two size - but without this, even that
-	 * won't be sufficient.
+	 * won't be sufficient.  We take care that freeoffset initially is
+	 * adequately aligned, so aligning the size for all allocations guarantees
+	 * allocations are aligned.
 	 */
 	size = CACHELINEALIGN(size);
 
@@ -189,10 +191,6 @@ ShmemAlloc(Size size)
 
 	newStart = ShmemSegHdr->freeoffset;
 
-	/* extra alignment for large requests, since they are probably buffers */
-	if (size >= BLCKSZ)
-		newStart = BUFFERALIGN(newStart);
-
 	newFree = newStart + size;
 	if (newFree <= ShmemSegHdr->totalsize)
 	{
@@ -209,6 +207,8 @@ ShmemAlloc(Size size)
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("out of shared memory")));
 
+	Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
+
 	return newSpace;
 }
 
@@ -425,6 +425,9 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 	LWLockRelease(ShmemIndexLock);
 
 	Assert(ShmemAddrIsValid(structPtr));
+
+	Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
+
 	return structPtr;
 }
 
diff --git a/src/include/c.h b/src/include/c.h
index 7c57430..fb526b7 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -567,8 +567,6 @@ typedef NameData *Name;
 #define LONGALIGN(LEN)			TYPEALIGN(ALIGNOF_LONG, (LEN))
 #define DOUBLEALIGN(LEN)		TYPEALIGN(ALIGNOF_DOUBLE, (LEN))
 #define MAXALIGN(LEN)			TYPEALIGN(MAXIMUM_ALIGNOF, (LEN))
-/* MAXALIGN covers only built-in types, not buffers */
-#define BUFFERALIGN(LEN)		TYPEALIGN(ALIGNOF_BUFFER, (LEN))
 #define CACHELINEALIGN(LEN)		TYPEALIGN(PG_CACHE_LINE_SIZE, (LEN))
 
 #define TYPEALIGN_DOWN(ALIGNVAL,LEN)  \
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index ef89521..8f59b53 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -107,14 +107,6 @@
 #define BITS_PER_BYTE		8
 
 /*
- * Preferred alignment for disk I/O buffers.  On some CPUs, copies between
- * user space and kernel space are significantly faster if the user buffer
- * is aligned on a larger-than-MAXALIGN boundary.  Ideally this should be
- * a platform-dependent value, but for now we just hard-wire it.
- */
-#define ALIGNOF_BUFFER	32
-
-/*
  * Disable UNIX sockets for certain operating systems.
  */
 #if defined(WIN32)
diff --git a/src/include/storage/shm_toc.h b/src/include/storage/shm_toc.h
index 6822f91..2365b02 100644
--- a/src/include/storage/shm_toc.h
+++ b/src/include/storage/shm_toc.h
@@ -48,7 +48,7 @@ typedef struct
 	((e)->space_for_chunks = 0, (e)->number_of_keys = 0)
 #define shm_toc_estimate_chunk(e, sz) \
 	((e)->space_for_chunks = add_size((e)->space_for_chunks, \
-		BUFFERALIGN((sz))))
+		CACHELINEALIGN((sz))))
 #define shm_toc_estimate_keys(e, cnt) \
 	((e)->number_of_keys = add_size((e)->number_of_keys, (cnt)))
 
-- 
2.7.0.229.g701fa7f

0002-Align-individual-parts-of-an-slru.patchtext/x-patch; charset=us-asciiDownload
From 5a19816c3b4268d4e2279119e1c7c4272bad73c9 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sun, 10 Apr 2016 21:47:14 -0700
Subject: [PATCH 2/3] Align individual parts of an slru.

---
 src/backend/access/transam/slru.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 36a011c..71b6d16 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -146,18 +146,18 @@ SimpleLruShmemSize(int nslots, int nlsns)
 	Size		sz;
 
 	/* we assume nslots isn't so large as to risk overflow */
-	sz = MAXALIGN(sizeof(SlruSharedData));
-	sz += MAXALIGN(nslots * sizeof(char *));	/* page_buffer[] */
-	sz += MAXALIGN(nslots * sizeof(SlruPageStatus));	/* page_status[] */
-	sz += MAXALIGN(nslots * sizeof(bool));		/* page_dirty[] */
-	sz += MAXALIGN(nslots * sizeof(int));		/* page_number[] */
-	sz += MAXALIGN(nslots * sizeof(int));		/* page_lru_count[] */
-	sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
+	sz = CACHELINEALIGN(sizeof(SlruSharedData));
+	sz += CACHELINEALIGN(nslots * sizeof(char *));	/* page_buffer[] */
+	sz += CACHELINEALIGN(nslots * sizeof(SlruPageStatus));	/* page_status[] */
+	sz += CACHELINEALIGN(nslots * sizeof(bool));		/* page_dirty[] */
+	sz += CACHELINEALIGN(nslots * sizeof(int));		/* page_number[] */
+	sz += CACHELINEALIGN(nslots * sizeof(int));		/* page_lru_count[] */
+	sz += CACHELINEALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
 
 	if (nlsns > 0)
-		sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));	/* group_lsn[] */
+		sz += CACHELINEALIGN(nslots * nlsns * sizeof(XLogRecPtr));	/* group_lsn[] */
 
-	return BUFFERALIGN(sz) + BLCKSZ * nslots;
+	return CACHELINEALIGN(sz) + BLCKSZ * nslots;
 }
 
 void
@@ -192,22 +192,22 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		/* shared->latest_page_number will be set later */
 
 		ptr = (char *) shared;
-		offset = MAXALIGN(sizeof(SlruSharedData));
+		offset = CACHELINEALIGN(sizeof(SlruSharedData));
 		shared->page_buffer = (char **) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(char *));
+		offset += CACHELINEALIGN(nslots * sizeof(char *));
 		shared->page_status = (SlruPageStatus *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
+		offset += CACHELINEALIGN(nslots * sizeof(SlruPageStatus));
 		shared->page_dirty = (bool *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(bool));
+		offset += CACHELINEALIGN(nslots * sizeof(bool));
 		shared->page_number = (int *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(int));
+		offset += CACHELINEALIGN(nslots * sizeof(int));
 		shared->page_lru_count = (int *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(int));
+		offset += CACHELINEALIGN(nslots * sizeof(int));
 
 		if (nlsns > 0)
 		{
 			shared->group_lsn = (XLogRecPtr *) (ptr + offset);
-			offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
+			offset += CACHELINEALIGN(nslots * nlsns * sizeof(XLogRecPtr));
 		}
 
 		/* Initialize LWLocks */
@@ -220,7 +220,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		shared->lwlock_tranche.array_base = shared->buffer_locks;
 		shared->lwlock_tranche.array_stride = sizeof(LWLockPadded);
 
-		ptr += BUFFERALIGN(offset);
+		ptr += CACHELINEALIGN(offset);
 		for (slotno = 0; slotno < nslots; slotno++)
 		{
 			LWLockInitialize(&shared->buffer_locks[slotno].lock,
-- 
2.7.0.229.g701fa7f

0003-WIP-Force-PGXACT-stride-to-be-16-instead-of-12-bytes.patchtext/x-patch; charset=us-asciiDownload
From 956044e7af1ee543c2e76d05b3367109db562477 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sun, 10 Apr 2016 21:59:49 -0700
Subject: [PATCH 3/3] WIP: Force PGXACT stride to be 16 instead of 12 bytes.

The previous solution lead to PGXACTs split across cachelines.
---
 src/include/storage/proc.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c3b462c..d2cd71d 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -204,6 +204,8 @@ typedef struct PGXACT
 								 * previously called InCommit */
 
 	uint8		nxids;
+
+	uint32		pad;			/* FIXME: proper padding solution */
 } PGXACT;
 
 /*
-- 
2.7.0.229.g701fa7f

#147Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#143)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 10, 2016 at 2:24 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Sun, Apr 10, 2016 at 11:33 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sun, Apr 10, 2016 at 8:36 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sat, Apr 9, 2016 at 10:49 PM, Andres Freund <andres@anarazel.de>
wrote:

On April 9, 2016 12:43:03 PM PDT, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

How sure are you about these measurements?

I'm pretty sure. I've retried it multiple times by hand before re-running
the script.

Because there really shouldn't be clog lookups once a steady state is
reached...

Hm... I'm also surprised. There shouldn't be clog lookups once hint bits
are set.

I also tried to run perf top during pgbench and get some interesting
results.

Without 5364b357:
5,69% postgres [.] GetSnapshotData
4,47% postgres [.] LWLockAttemptLock
3,81% postgres [.] _bt_compare
3,42% postgres [.] hash_search_with_hash_value
3,08% postgres [.] LWLockRelease
2,49% postgres [.] PinBuffer.isra.3
1,58% postgres [.] AllocSetAlloc
1,17% [kernel] [k] __schedule
1,15% postgres [.] PostgresMain
1,13% libc-2.17.so [.] vfprintf
1,01% libc-2.17.so [.] __memcpy_ssse3_back

With 5364b357:
18,54% postgres [.] GetSnapshotData
3,45% postgres [.] LWLockRelease
3,27% postgres [.] LWLockAttemptLock
3,21% postgres [.] _bt_compare
2,93% postgres [.] hash_search_with_hash_value
2,00% postgres [.] PinBuffer.isra.3
1,32% postgres [.] AllocSetAlloc
1,10% libc-2.17.so [.] vfprintf

Very surprising. It appears that after 5364b357, GetSnapshotData
consumes more time. But I can't see anything depending on clog buffers
in GetSnapshotData code...

There is a related fact presented by Mithun C Y as well [1] which suggests
that Andres's idea of reducing the cost of snapshots shows noticeable gain
after increasing the clog buffers. If you read that thread you will notice
that initially we didn't notice much gain from that idea, but with increased
clog buffers, it started showing noticeable gain. If by any chance you can,
please apply that patch and see the results (the latest patch is at [2]).

[1] -
/messages/by-id/CAD__Ouic1Tvnwqm6Wf6j7Cz1Kk1DQgmy0isC7=OgX+3JtfGk9g@mail.gmail.com

[2] -
/messages/by-id/CAD__OuiwEi5sHe2wwQCK36Ac9QMhvJuqG3CfPN+OFCMb7rdruQ@mail.gmail.com

I took a look at this thread but I still didn't get why the number of clog
buffers affects a read-only benchmark.
Could you please explain it to me in more detail?

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#148Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#146)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Apr 11, 2016 at 8:10 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-10 09:03:37 +0300, Alexander Korotkov wrote:

On Sun, Apr 10, 2016 at 8:36 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sat, Apr 9, 2016 at 10:49 PM, Andres Freund <andres@anarazel.de>
wrote:

On April 9, 2016 12:43:03 PM PDT, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:

There are results with 5364b357 reverted.

Crazy that this has such a negative impact. Amit, can you reproduce
that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
machine as well?

How sure are you about these measurements?

I'm pretty sure. I've retried it multiple times by hand before re-running
the script.

Because there really shouldn't be clog lookups once a steady state is
reached...

Hm... I'm also surprised. There shouldn't be clog lookups once hint bits
are set.

I also tried to run perf top during pgbench and get some interesting
results.

Without 5364b357:
5,69% postgres [.] GetSnapshotData
4,47% postgres [.] LWLockAttemptLock
3,81% postgres [.] _bt_compare
3,42% postgres [.] hash_search_with_hash_value
3,08% postgres [.] LWLockRelease
2,49% postgres [.] PinBuffer.isra.3
1,58% postgres [.] AllocSetAlloc
1,17% [kernel] [k] __schedule
1,15% postgres [.] PostgresMain
1,13% libc-2.17.so [.] vfprintf
1,01% libc-2.17.so [.] __memcpy_ssse3_back

With 5364b357:
18,54% postgres [.] GetSnapshotData
3,45% postgres [.] LWLockRelease
3,27% postgres [.] LWLockAttemptLock
3,21% postgres [.] _bt_compare
2,93% postgres [.] hash_search_with_hash_value
2,00% postgres [.] PinBuffer.isra.3
1,32% postgres [.] AllocSetAlloc
1,10% libc-2.17.so [.] vfprintf

Very surprising. It appears that after 5364b357, GetSnapshotData consumes
more time. But I can't see anything depending on clog buffers
in GetSnapshotData code...

Could you retry after applying the attached series of patches?

Yes, I will try with these patches and snapshot too old reverted.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#149Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#148)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Apr 11, 2016 at 5:04 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Mon, Apr 11, 2016 at 8:10 AM, Andres Freund <andres@anarazel.de> wrote:

Could you retry after applying the attached series of patches?

Yes, I will try with these patches and snapshot too old reverted.

I've run the same benchmark with 279d86af and 848ef42b reverted. I've
tested all 3 patches from you applied and, for comparison, the 3 patches +
clog buffers reverted back to 32.

clients patches patches + clog_32
1 12594 12556
2 26705 26258
4 50985 53254
8 103234 104416
10 135321 130893
20 268675 267648
30 370437 409710
40 486512 482382
50 539910 525667
60 616401 672230
70 667864 660853
80 924606 737768
90 1217435 799581
100 1326054 863066
110 1446380 980206
120 1484920 1000963
130 1512440 1058852
140 1536181 1088958
150 1504750 1134354
160 1461513 1132173
170 1453943 1158656
180 1426288 1120511

I can hardly understand how clog buffers influence a read-only benchmark. It
is even harder for me to understand why the influence of clog buffers changes
direction after applying your patches. But the results are as shown above.
And I've rechecked some values manually to verify that there is no error. I
would be very thankful for any explanation.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#150Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#149)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-11 22:08:15 +0300, Alexander Korotkov wrote:

On Mon, Apr 11, 2016 at 5:04 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Mon, Apr 11, 2016 at 8:10 AM, Andres Freund <andres@anarazel.de> wrote:

Could you retry after applying the attached series of patches?

Yes, I will try with these patches and snapshot too old reverted.

I've run the same benchmark with 279d86af and 848ef42b reverted. I've
tested all 3 patches from you applied and, for comparison, the 3 patches +
clog buffers reverted back to 32.

clients patches patches + clog_32
1 12594 12556
2 26705 26258
4 50985 53254
8 103234 104416
10 135321 130893
20 268675 267648
30 370437 409710
40 486512 482382
50 539910 525667
60 616401 672230
70 667864 660853
80 924606 737768
90 1217435 799581
100 1326054 863066
110 1446380 980206
120 1484920 1000963
130 1512440 1058852
140 1536181 1088958
150 1504750 1134354
160 1461513 1132173
170 1453943 1158656
180 1426288 1120511

I can hardly understand how clog buffers influence a read-only
benchmark.

My guess is that the number of buffers influences some alignment,
causing a lot of false sharing or something. I.e. the number of clog
buffers itself doesn't play a role; it's just a question of how it
changes the memory layout.
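
To make the false-sharing guess concrete: below is a minimal, self-contained
C sketch (not from this thread; the struct, names, and iteration count are
invented for illustration) in which two threads bump adjacent counters. With
the padding, each counter sits on its own assumed 64-byte line; deleting the
padding puts both counters on one line and the same loop runs markedly slower
as the line ping-pongs between cores:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE_SIZE 64

/* Each counter is padded out to its own (assumed 64-byte) cache line;
 * remove "pad" to put both counters on one line and observe the
 * false-sharing slowdown. */
typedef struct
{
	volatile uint64_t count;
	char		pad[CACHE_LINE_SIZE - sizeof(uint64_t)];
} PaddedCounter;

static PaddedCounter counters[2];

static void *
worker(void *arg)
{
	PaddedCounter *c = (PaddedCounter *) arg;

	for (uint64_t i = 0; i < 100000000; i++)
		c->count++;
	return NULL;
}

int
main(void)
{
	pthread_t	threads[2];

	for (int i = 0; i < 2; i++)
		pthread_create(&threads[i], NULL, worker, &counters[i]);
	for (int i = 0; i < 2; i++)
		pthread_join(threads[i], NULL);
	printf("%llu %llu\n",
		   (unsigned long long) counters[0].count,
		   (unsigned long long) counters[1].count);
	return 0;
}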

It is even harder for me to understand why the influence of clog buffers
changes direction after applying your patches. But the results are as shown
above. And I've rechecked some values manually to verify that there is no
error. I would be very thankful for any explanation.

Hm. Possibly these patches influenced alignment, but didn't make things
sufficiently stable to guarantee that we're always correctly aligned,
thus the 32bit case now regresses.

Any chance that I could run some tests on that machine myself? It's very
hard to investigate that kind of issue without access; the only thing I
otherwise can do is lob patches at you, till we find the relevant
memory.

If not, one of the things to do is to use perf to compare where cache
misses are happening between the fast and the slow case.

Greetings,

Andres Freund


#151Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#150)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-11 12:17:20 -0700, Andres Freund wrote:

On 2016-04-11 22:08:15 +0300, Alexander Korotkov wrote:

On Mon, Apr 11, 2016 at 5:04 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Mon, Apr 11, 2016 at 8:10 AM, Andres Freund <andres@anarazel.de> wrote:

Could you retry after applying the attached series of patches?

Yes, I will try with these patches and snapshot too old reverted.

I've run the same benchmark with 279d86af and 848ef42b reverted. I've
tested all 3 patches from you applied and, for comparison, the 3 patches +
clog buffers reverted back to 32.

clients patches patches + clog_32
1 12594 12556
2 26705 26258
4 50985 53254
8 103234 104416
10 135321 130893
20 268675 267648
30 370437 409710
40 486512 482382
50 539910 525667
60 616401 672230
70 667864 660853
80 924606 737768
90 1217435 799581
100 1326054 863066
110 1446380 980206
120 1484920 1000963
130 1512440 1058852
140 1536181 1088958
150 1504750 1134354
160 1461513 1132173
170 1453943 1158656
180 1426288 1120511

Any chance that I could run some tests on that machine myself? It's very
hard to investigate that kind of issue without access; the only thing I
otherwise can do is lob patches at you, till we find the relevant
memory.

I did get access to the machine (thanks!). My testing shows that
performance is sensitive to various parameters influencing memory
allocation. E.g. twiddling with max_connections changes
performance. With max_connections=400 and the previous patches applied I
get ~1220000 tps, with 402 ~1620000 tps. This sorta confirms that we're
dealing with an alignment/sharing related issue.

Padding PGXACT to a full cache-line seems to take care of the largest
part of the performance irregularity. I looked at perf profiles and saw
that most cache misses stem from there, and that the percentage (not
absolute amount!) changes between fast/slow settings.

To me it makes intuitive sense why you'd want PGXACTs to be on separate
cachelines - they're constantly dirtied via SnapshotResetXmin(). Indeed
making it immediately return propels performance up to 1720000, without
other changes. Additionally cacheline-padding PGXACT speeds things up to
1750000 tps.

But I'm unclear why the magnitude of the effect depends on other
allocations. With the previously posted patches allPgXact is always
cacheline-aligned.
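
For readers following along, the padding being discussed can be spelled with
the union trick PostgreSQL already uses for BufferDescPadded and
LWLockPadded; the sketch below is only an illustration under an assumed
64-byte line size (PGXACTPadded is a made-up name), not the patch that was
eventually tested:

#include "storage/proc.h"		/* for PGXACT */

#define PGXACT_PADDED_SIZE 64	/* assumed cache-line size */

/* Sketch only: pad each PGXACT to a full cache line, mirroring the
 * BufferDescPadded / LWLockPadded convention; requires
 * sizeof(PGXACT) <= PGXACT_PADDED_SIZE. */
typedef union PGXACTPadded
{
	PGXACT		pgxact;
	char		pad[PGXACT_PADDED_SIZE];
} PGXACTPadded;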

Greetings,

Andres Freund


#152Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#151)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-11 14:40:29 -0700, Andres Freund wrote:

On 2016-04-11 12:17:20 -0700, Andres Freund wrote:

On 2016-04-11 22:08:15 +0300, Alexander Korotkov wrote:

On Mon, Apr 11, 2016 at 5:04 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Mon, Apr 11, 2016 at 8:10 AM, Andres Freund <andres@anarazel.de> wrote:

Could you retry after applying the attached series of patches?

Yes, I will try with these patches and snapshot too old reverted.

I've run the same benchmark with 279d86af and 848ef42b reverted. I've
tested all 3 patches from you applied and, for comparison, the 3 patches +
clog buffers reverted back to 32.

clients patches patches + clog_32
1 12594 12556
2 26705 26258
4 50985 53254
8 103234 104416
10 135321 130893
20 268675 267648
30 370437 409710
40 486512 482382
50 539910 525667
60 616401 672230
70 667864 660853
80 924606 737768
90 1217435 799581
100 1326054 863066
110 1446380 980206
120 1484920 1000963
130 1512440 1058852
140 1536181 1088958
150 1504750 1134354
160 1461513 1132173
170 1453943 1158656
180 1426288 1120511

Any chance that I could run some tests on that machine myself? It's very
hard to investigate that kind of issue without access; the only thing I
otherwise can do is lob patches at you, till we find the relevant
memory.

I did get access to the machine (thanks!). My testing shows that
performance is sensitive to various parameters influencing memory
allocation. E.g. twiddling with max_connections changes
performance. With max_connections=400 and the previous patches applied I
get ~1220000 tps, with 402 ~1620000 tps. This sorta confirms that we're
dealing with an alignment/sharing related issue.

Padding PGXACT to a full cache-line seems to take care of the largest
part of the performance irregularity. I looked at perf profiles and saw
that most cache misses stem from there, and that the percentage (not
absolute amount!) changes between fast/slow settings.

To me it makes intuitive sense why you'd want PGXACTs to be on separate
cachelines - they're constantly dirtied via SnapshotResetXmin(). Indeed
making it immediately return propels performance up to 1720000, without
other changes. Additionally cacheline-padding PGXACT speeds things up to
1750000 tps.

But I'm unclear why the magnitude of the effect depends on other
allocations. With the previously posted patches allPgXact is always
cacheline-aligned.

Oh, one more thing: The volatile on PGXACT in GetSnapshotData() costs us
about 100k tps on that machine; without, afaics, any point but to force
pgxact->xmin to only be loaded once (which a *((volatile
TransactionId *) &pgxact->xmin) does just as well).
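
Spelled out, the suggested replacement is a targeted volatile load rather
than a volatile-qualified struct pointer; a minimal sketch (load_xmin_once is
a hypothetical helper written for illustration, not code from the tree):

/* Sketch of the cheaper idiom: force a single load of xmin without
 * making every access through the struct pointer volatile. */
static TransactionId
load_xmin_once(PGXACT *pgxact)
{
	return *((volatile TransactionId *) &pgxact->xmin);
}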

Greetings,

Andres Freund


#153Andres Freund
andres@anarazel.de
In reply to: Andres Freund (#151)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-11 14:40:29 -0700, Andres Freund wrote:

On 2016-04-11 12:17:20 -0700, Andres Freund wrote:
I did get access to the machine (thanks!). My testing shows that
performance is sensitive to various parameters influencing memory
allocation. E.g. twiddling with max_connections changes
performance. With max_connections=400 and the previous patches applied I
get ~1220000 tps, with 402 ~1620000 tps. This sorta confirms that we're
dealing with an alignment/sharing related issue.

Padding PGXACT to a full cache-line seems to take care of the largest
part of the performance irregularity. I looked at perf profiles and saw
that most cache misses stem from there, and that the percentage (not
absolute amount!) changes between fast/slow settings.

To me it makes intuitive sense why you'd want PGXACTs to be on separate
cachelines - they're constantly dirtied via SnapshotResetXmin(). Indeed
making it immediately return propels performance up to 1720000, without
other changes. Additionally cacheline-padding PGXACT speeds things up to
1750000 tps.

But I'm unclear why the magnitude of the effect depends on other
allocations. With the previously posted patches allPgXact is always
cacheline-aligned.

I've spent a considerable amount of time experimenting around this. The
alignment of allPgXact does *not* appear to play a significant role; rather,
it appears to be the "distance" between the allPgXact and pgprocno
arrays.

Alexander, could you post dmidecode output, and install numactl &
numastat on the machine? I wonder if the box has cluster-on-die
activated or not. Do I see correctly that this is a system that could
potentially have 8 sockets, but actually has only four? Because I see
physical id : 3 in /proc/cpuinfo only going up to three (from zero),
not 7? And there's only 144 processors, while each E7-8890 v3 should
have 36 threads.

Greetings,

Andres Freund


#154Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#147)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Apr 11, 2016 at 7:33 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sun, Apr 10, 2016 at 2:24 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

I also tried to run perf top during pgbench and get some interesting
results.

Without 5364b357:
5,69% postgres [.] GetSnapshotData
4,47% postgres [.] LWLockAttemptLock
3,81% postgres [.] _bt_compare
3,42% postgres [.] hash_search_with_hash_value
3,08% postgres [.] LWLockRelease
2,49% postgres [.] PinBuffer.isra.3
1,58% postgres [.] AllocSetAlloc
1,17% [kernel] [k] __schedule
1,15% postgres [.] PostgresMain
1,13% libc-2.17.so [.] vfprintf
1,01% libc-2.17.so [.] __memcpy_ssse3_back

With 5364b357:
18,54% postgres [.] GetSnapshotData
3,45% postgres [.] LWLockRelease
3,27% postgres [.] LWLockAttemptLock
3,21% postgres [.] _bt_compare
2,93% postgres [.] hash_search_with_hash_value
2,00% postgres [.] PinBuffer.isra.3
1,32% postgres [.] AllocSetAlloc
1,10% libc-2.17.so [.] vfprintf

Very surprising. It appears that after 5364b357, GetSnapshotData
consumes more time. But I can't see anything depending on clog buffers
in GetSnapshotData code...

There is a related fact presented by Mithun C Y as well [1] which suggests
that Andres's idea of reducing the cost of snapshots shows noticeable gain
after increasing the clog buffers. If you read that thread you will notice
that initially we didn't notice much gain from that idea, but with increased
clog buffers, it started showing noticeable gain. If by any chance you can,
please apply that patch and see the results (the latest patch is at [2]).

[1] -
/messages/by-id/CAD__Ouic1Tvnwqm6Wf6j7Cz1Kk1DQgmy0isC7=OgX+3JtfGk9g@mail.gmail.com

[2] -
/messages/by-id/CAD__OuiwEi5sHe2wwQCK36Ac9QMhvJuqG3CfPN+OFCMb7rdruQ@mail.gmail.com

I took a look at this thread but I still didn't get why the number of clog
buffers affects a read-only benchmark.
Could you please explain it to me in more detail?

As already pointed out by Andres, this is mainly due to shared memory
alignment issues. We have observed that changing some shared memory
arrangement (structures) sometimes causes huge differences in
performance. I guess that is the reason why, with the cache-the-snapshot
patch, I am seeing the performance get restored (mainly because it changes
shared memory structures). I think the right way to fix this is to find
which shared structure(s) need padding, so that we don't see such
fluctuations every time we change something in shared memory.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#155Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#153)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 12, 2016 at 5:39 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-11 14:40:29 -0700, Andres Freund wrote:

On 2016-04-11 12:17:20 -0700, Andres Freund wrote:
I did get access to the machine (thanks!). My testing shows that
performance is sensitive to various parameters influencing memory
allocation. E.g. twiddling with max_connections changes
performance. With max_connections=400 and the previous patches applied I
get ~1220000 tps, with 402 ~1620000 tps. This sorta confirms that we're
dealing with an alignment/sharing related issue.

Padding PGXACT to a full cache-line seems to take care of the largest
part of the performance irregularity. I looked at perf profiles and saw
that most cache misses stem from there, and that the percentage (not
absolute amount!) changes between fast/slow settings.

To me it makes intuitive sense why you'd want PGXACTs to be on separate
cachelines - they're constantly dirtied via SnapshotResetXmin(). Indeed
making it immediately return propels performance up to 1720000, without
other changes. Additionally cacheline-padding PGXACT speeds things up to
1750000 tps.

But I'm unclear why the magnitude of the effect depends on other
allocations. With the previously posted patches allPgXact is always
cacheline-aligned.

I've spent a considerable amount of time experimenting around this. The
alignment of allPgXact does *not* appear to play a significant role; rather,
it appears to be the "distance" between the allPgXact and pgprocno
arrays.

Alexander, could you post dmidecode output, and install numactl &
numastat on the machine? I wonder if the box has cluster-on-die
activated or not.

Dmidecode output is in the attachment. Numactl & numastat are installed.

Do I see correctly that this is a system that could
potentially have 8 sockets, but actually has only four? Because I see
physical id : 3 in /proc/cpuinfo only going up to three (from zero),
not 7? And there's only 144 processors, while each E7-8890 v3 should
have 36 threads.

There are definitely 4 sockets in use. I'm not sure about the potential
count, though.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

dmidecode.txttext/plain; charset=US-ASCII; name=dmidecode.txtDownload
#156Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#151)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 12, 2016 at 12:40 AM, Andres Freund <andres@anarazel.de> wrote:

I did get access to the machine (thanks!). My testing shows that
performance is sensitive to various parameters influencing memory
allocation. E.g. twiddling with max_connections changes
performance. With max_connections=400 and the previous patches applied I
get ~1220000 tps, with 402 ~1620000 tps. This sorta confirms that we're
dealing with an alignment/sharing related issue.

Padding PGXACT to a full cache-line seems to take care of the largest
part of the performance irregularity. I looked at perf profiles and saw
that most cache misses stem from there, and that the percentage (not
absolute amount!) changes between fast/slow settings.

To me it makes intuitive sense why you'd want PGXACTs to be on separate
cachelines - they're constantly dirtied via SnapshotResetXmin(). Indeed
making it immediately return propels performance up to 1720000, without
other changes. Additionally cacheline-padding PGXACT speeds things up to
1750000 tps.

It seems like padding PGXACT to a full cache-line is a great improvement.
There are not so many PGXACTs that we need to care about the bytes wasted on
padding. But could it have another negative side-effect?

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#157Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#156)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 12, 2016 at 3:48 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Tue, Apr 12, 2016 at 12:40 AM, Andres Freund <andres@anarazel.de>
wrote:

I did get access to the machine (thanks!). My testing shows that
performance is sensitive to various parameters influencing memory
allocation. E.g. twiddling with max_connections changes
performance. With max_connections=400 and the previous patches applied I
get ~1220000 tps, with 402 ~1620000 tps. This sorta confirms that we're
dealing with an alignment/sharing related issue.

Padding PGXACT to a full cache-line seems to take care of the largest
part of the performance irregularity. I looked at perf profiles and saw
that most cache misses stem from there, and that the percentage (not
absolute amount!) changes between fast/slow settings.

To me it makes intuitive sense why you'd want PGXACTs to be on separate
cachelines - they're constantly dirtied via SnapshotResetXmin(). Indeed
making it immediately return propels performance up to 1720000, without
other changes. Additionally cacheline-padding PGXACT speeds things up to
1750000 tps.

It seems like padding PGXACT to a full cache-line is a great improvement.
There are not so many PGXACTs that we need to care about the bytes wasted on
padding.

Yes, it seems generally a good idea, but I am not sure if it is a complete
fix for the variation in performance we are seeing when we change shared
memory structures. Andres suggested to me on IM to take performance data on
an x86 m/c by padding PGXACT, and the data for the same is below:

median of 3, 5-min runs

Client_Count/Patch_ver 8 64 128
HEAD 59708 329560 173655
PATCH 61480 379798 157580

Here, at 128 client-count the performance with the patch still seems to have
variation. The highest tps with the patch (170363) is close to HEAD (175718).
This could be run-to-run variation, but I think it indicates that there are
more places where we might need such padding, or maybe optimize them so
that they are aligned.

I can do some more experiments on similar lines, but I am out on vacation
and might not be able to access the m/c for 3-4 days.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

Attachments:

pad_pgxact_v1.patchapplication/octet-stream; name=pad_pgxact_v1.patchDownload
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c3b462c..0c29037 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -204,6 +204,7 @@ typedef struct PGXACT
 								 * previously called InCommit */
 
 	uint8		nxids;
+	char		pad[52];
 } PGXACT;
 
 /*
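
The 52 bytes here presumably round the 12-byte PGXACT (the stride discussed
in patch 0003 upthread) up to a 64-byte x86 cache line. A compile-time check
of that assumption could look like this sketch:

/* Sketch: check the assumption behind pad[52] at compile time -- the
 * 12 bytes of PGXACT fields plus 52 bytes of padding should fill one
 * assumed 64-byte cache line. */
StaticAssertStmt(sizeof(PGXACT) == 64,
				 "PGXACT expected to occupy exactly one cache line");
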
#158Andres Freund
andres@anarazel.de
In reply to: Amit Kapila (#157)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-12 19:42:11 +0530, Amit Kapila wrote:

Yes, it seems generally a good idea, but I am not sure if it is a complete
fix for the variation in performance we are seeing when we change shared
memory structures.

I didn't suspect it would be. More whether it'd be beneficial
performance wise. FWIW, I haven't seen the variations you're observing
on any machine so far.

I think at high concurrency levels we're quite likely to interact with
the exact strategy used for the last-level/L3 cache. pgprocno,
allPgXact, BufferDescs are all arrays with a regular stride that we
access across several NUMA nodes, at a very high rate. At some point
that makes it very likely that cache conflicts occur in set-associative
caches.
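
As a back-of-the-envelope illustration (the cache geometry and stride below
are assumptions for the sketch, not measurements of these machines): with
64-byte lines and 2048 sets, elements spaced 8192 bytes apart revisit the
same 16 sets over and over, so a regularly strided hot array can exhaust the
associativity of a few sets long before the cache is full:

#include <stdio.h>

#define LINE_SIZE 64			/* assumed cache-line size */
#define N_SETS	  2048			/* assumed number of sets in a slice */

int
main(void)
{
	size_t		stride = 8192;	/* assumed distance between hot elements */

	/* set index repeats with period 16 for this stride/geometry */
	for (size_t i = 0; i < 20; i++)
		printf("element %zu -> set %zu\n",
			   i, (i * stride / LINE_SIZE) % N_SETS);
	return 0;
}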

Andres suggested to me on IM to take performance data on an x86 m/c
by padding PGXACT, and the data for the same is below:

median of 3, 5-min runs

Thanks for running these.

I presume these were *without* pg_prewarming the contents? It'd be
interesting to do the same with prewarming; aligning these structures
should be unrelated to the separate issue of bufferdesc order having a
rather massive performance effect.

Greetings,

Andres Freund


#159Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#157)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 12, 2016 at 5:12 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Tue, Apr 12, 2016 at 3:48 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Tue, Apr 12, 2016 at 12:40 AM, Andres Freund <andres@anarazel.de>
wrote:

I did get access to the machine (thanks!). My testing shows that
performance is sensitive to various parameters influencing memory
allocation. E.g. twiddling with max_connections changes
performance. With max_connections=400 and the previous patches applied I
get ~1220000 tps, with 402 ~1620000 tps. This sorta confirms that we're
dealing with an alignment/sharing related issue.

Padding PGXACT to a full cache-line seems to take care of the largest
part of the performance irregularity. I looked at perf profiles and saw
that most cache misses stem from there, and that the percentage (not
absolute amount!) changes between fast/slow settings.

To me it makes intuitive sense why you'd want PGXACTs to be on separate
cachelines - they're constantly dirtied via SnapshotResetXmin(). Indeed
making it immediately return propels performance up to 1720000, without
other changes. Additionally cacheline-padding PGXACT speeds things up to
1750000 tps.

It seems like padding PGXACT to a full cache-line is a great
improvement. There are not so many PGXACTs that we need to care about the
bytes wasted on padding.

Yes, it seems generally a good idea, but I am not sure if it is a complete
fix for the variation in performance we are seeing when we change shared
memory structures. Andres suggested to me on IM to take performance data
on an x86 m/c by padding PGXACT, and the data for the same is below:

median of 3, 5-min runs

Client_Count/Patch_ver 8 64 128
HEAD 59708 329560 173655
PATCH 61480 379798 157580

Here, at 128 client-count the performance with the patch still seems to have
variation. The highest tps with the patch (170363) is close to HEAD (175718).
This could be run-to-run variation, but I think it indicates that there are
more places where we might need such padding, or maybe optimize them so
that they are aligned.

I can do some more experiments on similar lines, but I am out on vacation
and might not be able to access the m/c for 3-4 days.

Could you share details of the hardware you used? I could try to find
something similar to reproduce this.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#160Amit Kapila
amit.kapila16@gmail.com
In reply to: Andres Freund (#158)
Re: Move PinBuffer and UnpinBuffer to atomics

On Tue, Apr 12, 2016 at 9:32 PM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-12 19:42:11 +0530, Amit Kapila wrote:

Andres suggested to me on IM to take performance data on an x86 m/c
by padding PGXACT, and the data for the same is below:

median of 3, 5-min runs

Thanks for running these.

I presume these were *without* pg_prewarming the contents?

Yes.

It'd be
interesting to do the same with prewarming;

What do you want to see by prewarming? Will it have safe effect, if the tests
are run for 10 or 15 mins rather than 5 mins?

Could you share details of the hardware you used? I could try to find
something similar to reproduce this.

Processor related information (using lscpu)
----------------------------------------------------------------
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 128
On-line CPU(s) list: 0-127
Thread(s) per core: 2
Core(s) per socket: 8
Socket(s): 8
NUMA node(s): 8
Vendor ID: GenuineIntel
CPU family: 6
Model: 47
Model name: Intel(R) Xeon(R) CPU E7- 8830 @ 2.13GHz
Stepping: 2
CPU MHz: 1064.000
BogoMIPS: 4266.62
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 24576K
NUMA node0 CPU(s): 0,65-71,96-103
NUMA node1 CPU(s): 72-79,104-111
NUMA node2 CPU(s): 80-87,112-119
NUMA node3 CPU(s): 88-95,120-127
NUMA node4 CPU(s): 1-8,33-40
NUMA node5 CPU(s): 9-16,41-48
NUMA node6 CPU(s): 17-24,49-56
NUMA node7 CPU(s): 25-32,57-64

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#161Andres Freund
andres@anarazel.de
In reply to: Amit Kapila (#160)
Re: Move PinBuffer and UnpinBuffer to atomics

On 2016-04-14 07:59:07 +0530, Amit Kapila wrote:

What do you want to see by prewarming?

Prewarming appears to greatly reduce the per-run variance on that
machine, making it a lot easier to get meaningful results. Thus it'd
make it easier to compare pre/post padding numbers.

Will it have safe effect, if the tests are run for 10 or 15 mins
rather than 5 mins?

s/safe/same/? If so, no, it doesn't look that way. The order of buffers
appears to play a large role, and it won't change in a memory-resident
workload over one run.

Greetings,

Andres Freund


#162Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andres Freund (#161)
1 attachment(s)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Apr 14, 2016 at 5:35 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-14 07:59:07 +0530, Amit Kapila wrote:

What do you want to see by prewarming?

Prewarming appears to greatly reduce the per-run variance on that
machine, making it a lot easier to get meaningful results. Thus it'd
make it easier to compare pre/post padding numbers.

Will it have safe effect, if the tests are run for 10 or 15 mins
rather than 5 mins?

s/safe/same/? If so, no, it doesn't look that way. The order of buffers
appears to play a large role, and it won't change in a memory-resident
workload over one run.

I've tried to run the read-only benchmark of pad_pgxact_v1.patch on a 4x18
Intel machine. The results are as follows.

clients no-pad pad
1 12997 13381
2 26728 25645
4 52539 51738
8 103785 102337
10 132606 126174
20 255844 252143
30 371359 357629
40 450429 443053
50 497705 488250
60 564385 564877
70 718860 725149
80 934170 931218
90 1152961 1146498
100 1240055 1300369
110 1207727 1375706
120 1167681 1417032
130 1120891 1448408
140 1085904 1449027
150 1022160 1437545
160 976487 1441720
170 978120 1435848
180 953843 1414925

The snapshot_too_old patch was reverted in both cases.
At a high number of clients, padding gives a very significant benefit.
However, at a low number of clients there is a small regression. I think this
regression could be caused by the alignment of other data structures.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

padding.pngimage/png; name=padding.pngDownload
#163Amit Kapila
amit.kapila16@gmail.com
In reply to: Andres Freund (#161)
Re: Move PinBuffer and UnpinBuffer to atomics

On Thu, Apr 14, 2016 at 8:05 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-14 07:59:07 +0530, Amit Kapila wrote:

What do you want to see by prewarming?

Prewarming appears to greatly reduce the per-run variance on that
machine, making it a lot easier to get meaningful results.

I think you are referring to the tests done by Robert on the power-8 m/c, but
the performance results I have reported were on Intel x86. In the last two
days, I have spent quite some effort on the performance testing of this patch
with pre-warming, by using the same query [1] as used by Robert in his
tests. The tests are done such that we first start the server, pre-warm the
relations, run the read-only test, stop the server, and then repeat this for
the next test. I have observed that the variance in run-to-run performance
still occurs, especially at higher client counts (128). Below are results for
the 128 client count, both when the tests ran first with the patch and then
with HEAD, and vice versa.

Test-1
----------
client count - 128 (basically -c 128 -j 128)

first tests ran with patch and then with HEAD

Patch_ver/Runs HEAD (commit -70715e6a) Patch
Run-1 156748 174640
Run-2 151352 150115
Run-3 177940 165269

Test-2
----------
client count - 128 (basically -c 128 -j 128)

first tests ran with HEAD and then with patch

Patch_ver/Runs HEAD (commit -70715e6a) Patch
Run-1 173063 151282
Run-2 173187 140676
Run-3 177046 166726

I think this patch (padding pgxact) certainly is beneficial, as reported
upthread. At a very high client count some variation in performance is
observed with and without the patch, but I feel that in general it is a win.

[1]: psql -c "select sum(x.x) from (select pg_prewarm(oid) as x from
pg_class where relkind in ('i', 'r') order by oid) x;"

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#164Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#162)
Re: Move PinBuffer and UnpinBuffer to atomics

On Fri, Apr 15, 2016 at 1:59 AM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Thu, Apr 14, 2016 at 5:35 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-14 07:59:07 +0530, Amit Kapila wrote:

What do you want to see by prewarming?

Prewarming appears to greatly reduce the per-run variance on that
machine, making it a lot easier to get meaningful results. Thus it'd
make it easier to compare pre/post padding numbers.

Will it have safe effect, if the tests are run for 10 or 15 mins
rather than 5 mins?

s/safe/same/? If so, no, it doesn't look that way. The order of buffers
appears to play a large role, and it won't change in a memory-resident
workload over one run.

I've tried to run the read-only benchmark of pad_pgxact_v1.patch on a 4x18
Intel machine. The results are as follows.

clients no-pad pad
1 12997 13381
2 26728 25645
4 52539 51738
8 103785 102337
10 132606 126174
20 255844 252143
30 371359 357629
40 450429 443053
50 497705 488250
60 564385 564877
70 718860 725149
80 934170 931218
90 1152961 1146498
100 1240055 1300369
110 1207727 1375706
120 1167681 1417032
130 1120891 1448408
140 1085904 1449027
150 1022160 1437545
160 976487 1441720
170 978120 1435848
180 953843 1414925

The snapshot_too_old patch was reverted in both cases.
At a high number of clients, padding gives a very significant benefit.

These results indicate that the patch is a win. Are these results the median
of 3 runs or single-run data? By the way, can you share the output of the
lscpu command on this m/c?

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#165Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#163)
Re: Move PinBuffer and UnpinBuffer to atomics

On Sun, Apr 17, 2016 at 7:32 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Thu, Apr 14, 2016 at 8:05 AM, Andres Freund <andres@anarazel.de> wrote:

On 2016-04-14 07:59:07 +0530, Amit Kapila wrote:

What do you want to see by prewarming?

Prewarming appears to greatly reduce the per-run variance on that
machine, making it a lot easier to get meaningful results.

I think you are referring to the tests done by Robert on the power-8 m/c, but
the performance results I have reported were on Intel x86. In the last two
days, I have spent quite some effort on the performance testing of this patch
with pre-warming, by using the same query [1] as used by Robert in his tests.
The tests are done such that we first start the server, pre-warm the
relations, run the read-only test, stop the server, and then repeat this for
the next test.

What did you include in a single run: a test of a single version (HEAD or
Patch) or a test of both?

I have observed that the variance in run-to-run performance still occurs,
especially at higher client counts (128). Below are results for the 128
client count, both when the tests ran first with the patch and then with
HEAD, and vice versa.

Test-1
----------
client count - 128 (basically -c 128 -j 128)

first tests ran with patch and then with HEAD

Patch_ver/Runs HEAD (commit -70715e6a) Patch
Run-1 156748 174640
Run-2 151352 150115
Run-3 177940 165269

Test-2
----------
client count - 128 (basically -c 128 -j 128)

first tests ran with HEAD and then with patch

Patch_ver/Runs HEAD (commit -70715e6a) Patch
Run-1 173063 151282
Run-2 173187 140676
Run-3 177046 166726

I think this patch (padding pgxact) certainly is beneficial, as reported
upthread. At a very high client count some variation in performance is
observed with and without the patch, but I feel that in general it is a win.

So, what hardware did you use for these tests: power-8 or x86? How long was
a single run?
Per-run variation seems quite high. It also seems to depend on which
version runs first. But that could be a coincidence.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#166Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#165)
Re: Move PinBuffer and UnpinBuffer to atomics

On Mon, Apr 25, 2016 at 6:04 PM, Alexander Korotkov <
a.korotkov@postgrespro.ru> wrote:

On Sun, Apr 17, 2016 at 7:32 PM, Amit Kapila <amit.kapila16@gmail.com>
wrote:

On Thu, Apr 14, 2016 at 8:05 AM, Andres Freund <andres@anarazel.de>
wrote:

On 2016-04-14 07:59:07 +0530, Amit Kapila wrote:

What do you want to see by prewarming?

Prewarming appears to greatly reduce the per-run variance on that
machine, making it a lot easier to get meaningful results.

I think you are referring to the tests done by Robert on the power-8 m/c, but
the performance results I have reported were on Intel x86. In the last two
days, I have spent quite some effort on the performance testing of this patch
with pre-warming, by using the same query [1] as used by Robert in his tests.
The tests are done such that we first start the server, pre-warm the
relations, run the read-only test, stop the server, and then repeat this for
the next test.

What did you include in a single run: a test of a single version (HEAD or
Patch) or a test of both?

A single run includes a single version (either HEAD or Patch).

I have observed that the variance in run-to-run performance still occurs,
especially at higher client counts (128). Below are results for the 128
client count, both when the tests ran first with the patch and then with
HEAD, and vice versa.

Test-1
----------
client count - 128 (basically -c 128 -j 128)

first tests ran with patch and then with HEAD

Patch_ver/Runs HEAD (commit -70715e6a) Patch
Run-1 156748 174640
Run-2 151352 150115
Run-3 177940 165269

Test-2
----------
client count - 128 (basically -c 128 -j 128)

first tests ran with HEAD and then with patch

Patch_ver/Runs HEAD (commit -70715e6a) Patch
Run-1 173063 151282
Run-2 173187 140676
Run-3 177046 166726

I think this patch (padding pgxact) certainly is beneficial, as reported
upthread. At a very high client count some variation in performance is
observed with and without the patch, but I feel that in general it is a win.

So, what hardware did you use for these tests: power-8 or x86?

x86

How long was a single run?

5 minutes.

With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#167osdba
mailtch@163.com
In reply to: Dilip Kumar (#72)
Document "59.2. Built-in Operator Classes" have a clerical error?

Hi all:

In the document, in "Table 59-1. Built-in GiST Operator Classes":

"range_ops any range type && &> &< >> << <@ -|- = @> @>" contains a double
"@>".

Should it be "<@ @>"?

#168David G. Johnston
david.g.johnston@gmail.com
In reply to: osdba (#167)
Re: Document "59.2. Built-in Operator Classes" has a clerical error?

On Sun, Aug 2, 2020 at 8:17 PM osdba <mailtch@163.com> wrote:

Hi all:

In the document, in "Table 59-1. Built-in GiST Operator Classes":

"range_ops any range type && &> &< >> << <@ -|- = @> @>" contains a double
"@>".

Should it be "<@ @>"?

It helps to reference the current version of the page (or provide a URL
link), as that section seems to have migrated to Chapter 64 - though it is
unchanged even on the main development branch.

The table itself is extremely difficult to read: it would be more easily
readable if the font were monospaced, but it's not.

I'm reasonably confident that the equal sign is part of the second-to-last
operator while the lone @> is the final operator. Mostly I say this
because GiST doesn't do straight equality so a lone equal operator isn't
valid.

David J.

#169osdba
mailtch@163.com
In reply to: David G. Johnston (#168)
1 attachment(s)
Re: Re: Document "59.2. Built-in Operator Classes" has a clerical error?

You can see the URLs: https://www.postgresql.org/docs/12/gist-builtin-opclasses.html
https://www.postgresql.org/docs/devel/gist-builtin-opclasses.html

You can see the screen snapshot:


Attachments:

snapshot.pngimage/png; name=snapshot.pngDownload
X��:�_�rG������z������Q��+�
h8�!^zh��#������
g�0e�������3&���Y��E�
2&��`L =H���#.�����W^��N��S��$(���}�
G���^��k8Q�4qV�4�m8j@�����`���$�>��,Z8�Xad�d����3���4}i����x��4nX$>`L �	�� ���3��`L�	0���7kg�H��>��+����^������e����L���K�3	`*.h8��E��=9��^����������>�0 cL�	d&vdf���L�	0&���7���/�	G
`O��m�y
z�#��D��� N�%��x	�e���t
L�"�+g��	����(����	0&��	����
_aL�	0&����  ��������#�)�oPN�b�2�
9���<G�C`���2`����`^_cr<&��Hv$	$�aL�	0&���`�N����>Mx�/�+	_=����z���_����3`<~�	0y":�p�:�8��rS1+��B~�H�e�����PD��"%I�P&�"�� 2&��`L #H�@�z��K����p
dAN������)���S��i���$RI4.��c����k
����	0�A!���:�<|tP��D�`L�	0&0\	���[�w� �ho�3�U�������!�����lq&��?�B�p�X����p|�%�����E�(B���B�m������T
� �9
���7&��@�F��nP�*)����{|�.oL�!�3Aq�A%@~���^|��T=��85���A�t)�3���@+Nh}Y�>��^:��	�5���O���
L�uh�@N����FhR�JX:GH�������fw"I
���"R����P�����c��`L@8u���$8��1�� �A6:
N~�#Rr�v�i\���@g'.4Hk
��/�S 1g~�ip4������!0_?8
_?XX8Hr�`L ��Y�B�,�	0&�RO�Fd������wM}�IN!�����pr*.���"�:�S��W�����+8R����%���!���	�C������uh�������1&��Hv�,�eL�	0��#���g��t�^���W�/2F9N))�xG
h
�JN�Cp���>��)p�Y���h����M�uB�����'��X`L�	$F������L�	0&�F�5
>��������Q�����*vB)w��z��Z!�7�R�$l�W�S��C�{�@�:�#�
\F#p�������U���HbL�	0&0P�Y0P�9&��#���
;$��$���o����G�M�����/�S��W�C`?9�� ��sN�4?���)��v9�y�9��A�`L�	�v�G9�L�	0&�d���-cDD����$�����q])��������.F��Q��:N%@o@�@�dp
/T�_dL�	0�� ����('��	0&�$���#$G����$����)����:�#�����3�0��h��8mr��
�����e���L�	0&�2�;2��Xw&��@:8~v6��7�Hg-Y��D�����ZS���S���x�6����i��NB�.,x��B�3�.�ic�����`L�	u�,�%��cL�	��k���Q��b����%@��u��/*��}@9��d8rF���	��Q����8m`
�mG�K�8+4�W?v��g#�O3&��>�Y0|��s�'�������g�a[;}"�����8����Yu�N��������Y�������	Y~��y|�'�?uO��~������d�����K#h�A���F
��)z����
gC�4,d��T9&O�o�:0��>���`L�	0�aL��������!p�8d�}�cc��>8~���.
�������b�����p��P� �C50�sBi�)�n�1|��(��Dn*�qu��F��0�b���[��u����/n(#��<uh�o���@����X�:.���Ls����p��u9��v�|��-=T��V&2��9�u3$!z�VftP�L:�jF���y��~�i�"w�}~cQ�C��
l!�<p�"�������.��.,�[�|�����������~�x�s'��_��/}m��7_U��cy*�$]f�cN���x��C@�P�$ �:"�,|�����)���i�N�p����|��`L�	��.��x{���}x�\��y������9��	��?����k�(��R�<+��N�6?��a��
����6��N��Z��v v^���2�@���N�`<���xV��
a���CM�m��1w��0]��z��[��/��>;J���Vz�u�W���r0�W�����N���"<�P8�k��|Q8��{^Q��'��]w�=�2z,�:������mw�6�I\�����?����?�\��k�;�dz���E�.���c-��l,�Q�>�PS*u����r����*y�`A�������W=C�`Mjp��D����*��!Y����C.A�P��t��=��_���j������{{{��~(��.qF��X����W$���
��R�K���=)2
�K��|W��]���/DS�?�(�����j�����ad��;gObV�
W�A�]
B�%�m�X��!]6�����s���`&�RL�o��k7����o-�j�Fn��R���)
��|y�0l<4��B�(�<;���wO�)jl5�K9S��6��9S���[b�st_��M?�����;�X��I����IK���V7"/��B�����'J7�_f�t�+.��S�d��5�vN������MnG�[��i�R��=�0R��I�T�@@�^Q� _r����Us�R������'�G���.W���v_��u����6���u����%�� �Z�{�{Dm�.o��7���)�h���mZYui��X`>��/\�Q�c��G�NB].���S���J����%]��[��M��dz��-����R�P����=�8 �.�8��N3,�������`��Ir�}M"�����j����������g����^zV�^t*�������v���!'��-�X���R�+�F�a*kW���B_�������\�P��r�$���>����b���8��%�Z�������l�����q�������O�O��8i�,x�Q������K����+D��t�+��9,���	�������1_��66�!0
������kB�g�O<)D��B|��QR�c��hM8��B���<Tr��`L`�H��]M��zE�G���Q�*S?��$��s�-�h��q�mv)��Q��
O}�j$c��&�h����x��O���@���j��T�-6l�so�2B���^���a�c{�h�(���N�t��+��R4��D��Y�/(��H�{���BF��M.Q�=����y��
�.4��f��>��i#���^�W@#�!c���Wa�Gg�N��3� �v��l*g��3t'�E�T�N���A��b:���"���	�9���^R������GUO�S��#�t5?`�#L0����,d`D;A�4U��0R���y��'�!R��>�Ug�J�*��R>Q�����j��e���z�`_��@�R�;�,t�%�Q���ums�;��a!�N�Dw^R=Y�q#7gz�EC����{Mu]��"q=��$]����KJ�����F���i�L��d9��e��)��)�a��T�����u�ti�8s�K�������������H�-���!�,x�o���o��!�)0j\�S�$�#��+�������]B�l�w�������@B�g^
�c���;�A!s��%�$g��s�f�4��^�Hf��m��U�W
�
Ml��F�
��m�}��P t��j�<��h#�V cR
PF5�<�n����
n��J����E���J�~�/*���C����x�i���8P��Uo���*����c���;�^X]� |�5��lj�B�
G�Qs�5'�wMo�����(����2Y�F�
���^��*��V,	9��X�j�u<i�k�M��,��V��R+��OA�GH��3�y
�I���,0�^�����k��MZ4�t�E�2���f}c��m4z��SO4m���F��Can��A�����9�'�L�6���g���������dz7���I�S>�St�T�*��ST�P����m������=��NTO'2I29����9��b���tt����i���6�NS�t�*������R����1@#�,>=�YOC gA���H'��M�����{YSS��]F��Spui�ef���o�7�	�q��V���7�b�S�@��.b�b!n��b���_kv1�O�#�yc�$���d�dYL�	$��:���)�M7��ggMy�(+Q����
�FCCKk�*�8���
K�c@K��v��5��Co��s��F�J�z�"��q.������C����U�b
��u
&�C�#���u#�'�����Ft����;XB#�{#
i��W�R@�-)��y�y�6����`i�x����d�R����kD:�z�l�o.�zd�S}����"�!��yT�QN,Z���S9#����Y;�Jsh$��&x����N�x�_��;7�-���s�n*�-������#���U��^�"3�D&��i05��",�u[�lG8�y���X�#�Za��t�f������qRk�t���lo�+5G���8eC�Uz�����@&9
qJ������],Gw��!��e@Tj������)N��v�w�r��q��V�3��D����T�~����5�Y@���Kq�	��~Da#����'2��Y@����V��G���5aN�J >=[�].D�:!~Z/�+�
\pA���Mu�7&�L�,H&M����@H��F��-������q-s��p^�y��Q�Q��Z�#� �n����f()��<�#B!NdA�?���������������\���O����pn���� �i�Q8�<��(��(���O��-�/�8�F��q1�������w���^?.�<�(��M��]{n�R4����?@-�2���;�_5����f
7��x3�F���J��Cp���|�:�q)`��`����~~���r���Oh{N��A�!�/m�=��6kYnX=���O���6���N�IZ�����d�-��l�i�B���C�}\��7x7|qb�Mg�������eLug?�|\�����5��o��,�����g��saa�v����F��������_���*g]�~p.�s��2���<�����w��d?�g���g`��f���Y���Ia.��U����Z����s�x�F��h1��~_����cP��.Xq�����p�-����=�L�wm�w���?��y�L����P��
��5����p"�c�'�I|�w��`E�x������/�����)���/{�����#M���9�N�Y�������{�;(#L���i�	����W��d��8�t[��������~_]~����0�M��$�RF�.J������w�������2�+� Z������l(���>�o�������_��)���SP0p:rJL�	0&����H�3a����[l�y��s�{l=O�j�zo*-F�T��u���k�}��:E9Sp�0��-H��B����o,�G=`�S��zm�8���XY�k���v�S��@���~�<�R��ra�v�7���%j��Zmt�1R�zp�om�E�E����Z�b:
���I7g#�i����=&������"3}=	
����Wi�����@7�RR����"u�Y {�0.-^��������,0Al�uf��R���?�b�s��zs�����kn�U^{_��
t�{�5��T��SIp!F�����X�S
��c��%�`}���u�jAN�"�KL]�`}[�TF�����$�����o�0M��]��9��n��	���~�u�u|�4_����:�O����R���Q�2����^O�a�����}�����B���
�3�c�p��4����d���e����a��m7� �uF/}��N�U��ST�?����[�QE��i�<oC��������5���g���g
q�!|N�C�9��3B����v�~�=��-�x�	����-�0&�RK )��
$���?����~�\hN�v��JQ]���K$��4��&O��1����q^���J�~c�h���8C�O
SW�"��@��u���]��
q�g{H�
_%�O��LF�����d��/3���_�9G�CO/Y������
S���F����*��5h���s�$��kZ��
\!�~�v�����^{H���F�Z�#G��C���G�L�J�ys���/X�����l���9M:�
��eE���V����Yj�hOi�����	|E��� HPQQ��9��K�U�uU.�CP�%(
,"aA �p�����$��P3�$�A*$��$�%�����������\9H��|����z��[����TU������)�Y�����M,g^�9��!H���yr:3��q9>2Y[��^����dz����"d�O�Z���dd9?-f�UL}�z��`3U�����}�����DT�pvNo6%�����Ef�8~�E�4P:����@!.n)�c@��G��
1w��c��!Q�/��aI������ ����!�@�$`�j:���#�T�v-��D8�#�H4��:%n��B��G��_���O��.���r��H��Z��r��c�.K��S��:��C���?W����V_e��:���F�	�_N{���X�0���^[^v����������i\� �/��\�z���Z=��[�)v�{���A�6���q���j(�`��aM�������?V��vj����Q�{����r�u�[������
�����t���:Hc6�
o����"dzK�:^����5��(�^[z��)���i�M���H_F�:������RIvG�M���I5����&NQ7B�?6�\@y{������Q�	�����?r'`q��W&�.�}��V�h:O���7G�l��`����������� ����yUa�+���2��l|��R�k�Qm�Ep�6�GT�K�����  P��u��@t
+kG-��&M���5�V�/����Q���x8
�&<@k�F��X
�D~-|i���M�Z�k��e4��������CD+9��U����U.�J9��{'�l�UZG�L�����VE����-�<z��>�c]��#s���6+q=i4{� =��������)���y�:|��G�M?<2�L�(~�V�L��V�]��N]�~�j��7U�|��}����C:�zT�L�:E-&�(���t
\�N6��x_��;]R��_@@@�|#p��eJ��jL�2����@/�|��<�5 ����W6�23��EY�������+X���=�6
Z�>z3�����j�,H�s ���������O�X8N�B6C��S�v��{P�D�n���}qT%|������e�!myo���JP�{W�D�F�.�������4j�"�.��K�"�v����Q��=�r�3H����D�:����+��R
k5�>���w��M|;
8���0���a���Ww[��O����0�xmjZ�?r�1�C�iL8|�NQX���C���0����^��-�����B�\K]���%�������w�A+����p�e�MzP�9���E&����g7�R���V�X�   g�@5t�LXS���~^.�
�f�0���/u�"����
u���8�t�>w�"l�G�z��^���Ol
����Wl�]��k�2$�;�g������4�_
���   �<�0�@@�7��A��%^��U��������t9?���P@@@��'p~�,8��9����`�M��E����8���T:�������	GA�0����  ���|E���P����
�U�����F�pT����   p�����/c�@�y�Wy�4�    P>�gA�p�8g�Yp�%2      �C���<E���9r,��     �$��e�|c
@��%P�=`��o���N������g���YG�A*�l^%�?G���m�h�"[ p�2�����E����BX8�Yp2�      ��� Z      �8��BFA@@@@@@ pCaA@@@@@@�< g�yP��"������C��`h!,��������,8
Y�`�Y-������X[E�	���EQ�_	�3��
a@d���     �L�\�I��hdLj9�qg�@��R����q�D:�$���<C��in�������u�6�4�^�a�_��>�=��f�8s-�B#X����<TP7������@(Wg�����[TA�Bl���v�y��UxJH��#�����4�V8�h{%�:sx�]�����n<5k���jN?��oZQ����������I��s���S.V���tU�p��dX	�5�r P.������iEQL�w&���-�q�F��`wu���������td9%�CH8�n���������p
iO�����������:��kS�����6wc�5<k�6h�i�����PJ�p�����)�}&�Q���j������L�$=�m���������W�4n���8}�X:(��L��4��g��yqvgl����3/v=E����]�s���]�����=���!?u���3�#��,����#�n+��������2
��f�X��F�b�mA�Ym(c����4�VM��c�q��R�
��������\�u~�y2�Gp}�����J����aw�Z���Y����4��t3��M	��������3�*��uD{mj�1-�\p3���~��%�s�4��������%T�zh�;tA��zx�z�z�p��K9h�3�D�&����-��
mOu��&��g��O��O�YB���Q��������5�]7L��@@@@@��D����
��yK��au\�]����!2����b�������\�����R�K��1*L�����(���&3��a�:-����0�#V��8���"n�pQ��h�]�DJ�&cB�H��i�-bL�:�
�*P�H��(������z���bg��f�l�|(!Z��m��-j�
?~�vWZ.�-O����N�k���d���*���d�u�Y�G�&�E���q�b\�p�}�f���c(�w:!
r3D\�uQ��c���^�iy��[\�U9^�^�!u����N�k2���EnA�8����nd��q����U^&;�^�V�>�d�
9?zy92Dbb��L����.,cKj�8��,��j������4�L��v��f��������!���Ltq��d%E+��#�K�Nq�}Z�!�DB�V�����
�\S�

���xpL�HX�K�i��*H�V�}��pQ�w�HN������Q�1~�e�����+��sK�����R��������!NA@�J�`��g�Vu�}���)�^�!Y���yY"9!N�8����s�������)&UL�p1x�[���b��Y��\&�Hw�U���`���+rs��D�Hsb������k���'�n�r�8Bl��pr����v:-���,yv/N�]r��OH�o�'�a�;��e<�bGG�=Uddg���C"��x�Q���N��X��d58� ^�����V:k��r��!�pY]4z�a X ��n�Xw�2c
��]6��t�H��s�@��������N-V��^���!���9���A9��:������U���gyo�f�3k� �}"g�^t�H�[Gc��uH�`d~���/�Y�����,[��-f��dt��p� ���y��D(�&���lWA@��	��2m^�a���D��N���i�i����������v4`�D��3S�.��6;����!vg���F?��ApP��	�`�"Y[������tUs������k*��|� *��5y��vmhH�}�uz,�X�z&.���-��%�4'n\������c �x�y��� �q��+�m�G����0�p���d�>��������c=>���z�
m��UW�@#GG���gy�uHWZ��1�2{��q=�c�7�A��BJ�B��4���Fa�������r�v8����;:�������QcY~������Z����@�3��?OV%�����\3=���6�^�#�n~�%��x� |	�@X���]�����h����|k���4W9���35j��z�@wLM&���t���\|���q3z��^4���H���<��#?�b��e-mia�������cb���y�W�����@K�� @����u��N������7�hd
Z7q�i��M��s��������F�{�k����I���AB��jtZ�����r�����;��$����s��8�F���P4�^���|���<���������H:-��xO�����k�����h�'�]�M����o����;+�����9'��2��������������=���b4�>��R��{�~V�w�>Zz/}��4z_�!��Q�Z_]��8��~���Q\(����5�r �Qf1��|y��~a�B7�8�Pv�OP�����f���Qz��Y���i4`�@���4���a��,��K�����2>���6�vo���/�>���E+�c��4��O�����e�)r�
�g��Ch��I�>Nw��>���4j�z��i������0��=_�������������h��}-�FF�8�@0��M�J����y��\k.�
�1TN���8���#�y
{�(pd���a4C� �@/h�����A$'�}���%�2���K"ZO�v�HJ������~y�������<�=+�.b�h{����y��4^���&gGj����t����S��:}9%qE�{��i�r��z�!�N�%�b�5���Ip&�-������	���������S�^���k��MKJh�������5�dZY�(&R��Z�yph{���Y���"9V��a�s_���x���7����]DO�������	}��)�~�.�8�#��{A����r�k�D	{�?���.b��=$.y^���DF*�e���H9�<����4��/P�,u����7��>�!�����=5���3,��{P���������21��/�y��>� A P�WiC��I�_����5r p.(�=���]����n�ab�f�So\�,��?T���
�b&���q9������BAZ�kc;9(yd��Q�\�H�c�.�p�\��������S�����Qv0�.y��Ms����x)��������fG�I��pr����]��5�z\�Sn�g�-v�	�D��~��G�LP���f27���,�u�?��r�*�#"rsHy_m��z��^��A��D�a:���v�8��8a�u���|���V�<k#=}�'�j�	�r������w�c$ ����H�DE��V1��/II��&�y������|�P2���@(��� � ����T����M�?���9���k��wOB+9e���E��Q����f�S���^tw�J}�:�-�����m	�*MS��^O�'����i\��Xc]�d��.�c���?���&���Xq<�gy���
���|�\�����SRW�<�,?��T�l}"q|
�7T 2�d?���@=1��e����������'����%����eT�E��(Z�|�B������B��p\AL'�)q4;&���E�g��4�����(� �B�<l^�(!������L���s�@�
�����k��g�(uJ���>��������+\NoZ@���@� ��(������}WH�o�T������\QZ�oC�Qu�������d[E�W]�s�)�z�zT�y�c����@9
�l9���Fn�Fzm�$�n������o�70����n�����^��[*���Y��{y�6X����*�n���S��O�Q�h9����,(s^xg�}�P��D!u�q�����agi�w|������(��@<������#}��C�m������^&e|�M[J.�H����������#�h�W?��-�>ky,s��|���]~a]<����q���O���.���Q���h��A��G����s=�E���I�b�!TOj��t�~����"�:v��w��	v�++�9+u��-Zd@��l�\*,C(>D�DUtfA%��tu���o(��J�FuUz�������T8*�+��������@�%gA�-:(      C����
�       Pm	�YPm���������@����b�B*������T[6�G�����
4<�������k�@@�Z�Yma�JC�j���N����@�jAu�Z�('�y�b\�c�N@�,C�fuA@@@@@@��	�YP��!@@@@@@�8�Y�A]�hpT4a��jF��jV`P@@@@@@*��M�A@@@@@@���������&gAE�|�f�,�fuA@@@@@@��	�YP��!�i�3�E�	��� �dfE����"T���EQN� �     P���� )���I�bY�V�:�j��������T��8r���r�(Nn���qt��%C`0�sz�ko��3����)4�����in�������u�6�4�^��_A@@@@��ru��J�EU-���T']�sP�W���F<Ca�/��%���}5���\�@&=?�(�d��.M���G���w�2;��KQ��R���fM���Q���S��$��     �@�\�9)�e�2������L�-�[h����1�n88(i�8m�����B��v�L�C��(<�=�nN���2�|�jA���e
'8�����f}���<����4����3���n�~�LW���$�<��N�����N�Xm�u����n�9��Uy�y�d1������t�����C���s���������MS��o������h�A�o�S9\f�=~�w[i
���x.��-�i���c��Aqg�������������k�^2��J?����2u�������z0h�z2��d���n�}��Uu`m�T��'l����������JwY�R2�fR���td������^����F;���u�8q�;��8���t��4Z6�=��6�l�JF��]��]S��-4�N=_8J��4���q@@@@@�	� �����8�)�����:��F;o5Cd8���k���"9N����d�t�]D��Pa�m�V�����������3�����8/I����{L[R3���d=����f��s"i��.��o����9�{8�v����E��f�HL>$�R���~�n��s5�����ec5���k�D�������bc�
��_��t
���(wS����{��B�T Vu���D���qB�X�V�_O%d9yR��U��hL������T��f1b�v�[P ������:Z(��kL�%��	*	Y�3���p�%��]�E�Ra��$��������^�X%�8?Q}���<!I$'0SNW��"U����t�7�c��Z}��A�a�����'$���T�~�0%����	�7)!NL����,va\��C��5������Xa(H�uh��1,\��KN�Re-�3�H]6@\4d���1��<aa/Q��V�z|��@U"`�NV%��K�#���\�r�A�7L��;9h��t��H9��
��
�"�P������`��qH��H�CA����{�(��Xm0�&9K���\�;��Q�����s�h�
�5UN�����Uh� (���<���u�:��d�*�)d�_J����Q�t�,J��
�������{j����Y9v����� �C_����>r����0bA�H��!���!{����9�d4���Uz���:���{w��vh����	���tn�2�ptH'�t-R�-%UW8����{�eys]���� y6;*�	���rbU�p.�1��[���ouF>����R�@IDAT'�|����sV�r4����>��E/���hL8<��+�@�"���U--�Mu"���\��]A�/5�oR�a
����N���i	4iT�m�?zq�����b�4��5}��|����w�X�7��f����*��*�c����2�i���\Wn����5��)����W|���/���j�^L
1�P�q���W�RL�
4��.������QH�����fn�a�n���v����D����;+i�M��Gs\�lM����K]B]��v����}�zNE[x��%��_'�����7��	�(���ta�JZV��x�u����AT��������k�D���x-�Ut�j��u4�mk���,:m
�Z�.�@���?�F�1�n��|���;�f'�p�"��j*uih3\�i��g_���5��{�5�q96�kG�}{~���Q�^���dgz�IN�$��@@@@@�,(�=���A��!r����������cG��Z4���8���}���O�	��5=�^���������~�%�������+�q����Wow%���}k��)���q�����uF������L��k��A_��.h��V��N���������aQ�8�-}�iw���R&e������k��z��JZB"�u���)l>7���O��������U�����K_�3��W{	h\���N��w�?;����e���n����j�<�C�i�x"�+�[Qm������]J���)G:\��]`Q�n��u����xG����QP\�.��3��|�����fW��}����.}���i��%��3����L.o����~)���zk��4m�|�\z�����     U�@0)�M������}��@�}��1��k:�>u~Dt"Oa/�l�jH������&��u9
����85�\�`��������r��%�k{K�����+�yM|jR�Z/-��R����3���������I�Z��j?^��S����j^��di�va���������:�����@N�_��k����j���N�-��S������O@�� �;����Q_�+uv�7�r�"&R[��:�r8�Y*�y��L'9v�Jg�G�&��<0�+�p���r})�YO��,���x%��/E����Rom���}�Y���	13�Rm	D�C�3[o?��U�DR�V/=��GP���Q���#����n��l�#����{���)B���X1�KH<��*���@U!���U
�Gu#���\��}A���gAq^��V��N6�u{�7kk��=�u��9_��0���X������<F���=����;k�����:���c��O5�b��R�J]N��
�t]�7!���6p�������������th��-���2�k�F��k�]�S:K�9�����y����J���U�����<�b�����s��
I�F�����#�g5;(��DZ�kSC]�����p����e�{����!Y�3n�6/T2��������������&����H6���8q���.zO(Y�����DE��V1�5}�SoRq@*��/�WyZ!��L���\z��o6��@';|s��
��es�Q�b~�9���\���}9�<�"�.��a|�����|��4d"<?A�������L�!_��a���5���)|P���!�Mu_���g��C�|��!���y��#Y~|Wa��+��_�I�������w�+g�����w���j
�/%��3�)q4;&���E�g��4�s�2���hM�1
���~r
u�%��@� ���U
-�Eu"p��:e
���*�Yp^���L��4Z8�1���6=�v|\F�����4�Tj�w������{8����o%^������#��!P�oC���TG�jj�m]���[e�^L?�;��/�*�E�����4xh]�������&�k���'t8��,��a�������\��oG��5F~2����r&gA9U���R�.M+B2d�������@��Q�) �jE��jU\P@@@@@@*���)�������@�"gA�*.(      O���g�@@@@@@@�Z���Z���'gA�3F
       P���j���c�E8'�v-���T+�YP����������@�jfA���@@@@@@@*�fTv	 }�b�,�bu@@@@@@@��	�YP�%��A@@@@@@���������&gAe���*F��*V P@@@@@@*���]H@@@@@@�8�X�@�lpTv	 }�b�,�bu@@@@@@@��	�YP�%��A@@@@@@���������&gAe���*F��*V P@@@@@@*���]H@@@@@@�8�X�@�lpTv	 }�b�,�bu@@@@@@@��	�YP�%��A@@@@@@��������4�qT1Z���������r�zn��s�<�wn�)=-��q�T.Et������O))I�3%�rr�%�T��UAN�.(���R�\�J���c���cg�6��8������<�|��@L8�l��`��H\Fq��������G�;���AM������0�:\����q�"��Y�*�����4r�	{��0�1�v��M[~-u�����tsP���������������SPP��,������`�6�U�UYoaEe�r���q��q�i�ZA�D-G{L�e
[�mI���Rz��ylC�fC�m��6�iO~���P������@����9(�v)�{�C�����*s���Rf�����j*����M=����S_Y�����|��ZL��OP=��R��3EZ��'/�G_�N��N�)]*���{�l���t�����f��	
���ig�J���2�U>x>��WQ������K��V����X+IA+{�X�A�����K��?�����G�3��C�/��;���ItQr�B��B����'=�t��!
�����`��}�r*oF� �)����Qp��Z��A}z?��r����+��vY����
6HE7�s�yv�t�R��t
���������V�Q���W�C��@C���q�S�
�p�4�Zp���)������U�!��:����,���YyT$c{ ��i`��4eSVE&��]yv�6��4��c&.M���
h]����M^8/���W.�%�^����mmp,�i�G����2U���(j�J�>if����>U�^�������!���V��-�6�z�+T	�t�Z;#s�a�+4��Px����$+A��@"L�X���AK��Y�NK�����Gu�Qv����R�f
K���N�K�#/O
�6,�7�=�U�(7;�l<�n����{HW���NI�$N����R}���N�����o��<�3��`�:�r�����)O����q8a��E�2�3���v�!mu��0[�fT"	=)�O9U����<������ya�
�8E,��_/��E��U����dy��FG
�5s����/e��oq���.h��/R�\�<KJ���2�g9�@?lEr��))���D=d�?w��Ok�/���u.��
�)y����^����3�?���;�����V��e����x�'J�uW�<������r��~F��P��`3<Yy�S���&�{2^>�=i3�l����S�-���hm�Y�-)Y���wiaJ.�Z�������<���ugK�U�%������������-������%jg>�����}s0��l������S����������^�im����.�>o���"����p��g������]u��j��4�x��{�����u>���3G�y����{�$�c���,�
���������S�R3_6V�7����._�?y��a�Q�v��lF�k�"�[�7'���8�m���L�G��M7�s����BE�L/n��%"��z��2%Y�I�:P���N�-�������M�n���c�fgsM����]����]��g���#7s�a���e�:c,1�������)Z?����v�����Z�+���T.zbN�=������l�]�����~��.������������>�QF�u%0n
��O~�G]J�O�k��q�J���{��(���Et2|,m\=��3�J:��"8�����QD�����-4l�}�R�^P#�^��w	O������W��Kg��(��E4��tu�P**�K���������VP��P8G�]DBEER�/���i����4������&����/D���=;��-�C�O��4���(��zTW�PHh��C�;L��������29�W�d�{�IZ��d�~��s��#��;��C2L�zD��hL:�����-��09��&=������&���7��S���3���7���<D���1�\��Q V�u���/�������:��:4Tl�>�
�����?#N�Lj����}��;\��'���:������H�u�,�����W�:bQ��Ir�XSX~���?����;��aF��z}�W�8��#����$
W��(��mNW���q�IN#2�zq���������r u���KN����[bE���K^JUqR7����v�����<�*vdi��%�1m��#u��������;@��1V���1���|�3N�[Y�2nq���dz2�^��u^�l�0���Hs9�X�l����3���|�w��e���[������4�*�<�2�C�1���+?nv�qAZ���b����0�n���&�w��E��m��;�%C>���o�D����z{�K��*�od��\�V������R�?yb�X37�R�k��lo��4�bm���������m������^�*�\��:��[=5o�Q���L�8/I#�>u����q�`h���F��2�%��v����\���jw��������%/�6F+�\w���y�>�"�G�����R�e��]��4[������D����e�O��m�����t�N��p�����:�:z�=��D�L������[�xo�0�n���S�,���Ls����xC�%���w����w���T��jo����]��=�1��O�R��?�E�Pw�J���F�n+{t��8���"��=��[�0d�t��_�
�o��mz����"T_�xM�Q�����Z<�+��6�(W����Ys\�M{F�Mfzv�}}���4��g7���~�n������g��������?���]]gM��F�eP;T��uO-�����^f����_�����GJ���Y������>��������s��%�M�|��������u�=%���[lZ7K���O�<uJ�X���'��}&��+^�fMTc�0�.2���[�
_�y!�i����{Q�������F+9�wj�'c�J1}TO�s�{�.��Oy�6�v�x���r���_��lG�~Y��-fM���~p�
������:5z��'S��[������Y�U���n�:�/,�@���k���u|����=��i��D����A�n���/�	C��s�rV�L�ClZ�����*�:�����'��<��Y 
�Tt��"�����v�!��t��|��k��j��gg��,C��F>��S����Y�A�!zb������j���j���^����Y"iC�s�_ �EIC6Cd8
X/�AO]�
B���F^9f]u�������:�������k������
���>����'�SE�6�����e/�.�9��b�26��'�9"
�/g��1~�������J������-���md%^�r���G����U^O�lW,���t���I��y)`Y��"�����@�O�������I��:���B�b�������==Y���\�]�l'�E��2��P�RU��3{��!��V��3��;���*~��'h���4����6*��&����e�/����9�h������^V�Ew=/�}�=���o�7��-��a��q:[_�hJg�s�t�����=����/:
`�n����GL�n]�L�������0%��0Y���n�E������>h�;8������Z���\t��Xo?�l�|�|?7����u����G���w��)W�J[����-�tu�}>���]���l���6c�k��v�C���.�3��~I�Q�����K�r��
Zp=�>�9+i��,o�8!
N���H��<���"O,`;'������=�.g�?��6��H&��e/�:��{-�%��^+-�5-��_����l�X����Ng����4Z���"��7I��-���n�59�y]�*����Ceh���#;��Se] t�z�*���l�����(=�����;����#Z`������m2-M�*�����?���^.#�j��	n��M�4K:p����N;*�oO=����6���m������nc#��;��h$��}��� \����2s9&Di����nR���J������@J��Q���Z���xXv�����u�(���Eg"��Y����a�)w���������WV�Q�}��,(���.����[�X�������3B�s�e�����l�Y�����e����e��6S��r��I�o����������������WDk�����-V
�P6�����W�o�-�gV���
�} u��co�j��{�0�r�u�h�=���)��������9��-t����n�\2��+Ars�\}�l����?�oR��+�>��#�<p�C��g�Y:[�:���~�����0�&����^��s���������`�Jv�����c�CLc�-�#Dg�|2��.�(Yo������~�Ke�_=�����S"��WT_<Pg���N{i��x=c����v�����:���Lr�j���#�Q���>P��������M������\v]��R9���]���Rv��D��|�k�*(�OT��(Y����WZ}�,��<4C�w��yb6q1t���(;}����0�!S���l�FOo7��9���������*gC�|9a����t���yqN�b!g<��d`��?�����w�%NV2�2z"i�Ao=��s��������(h@wyh��Z�)<�D���,�)��Y����gz�����\��u���z�s���k^���Zzx�@��Q��n �w�����H���w��y�'�xO��R�|OsHc'�d�������=?/��N��.�g4
��7M�~}���9|�;U���n����f�zK�j�u�"�L+���rp<�Is(��i/�zP�����1������hgY'�����_�$���_S��@��j1�u���2V8m����@��u������}Z=_z�/�s��@���4���9���Q��0�2K�������D�-9x�s�f�)�b9�D��$���l<��9q�vD�@s �����<m�����a���:��5I��u�fCqi��d��#�R�S����$C��A�>C��=g9�K�������z��MG����GvB�����,S��)[^��Mw_A����
V�?��0(�jw���v�y��n��M�^TuL���i�wkq{�_)�p{�"����#����@����Os��u�d��T�Os&i�u���M�����X���U�e�����]s;@�\��3��u�]4vJ���?[im��y��W?���6���?������]N	�����m����D�'��,���<�Os����l��kvvi��L(�L�{�L��l3������7u�>���:gA�+Nc���7�����em���x ���]sn�7c�]�[.Vm��V	g���:�m����a�|�#��U�������r�X�d,c�����Mw��mA��[�m�?���m����b�j�l���1S��N�����SV�����w����]�>��uD�����K�sp��d��!������79��y�%��f���~�UZ��`�|u�tc�t�Y��8�Nb5������MA�(�����9�A:�M��$�F���M���
�� ��w5�����8E�� 78�+-x����i����c�o"-xm<�s��L��/�F�U�����^&��`����h1�:�5&>�6���
!B�R�����LK�>F{�F�Uu�i�������S����Z�����I\x�Ky��>���s}n��n���.�X}�< j5�/�<��D~��N�����z���b:$�a��1������~OsM���������34��W��^��c�SL6��3��-��m�4���T��pZ���
.�������
��\�tRk�-S����!��qY~�;o.���sD���0���x{���8�w���4���$j�V�|C��B�o��l�����-7�E�������[Yv5�����Q������L�������KsG��?>y���D�
�C'�O���6�i��|���{`.�t��=���b�D+&�VM��-#g�7������t5�y(�t��.������4j�8Z�-�����6����*6����)�6���O	o��w]k�(����u���K��6�.�K7SG7jiZ�����R�NJ���o���qE$������������}�;f����[b����� ��}���7��Q=(<�==�%�����i�!���k��~?��&X3���3�����~���|���<m
������tQSw���Fwh��/{�U~g�y?�6��������|J��K�/�����M�Y0��y
y����(��<�:���i��)q���m���5�}���8cX�Q%�Y\�3a1����S��|�0�3
{��[����4����/�w�;),Z��z�z���d�v3q��fl��w����o>�������O�1�fO��.����������'[�G��@~)��e�����!����=�g$����6�����oDw��et���)�7��8�7�<p,%dm�����MX�[h�����n���x�^:.haR-zb��I���
����6�^�Fk���pJ�.���=;���s�����b�������W����JW?���cm���}��X;+���M�[9�xu\����G!i���Y@m�����0{_u��|�-��[.���s�����Q��y���r�l(g���X�_l]	����D�:��_���oh+����DDm}>����\nN���
���7�G3�1w�����i#W������������~��������n��m�J�{��L<6�3�>��-�yf�8��}t��z]���?7��
���/$[�V4p�s�W�2����^J��'R���hEJ>]r�_�a�^������O&,�����%���c��Y%�)�r���8��=����#vu
���r�=�(��@���]������G������E_�<���mLkS}�pO������E��G�U�z�aOz�EA�6o�o��O�4g�Y��7�����?�/>��>5���uemy�z��F���>N=D�g}J�T����C����m�����{{��}V������asg�3��)���g���1�~�U�,����u�\���T�r�x(*:CW��uU��#�F��b����x�St}�V��U+n�[��q��k�<�)�Y���,�Qpx��#�����@5�����)�2������N�
^�H|�I9.�tIK�t�y
�~]:��-;������w��Q���o�����a#���F�uS�RQ�09��/��?7��]������{������4�V���2>�W�v������u��|����w�z����������/���P
�[��=������W���*e�G�����vYn�h��8�*�w��t�����CG����U��T����:�O;;A�$�J7~M#����1jtnN����y�0b������VX�!�)]
��_;�J�x|3l��QH�)C����&�r���~�9�pAT�����G��=�zJ�������]��t��8�~����=�[�Rz
f�������W5q(��^?�sZ����fZ���>���N�������3?����#�&����J�_��B�s{kt8���3�nY>������R-����CS�����������C��:AG��Y��&b�-��.��O�5	��%}�+���������
�W��&kq�tm
����C�R��i�@q���������|��]������<��|��J��?���:]O�5`�iz��`����x�no��?����m��Z��F�V�Q��n���s�N�	���jI;��`:w���?�����X=�z��=���$���w"��/�mp��D���=�G���=t1�#����Q��~����w]A��r��:��/����[o>3�)��:}Yt��r��'J�1�q@��noa��q{]z����v5���G�h
e�FM������<������#����z#A�����.��QT��M.�,�~h��3����:����{����*�3�����
ZS�����k��M_�%}��1�s�;���r-?/e�+�%��W`��T�DY'�S��f�4�CY~�9-���z�P���4�A�FI�;�C����)����ob�uZ�@�q��~�o��8�O;�����5��������I�Qg��CYu����'�����/�����kS��A���;��~�M?D�P�g�n���S
K�/�c��L��=s�.e�����'�g{��ho�w=���@�T�}<u�v���ZC�dl�����i�G�������Ls@��+g�����m<l������Y?B�L�����s
��'Cc��6"�N������P(����8��1�7'x�����>C�Tyyc��3R�2i�:LE��<p����4jv�����,�#�����0���~Z��izkZ
��o/�������67��t�6z��m2��W���Q��5�t��T0�_���X���4���n��lw���o����x�T/)��{��0Q���Mm`��Z�����e_�P��>�����o�����&m����l�u}�i�U�_�=ek�/{��|���k~���^�k
r��Wp�E�Qx�{�!��={�'��/
g�7�p��f��0m�-��q���O�G�(@����f��1���|�a��~��c �����6�_���,�/km�
��(����-�����*H����qb�<�������t��[.�����A��*���k����QW���5�o��t��G���o��z����v�i/:v����v�LXO�k���)[����;�c���_��!�����+�f��#��}�@�����[��|k��{�r�g���+���Rm��R��6F����C}{�I{�GY���Cgc=}y��#+�~ZO���P
���*.�8���t���4'�G�L� �m1���t��(;���\O��o��p��*�������p��uj?4�d�����=M'�4�����&8(���e���4���������g��.�����;�~�A�x���������?6��{����<J����O���4u18��%.�S�yN�����-��u;��_�:u������XMw�:��8��/�c��=�{
��I����A����6�Z^=���7����#�A����#��\Y��hO{>���q����������0����6��Ui4�V8]��;�G�r�d�����&m��~�A�"W�|e��i}�Lz�����{h��t����<`��=t����t�Q:y�R���h�tX�`����ZX�J�-i���w�(Z�.w��U4G�~@��v5����X�Cz��6:�������t����a��������v�Q�k��G;��7�t�ZCC�?���'���}95�p�0���5��[��	��Y=��A�Y���?���NWl%~Q���m*�9KP+���{un�K�Fy���j���G����??2�:S�����Bsx6�5���Ce���E
����'e��z���}<
����+�q�2);3��Wm49@e��������������R���=��A�J�b��X]�O�2�B{���R����V���WoQ��X�����k�|�4����`Zt]
Zw�pZ����-�b'����,��m������5�Z��'�DQJ�i��������u�r���5�E�����Q��_�ve���by�������;O�|g�+����>vT��/{�Ik�|�c�i;h�&~�/�m�rJ|�t����t\���|�Z��{H.i�����p
�����'�[�)�yu�i)_]v�&>���hi�G������ji�1����m3i�zN�(q}�{=M�L�s����&���~������t��F����=`;�}�p����m���d�C��?�.i���q>4A����D�����^�M����J���l��>Br��<��P���u��Cj���>��6�����2�Y�nL8�Q��x<���g�@~�>�{ ]]����PN��[�A������w����g��������T�����D|]���u��m@@3��8�g �M�v�:�����������X�f���nm�o�u����#��eeI��KO�Jg,H+;��������;��3T��tm�.F�/��g=\i��q}}�K�3��7�r	�����o��~y�����g���
D�g�`�y��;��^�����;��{����i���]:�>u����/�^�J�;[�O�������9�?>������Ol���]����`�Z"u������[�l��,�f�l��ss���dLl�d����Q�t����8���t��}��B���_�s���Z�@��N�q���4�C���%)$j���I��f~��I����diC���G�i�{����}�c��6�����/bG��Co�=h�����g1���F���hY�9PD�j0uh���v���+_"7��$���~���`��$��T����M=��xn��o��m&��z�����	��MyC�5U�+WdB�1����8�F��S�W������n�&��j�v�+�t>��]�u�cb��Pz�|��b�� _�5�_�����}�N�n�*�_t�������|�Onv��������������e�o�O�������Aw������$���_�)7�1m��LH�u���YW�
A��S����D���S��];����+���.�����|���D�Z}B���r���]F��L�~�fQkx�'=}������Y*�ss����	��w����^���F�^��4FrC����)���ul���m\O��]����6�^?���uI��U.J���h��s#w!�z�}�]o�s��go7�^T��������s 7�2���W�%QW���������+�'�>�8��Y���'j�yJ}��jM�Y�Gy?�*=N�h�.2GD����vg~�a��.�����n~�Aw��^�t�%D�6h�a���x�:���2����r�G����*��6����*��iq��tDT�z�����/{�����:!w�������R�u�hx�Y�g_3n����2������#������jr��6��}�*�3r��x����w�
�����&���j�*��0����x�G��7����#������e�i����e�����.Ci�bW�����S�:f�}��V���o��5�����Ms��e��~U�cdj_j��X��p��{3l�����L�F���6s��k&���:IV��OR:hm�?���o
��6��S+~�g2��������&���\��x���/�������5_mr@v��y����]��K����9��-�te)[o�5���/�������������G������,�d����E���mXIV�K^��S'W�U�.����W��g_�m��??�O��|��m	�u������|#�������[\a�\����V��P���
Ln��_�71O3��@��o�j�����
��k����e��o���u}:7&�7&��+�����z���2l�^t��G�.�������i�M"����7 87oTa����?m�B�\���!w[�)ga�Y!�$��NS���8]� R�0��y(]������u�Y'V�*]_������lk]�R�}�jW���-���EY��U�������h��L��^>��[��.,2���'��}��${�Cd���$q8��y�B���0�7��_�=��������G��Y2u&�'�1�
��a�Ex���=��;�C�����d���_�����s�-���*����������`��GO'?Yh%J�[�x�/�dZ���?]�}?��[y�<��#_������,���9?�3������k@<�S�gU������@��+���KV���bnkL��XI��y�A�4���=p�4�k����O9����v��A�Av_�f����[�������J�EZ�))�ni2J�Y�@_\,��ye�C�w�^�7��S��:�=/��{����fgsh���|�z�~*�Pe���J^��[7�eo���������}
�\��������^9��G���z�k:^cz#8n�J5�3��k���;��c�����b	$=��jVT~�HE$��3%��������h-�+���Z���QY��%y9��-�x�\��OG���O7/)�������F�����nq������'����`_y���+�]D�9VL5�/"~I	]�}4�}K����=�E��yx�7����q��q�w�$W}���x���z�4|_	L�=}��u��j��W��/k%��P��5j��`��@d��P r���u��=#��~�����;�zd%.�'y��m�L��/gxZ��a�M�S�_~��'��{{��N��L����e����(��y�����<�-���:
$���b�Q6����_yex^\��������(�Y)M������1f��K�wp�X:�ez�S�l��S6��������}
�*B���t���*�YPE�	5�����!�V$��Y�Z�����<��9�:�?=�KW�F�V��c����Fy�V�3����S��(��v����t����yO�@��T��	��A_�X*� p�H[��z>��#�y�J���c�gQ!��nJ�Q�GUiJm���PU�QUX�3� �u��">�8��@U�;��\�k]�V=zx�.��e���# 7������w�A���My�������P� 6��)�����J]�N3i���t��]L�y$��  `"��O�=�(�����:�����dO�C
���
���&�e�R��A��p�n��xTEV|p
@��	T��)���=�1��F�"       PnJ��b�iA       U��U�8�������T>8*��������T)pT���2       P��,��2�       P��YP����������@�������������@�"gA�*(      �O���/h      U��U�8�������T>8*��������T)pT���2       P��,��2�       P��YP����������@�������������@�"gA�*(      �O���/h      U��U�8�������T>����P���t��,r����3��       P�	���(�3���3���J���R)`nP�
��85�P.@9�����f����Uj��/�4��"j�K��[�	j�eqqt�17���X#��9�.����
��������9�y��,�=�y�yQ����Z�N�qG�m
��b8 B�&�IDAT      �R"P"c���B�;+�������K$@$@$@$@$@$@$p�	�h��_���_����'      (E%2����6��JQ]�&      ��Jd,���0}       ��'@cA��5       �2E���2UT�H�H�H�H�H�H�J���_��H�H�H�H�H�H���TqP      (}4�~P      (Sh,(S�AeH�H�H�H�H�H�H��	�XP�e@
H�H�H�H�H�H�H�L���L�!      ��'@cA��5       �2E���2UT�H�H�H�H�H�H�J���_��H�H�H�H�H�H���TqP      (}���
��~L���]�L���{������8l��x����Z���6,o��
!���6��#h����\��$YV�=������b��8��FY%K�H�H�H�H����`7j�h���5|-����o=�FdaJ�y�P���#�������!8�<��9����yiLA*L�C��vE�����p��a�Y�o�J�P��5��c���X���J���
o�\�|o�>r6��i%^^�/��frc��3�k?c{�XS*����X�xs*��x���\�������bLD �������R`�{�,3F�+��`H�;E�G������7*"W������G�}�|X�E�L2a��TT�42�j�1r�{����5���t�rh����������mr'�+���F�`�+o
��q�	���e&/��i�Q���?/-s�{�����s��������#;[��s�Y��3L�=W���#�����?	gf�k�p�a]������������}���=WJ[���~��}��8�����r��V8BMI���E=�i1I�����?�f��Re�����������M��J�Le4?���"i�����:�L@pw��L��yt3&�6�0r�k�r8u�6-��y��R���Q����N��s�qa5D���y@q�KG�I��q�1vu�������yl�N����o����y�0��Ll�,M�P.��H2.e��7�Xp=m��H�~+�W�
\���Ka�	\�2�)���g�o��M�����5hxk����������6����)�H�H�H�h�lB��v�K�����s����m^����e�p��lY
��q�|#�l�JqO`�+]�J�%�jA�9m��z���^H��l��c��`U�����n���
�;�I��d�{r�jIY������#~��5���	���i�_U���:����ZNp��.�@�����<O�O�m��b�P�R��Fi�jb�PG�*����
�8�E���	����������<�U�l��S�k����e_4^i��I�3��7a���a����_���������8�6:�^��������x-`#6�|��5|Q�>���m��;�lV�CA�$�"{p=���2�4/�p��Ff��j�d��������o%oE�6{m)?�����Iy[��i�'��Q��
xUQO��
��e�f�y M:�F��1�d�n,�SY��_���b����K�H����F-�oxT-1����q������~��Q��)���_�X�R���x�����o<6���1�b��$N��c��D�`��o��W�������(��z;���!��%Rq��,��q`���t������$ ��%
O�w���~�h,��:����	�p��_9\�q��a�{>[������v<�A����nV�����8��U�<��5��W'�H=��^��^cSP�F���S`4   ��E�����ab�����Z?������L�=�*|}�-�e��
t�?
}����>�G������8��a�`��m�L�xm�<�����q��'L�����bh/���~O|
p�K��H��3�9�����;x"�3�'6����$��p��9#F������Y�Zz`��X����=0M���9T��q)F�����-^a�'�o�L��8��!��[tZ�P?�X�v�?��L��#Y���N.����2���������QZ��O�#'��>��0	�zI���>�y����EC;#,�-�mM��pf6{�C��3�J��t�Y���#0n�0K�?�����W
z���q�G������CD�nhS������������T{��3=A�x����>/��5:�����"���#��m�	���flI4;���O�z(]�_�Kqz�l�?[��������Z���<����E����Tn7G���h_�"C����u$�����VOTkTF�0����9���3&�� ~B�����uX������3Uf��������e��N�&�����uh/�hL^s��@td7��12�.e1�Sn{6'�G�	����L|�(�&�� ��S������Wza����|�0���/��90�[���]\�BN:><m`V�%�������^�Z
W��EhY����~�d��OD�p� M����WU�Z��z�n��Z���#�t]IUw�6�:o5�����*�C��U�����C�x�+�[�U�W1���>��BK���`P��U=m;���.�.�����n�����a�2���
*Fo�5NH��j_z�ZX�>�����JG3����2Uqz�����{o�I8Z�6���|t����
�|�Z�������S��3Q��p`�n�����j��/>���m���.���X{+�{��U[��.=��i�m6�3��<���;��]W��e}���u���M/������w�����-Op��6�A$@$@$p#x0��~1�?X�����wv�}f�j�vol��_E����XfO�I���z���r�������<������uK��mEPMB��#1z���|Y�t���t$���/�j�g}����x�
�G����|��uO
��
C����m�k��/r�)����"��M"������I��n�I]pb"�C�O�H�o��Tk�����t"��l��^!�]��������H�G�m���\�LW���xn����S��/�������x�e�1C����-
�O�.^];R����9X0�5m����F������Isp�r��mj�:,�����L��%��Ymp��<��;��~�t����>E���r�����$�'/��"�������I�Wxvbv�����sP������CZ�2����\����S�����h���.
��j��a�H��_v���L�6:7O��n1��C}R���=Y����<���$1	Y�P����2����B	�����&bp93W��zW�8T�q{9e?v��_�>��N���S���t��f����������N3��:p���u��\8�����d���E�H>���L�p��p�3����}A!��]����_h3�E�a����#���q�U�b��Q�zx;��o����<��! ��kx��G+t��TKh9x&�3gp�9hQ����n�������&���at�	j=�������}���C���#xu������Q�)FMu�����P<]���2���������������S��0f�y�X��Rwc���J�E�����F,��(N��!�0�x�kc����6��9��2T��->O.�$�.�x(����`�� �>��5k0E~kN�����"��o���B0:�
�����b���1��O��Z�c�����6��m���}��5<SR�*���[�S�1z�h|��@�������jb��M�������I�q&���kj~�n������
,��5S���xc�jD���xO�g��;	�	�	�@�	���K��W���W�:����n:Zg����~&7��D����
y��+N�:+��������T��� ���_3�m^��^��9Yo\!�=��`�-�C���O<�����|U��0�ws8�����n����e��m��x�{��=�c���H���b�(_��zy�����/�Q���(=���c���>Jtx�!���yW���eZ�����������11l1��}���#A����%�����g����:��i��L�Vz��.�k3����J�>$^��;���0g@s���mf���,���	�g����G���3Q�fl�t{���s�s�z9����W,�%�p��5��`���2r>N'/�Ay46%09F���h���o��n��c:��xP����L�%F��b�p���E����K1��t���>����b8�9�z�p�fc�2����\Y�`;|}� !{4x$f=S�/'�b���u�)QU=�h,K�GN�-�d4������q"Nn������`=K��|�e`\���`��_���/5q�[]���WM�8w:u�
}����g:���G�:�������n���������������O�-�i;������0���a�<����B^�|��������(�����x�\x���w�GG�i#�V�$N��s��/����Kn���!����88y�g�4��;:�{���oE�hQ�����>G�a���]���x��}�����,��Y����6-B�a�P���+E����=g b��h�4~~x��P��f�����)b��xi�.���3�}�yA���\)��PT_}���Y�5�����&`���!f�r�N���i�.�GF�7Z�x�.|O6~����_#/��������*I�p�G^&  ���-1(U�Leq�'53�;&;�D�Y�P;���l��s�`��,��xT�����{n��
��a< �����I�'G��gz��g�����/2Qm;��g�l������2�����0�����S��C�o
��e�J'�pwq�&4�Z�c��I��]S������Cf��ow���&`��-/�����Q��
�
����+>�2d������>|��Hv�{���3�,�����\��6b���PK\���ab�
������C�5G��!�t'S,3^��"�"/{����)uE���,������O��A��Am��:��K���X�e5�n_��~){c��������z��n�����+��G�c��#�g"{�`����6�)U�G�j{��X�N_7��'�/�k����DUz'�zp�_����f#i<�x�g�J��?>
?f����$���������,����E?O��b0����n���G�
�D�\�T�S��L����1j �~��|T0��	����3�s��A���\zRx��2��:reW��F�[���"��}/u���XL��'^�I�^M��b�������I;�S���'kZ�-��
�$�ogg[���.u/���89�����!�����Q m���O\-��1<��8:X�D����Q���?��Gu������r�S�JF�f��Iz���f&���n�!��1����Md������X�Ose-o��5?]9��?�g����,��A$@$@$pk�y�����4��1k-������2-���!�Q�������n����D���j ���8g����'�>�d5v�5����0A\4�aR
�L��}�W���Y%w����u���a^��U�<h�g�	k�e@�C���=r�/~!����7����������r��w.m��oD��q�D�e'��=#"7����,�6t��===��e|��e�e��y�y��M����;�zc��H����[�"?}
��a�.�����G=�����3�]x�)�z�2\8��/�J�z�_��!z�L��i�v�7����tS�
8�����������Q��7,O�UK5]1�L��,1�����O���:<�V1��U����m���N�j���K!����"Y���� �K��3C�h��0.����&b�����[���*e��7��6����������sj�����m��p{X�'b�x�?�/o��d��f"��=M\�\�����P���8j� ^A���*�1d�|�FA�|��r���j����9������
i���
����&�������=� �<��FqWL\����m��>�k;����vh�ePi�'�fs��c���F�o������C[�q�e��K^E/|b��P)��c��<�+P����9�9�2������e�����!��Sw}���8�t�c1�*�{?a�3�f�t��sH�����Wu����
�!��6p{�?i.�{�@�QI��lN�Mjz���1��^lQ������o�����i��b�1	�	�	��u�%��
���M�L-q�����'�`� m*���Aj�0G>������&k����q����'GvN&��{�>H;$i���,k�����]����y�q���*cK��z

D��~z�j��y�
���d/�����^���uH?�fX'�s���m��
����w��*�����q��2W��%s��=�:��l�	�Kf��?O�ONM����k������$K~���(�$�[���h����7���Hj�'��l��\�����?��^����p��p�+��~���&=)��IYd�l�������1C��~��3]~�l�."��T����:

������'|�#����N1�h���r��el9�7�
����g�$�N�k��?���z}�"��v�����i��&��
��x��N����^�d���)��E'yx�'�Dm���_7�/{���{�Y�f�z*��+����.�U����F&:4��6�
���4q�O?a��7��x�c.8$*�L_�������i>LU�X�a�����������\�*K]T���N���I��f;;�.d���e��ZWk�H��=���I��vzs�6(���*m3�!N�����ku~T]�b��ZM���fg�eY���"4zr��.�e�{����Kl�(�}�A�*��/�������q1:
y#��(�/�������}:��q:��iy��!Y� +�}��:p��������
�7
M��GO)�WANFj�>G���8����L�N9��O����d)���myZ�i;��;��-�����.�,���{���������7��������A�t����^yUg���Q.�;�"����%-���z'A��F�K�if���z��'}���-������T���~����k�o��)����oBqr*���L&���k?�,}N�G�M%����l1��IkSg���si����6�(
��<e�����
�����m�Z�h�5���a:�K�m��^aX�p&Y6�+-�Ga~[��>�.T6�����*�����t����i	���yF$@$@$ps%8�HN5�s�\�_�wF��������z�6��r��[�����Q����L-����s
��N��1F���K�D�+�F������}�c�Bi�3_�2���������x��#���p�[���
^��+���W�����2=v�S�?h�0i�q<�R������b��9��k�	#����4+���i[�W�5�;��W����Ct�i�����M?�"e8������� T��?��O�F_�iT
����U�MV�i���9��}�g\�/������<�{"��4��lc���{�����d�O<hW������R��L����u���������Ec���}U�-�bdxZ6"K���C^T]Yl>o/��uI������t��4���MO�����|���-��"���e��*���Qi4���D����������nA}�k_��1/�:��,������j�������>4�\���l�^{�*����O��X������sU�{�%�J�#<��d��yP~��E}�d������<F��C_P}�Tk}+X~��cQm�8=m���Mu�����`���Ya�d�~��Nv��%���}I���]N����	�X�[��]������&�\��d�S���c������@���S}��a�S��U�N�f���I�>S�-I��n�!}�b]m��vclI��HW��'��o�����<+>��X�G~c�:��-u�s���8���t��-��q����}����dA����7�����S3��G�=��4��p�
��/���bv�u^-;�,������/��3�]�j�����>��%��^
��?�����h�!yu};?��f_�P|���g?����@f��������LP�'Gff3}d_���f���<���)��K�E��)oOP3�~3�%KD<T~��/a!���s(��:{Q�#�on��<�.z���.�6�p�4��%6�����/�+)����XD��H!��}Y:�wv[�EF��_�����w���7'[qW��B�d^�+.�%����������E���o
�"	�[��j��z�g���`��#(�'�����1vwK�:�����.��	�	�	���% ����Cn��c�-����$�������h�r�<����'k=S�:���"u.�f������C\��H��$$����J����o���Qx��"���������w�����'�,�TuV��P�I�i�0@�)���'��N������LMD����M��]��O=(����6�+�*����x]�^�9�&!�]�z�O�������I�S����j���,�� *V/[+�o4��O���>���y�z����a�Pw��������^��$f���T�%�	�	�	��� p��7#L����#�������Z��7~�F���j,P�����z�B-7DN�G��z����	��$Pb5#phm$�g���xn,���0w�|����uk�|��^��gU aJ+�Yp���\g����]�r=E\	vq-u� �!6��'����]xw��.b��M%p��G�����������������������&�@��g��
P]�d��-'�`�%m����
���	�	���6�[���7s3	��jj�1��SY$�,X�����Y����i�>���9�F�n��Y?e���j�����1�h�[�!��z}�	��7����A�i�	�	��5�����#      ���%�N,�Z�A$@$@$@$@$@$@$PZh,(-��K$@$@$@$@$@$@e��e�`�	�	�	�	�	�	�	�J�<��	�	�	�	�	�	�@%@cA-�E$@$@$@$@$@$@�E����"O�$@$@$@$@$@$@$PF	�XPF�j�	�	�	�	�	�	�@i�����S.	�	�	�	�	�	�	�Q4����Z$@$@$@$@$@$@$PZh,(-��K$@$@$@$@$@$@e��e�`�	�	�	�	�	�	�	�J�<��	�	�	�	�	�	�@%P"c�����]-�Y�Z$@$@$@$@$@$@$@7�@��+���Y7B.�       (�Jd,�{.\�	�����~�O��"      �&C��$���o8q�,�s����KJ��aI�H�H�H�H�H�H��@�����C��#	�	�	�	�	�	�	���(�2�k��$@$@$@$@$@$@$@�4�^J�z�	�	�	�	�	�	��-"@c�-M1$@$@$@$@$@$@$�{!@c�����'	�	�	�	�	�	�	�"4�"�C$@$@$@$@$@$@����2$�g�IEND�B`�
#170Bruce Momjian
bruce@momjian.us
In reply to: David G. Johnston (#168)
Re: Document "59.2. Built-in Operator Classes" have a clerical error?

On Sun, Aug 2, 2020 at 08:43:53PM -0700, David G. Johnston wrote:

On Sun, Aug 2, 2020 at 8:17 PM osdba <mailtch@163.com> wrote:

hi all:

In Document "Table 59-1. Built-in GiST Operator Classes":

"range_ops any range type && &> &< >> << <@ -|- = @> @>", exist double "@>
",
 
Should be "<@ @>" ?

It helps to reference the current version of the page (or provide a URL
link), as that section seems to have migrated to Chapter 64, though it
is unchanged even on the main development branch.

The table itself is extremely difficult to read: it would be more easily
readable if the font were monospaced, but it's not.

I'm reasonably confident that the equal sign is part of the second-to-last
operator while the lone @> is the final operator.  Mostly I say this because
GiST doesn't do straight equality so a lone equal operator isn't valid.

I dug into this. I think this query explains why the duplicate is
there:

SELECT oprname, oprleft::regtype, oprright::regtype, oprresult::regtype
FROM pg_am
JOIN pg_opclass ON opcmethod = pg_am.oid
JOIN pg_amop ON opcfamily = pg_amop.amopfamily
JOIN pg_operator ON amopopr = pg_operator.oid
WHERE amname = 'gist'
AND opcname = 'range_ops'
ORDER BY 1;

 oprname | oprleft  |  oprright  | oprresult
---------+----------+------------+-----------
 &&      | anyrange | anyrange   | boolean
 &<      | anyrange | anyrange   | boolean
 &>      | anyrange | anyrange   | boolean
 -|-     | anyrange | anyrange   | boolean
 <<      | anyrange | anyrange   | boolean
 <@      | anyrange | anyrange   | boolean
 =       | anyrange | anyrange   | boolean
 >>      | anyrange | anyrange   | boolean
--> @>   | anyrange | anyrange   | boolean
--> @>   | anyrange | anyelement | boolean
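
One way to confirm that the two @> rows really are separate opclass
entries, and not a display artifact, is to also pull in
pg_amop.amopstrategy (a sketch against the same catalogs; the exact
strategy numbers are version-specific):

SELECT amopstrategy, oprname, oprright::regtype
FROM pg_am
JOIN pg_opclass ON opcmethod = pg_am.oid
JOIN pg_amop ON opcfamily = pg_amop.amopfamily
JOIN pg_operator ON amopopr = pg_operator.oid
WHERE amname = 'gist'
AND opcname = 'range_ops'
AND oprname = '@>'
ORDER BY amopstrategy;

The two rows come back with different strategy numbers: the
range-containment and element-containment strategies
(RANGESTRAT_CONTAINS and RANGESTRAT_CONTAINS_ELEM in rangetypes.h).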

Notice that @> appears twice. (I am not sure why @> appears twice in
the SQL output, while <@ appears only once.) The PG docs explain the
duplicate:

https://www.postgresql.org/docs/12/functions-range.html

 op | description             | example                                                        | result
----+-------------------------+----------------------------------------------------------------+--------
 @> | contains range          | int4range(2,4) @> int4range(2,3)                               | t
 @> | contains element        | '[2011-01-01,2011-03-01)'::tsrange @> '2011-01-10'::timestamp  | t
 <@ | range is contained by   | int4range(2,4) <@ int4range(1,7)                               | t
 <@ | element is contained by | 42 <@ int4range(1,7)                                           | f

There are anyrange/anyrange and anyrange/anyelement versions of both @>
and <@. Anyway, for the docs, I think we can either remove the
duplicate entry or modify it to clarify that one is for
anyrange/anyrange and the other for anyrange/anyelement. I suggest the
first option.
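
To see the two forms side by side, both can be tried directly (examples
that should run on any recent PostgreSQL; int4range is just a
convenient built-in range type):

-- anyrange @> anyrange: range containment
SELECT int4range(2,4) @> int4range(2,3);   -- t
-- anyrange @> anyelement: element containment
SELECT int4range(2,4) @> 3;                -- t
-- the element form of <@ puts the element on the left
SELECT 3 <@ int4range(2,4);                -- t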

--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EnterpriseDB https://enterprisedb.com

The usefulness of a cup is in its emptiness, Bruce Lee

#171Bruce Momjian
bruce@momjian.us
In reply to: Bruce Momjian (#170)
Re: Document "59.2. Built-in Operator Classes" have a clerical error?

Ah, it seems Tom has even more detail, so we will continue the
discussion on that thread.


--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EnterpriseDB https://enterprisedb.com

The usefulness of a cup is in its emptiness, Bruce Lee