Minor optimisation of XLogInsert()
Patch adds cacheline padding around parts of XLogCtl.
Idea from way back, but never got performance tested in a situation
where WALInsertLock was the bottleneck.
So this is speculative, in need of testing to investigate value.
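The shape of the change, in outline (a minimal sketch of the technique
only; the types and field names below are stand-ins, the real layout is
in the attached patch):

#define CACHELINE_SIZE 256

typedef struct InsertState { long currpos; long redoRecPtr; } InsertState;
typedef struct WriteState  { long result;  int  curridx;    } WriteState;

typedef struct SharedCtl
{
    /* hot for backends holding WALInsertLock */
    InsertState insert;
    char        insert_pad[CACHELINE_SIZE - sizeof(InsertState)];

    /* hot for whoever is doing XLogWrite */
    WriteState  write;
    char        write_pad[CACHELINE_SIZE - sizeof(WriteState)];
} SharedCtl;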
--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Attachment: XLogInsert_cacheline_opt.v3.patch (application/octet-stream)
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 352,358 **** typedef struct XLogCtlInsert
XLogPageHeader currpage; /* points to header of block in cache */
char *currpos; /* current insertion point in cache */
XLogRecPtr RedoRecPtr; /* current redo point for insertions */
! bool forcePageWrites; /* forcing full-page writes for PITR? */
/*
* exclusiveBackup is true if a backup started with pg_start_backup() is
--- 352,379 ----
XLogPageHeader currpage; /* points to header of block in cache */
char *currpos; /* current insertion point in cache */
XLogRecPtr RedoRecPtr; /* current redo point for insertions */
! } XLogCtlInsert;
!
! /*
! * Shared state data for XLogWrite/XLogFlush.
! */
! typedef struct XLogCtlWrite
! {
! XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
! int curridx; /* cache index of next block to write */
! pg_time_t lastSegSwitchTime; /* time of last xlog segment switch */
! } XLogCtlWrite;
!
! /*
! * Shared state data for backup details.
! * Protected by WALInsertLock but frequently accessed without lock.
! */
! typedef struct XLogCtlBackup
! {
! /*
! * Defines whether we are currently running a base backup.
! */
! bool forcePageWrites;
/*
* exclusiveBackup is true if a backup started with pg_start_backup() is
***************
*** 364,380 **** typedef struct XLogCtlInsert
bool exclusiveBackup;
int nonExclusiveBackups;
XLogRecPtr lastBackupStart;
! } XLogCtlInsert;
/*
! * Shared state data for XLogWrite/XLogFlush.
*/
! typedef struct XLogCtlWrite
! {
! XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
! int curridx; /* cache index of next block to write */
! pg_time_t lastSegSwitchTime; /* time of last xlog segment switch */
! } XLogCtlWrite;
/*
* Total shared-memory state for XLOG.
--- 385,397 ----
bool exclusiveBackup;
int nonExclusiveBackups;
XLogRecPtr lastBackupStart;
! } XLogCtlBackup;
/*
! * Set cacheline size big enough for heavyweight iron, since a lower setting
! * only saves a few bytes.
*/
! #define CACHELINE_SIZE 256
/*
* Total shared-memory state for XLOG.
***************
*** 384,401 **** typedef struct XLogCtlData
/* Protected by WALInsertLock: */
XLogCtlInsert Insert;
! /* Protected by info_lck: */
! XLogwrtRqst LogwrtRqst;
! XLogwrtResult LogwrtResult;
! uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
! TransactionId ckptXid;
! XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
! uint32 lastRemovedLog; /* latest removed/recycled XLOG segment */
! uint32 lastRemovedSeg;
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
/*
* These values do not change after startup, although the pointed-to pages
* and xlblocks values certainly do. Permission to read/write the pages
--- 401,417 ----
/* Protected by WALInsertLock: */
XLogCtlInsert Insert;
! char WALInsertLock_padding[CACHELINE_SIZE - sizeof(XLogCtlInsert)];
!
! XLogCtlBackup Backup;
!
! char backup_padding[CACHELINE_SIZE - sizeof(XLogCtlBackup)];
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
+ char WALWriteLock_padding[CACHELINE_SIZE - sizeof(XLogCtlWrite)];
+
/*
* These values do not change after startup, although the pointed-to pages
* and xlblocks values certainly do. Permission to read/write the pages
***************
*** 407,412 **** typedef struct XLogCtlData
--- 423,439 ----
TimeLineID ThisTimeLineID;
TimeLineID RecoveryTargetTLI;
+ char info_lck_padding[CACHELINE_SIZE];
+
+ /* Protected by info_lck: */
+ XLogwrtRqst LogwrtRqst;
+ XLogwrtResult LogwrtResult;
+ uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
+ TransactionId ckptXid;
+ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
+ uint32 lastRemovedLog; /* latest removed/recycled XLOG segment */
+ uint32 lastRemovedSeg;
+
/*
* archiveCleanupCommand is read from recovery.conf but needs to be in
* shared memory so that the checkpointer process can access it.
***************
*** 750,758 **** begin:;
* Decide if we need to do full-page writes in this XLOG record: true if
* full_page_writes is on or we have a PITR request for it. Since we
* don't yet have the insert lock, forcePageWrites could change under us,
! * but we'll recheck it once we have the lock.
*/
! doPageWrites = fullPageWrites || Insert->forcePageWrites;
INIT_CRC32(rdata_crc);
len = 0;
--- 777,786 ----
* Decide if we need to do full-page writes in this XLOG record: true if
* full_page_writes is on or we have a PITR request for it. Since we
* don't yet have the insert lock, forcePageWrites could change under us,
! * but we'll recheck it once we have the lock. Be careful not to touch
! * the cachelines being used by processes holding WALInsertLock.
*/
! doPageWrites = fullPageWrites || XLogCtl->Backup.forcePageWrites;
INIT_CRC32(rdata_crc);
len = 0;
***************
*** 898,904 **** begin:;
* just turned off, we could recompute the record without full pages, but
* we choose not to bother.)
*/
! if (Insert->forcePageWrites && !doPageWrites)
{
/* Oops, must redo it with full-page data */
LWLockRelease(WALInsertLock);
--- 926,932 ----
* just turned off, we could recompute the record without full pages, but
* we choose not to bother.)
*/
! if (XLogCtl->Backup.forcePageWrites && !doPageWrites)
{
/* Oops, must redo it with full-page data */
LWLockRelease(WALInsertLock);
***************
*** 974,980 **** begin:;
* defining it like this leaves the info bit free for some potential other
* use in records without any backup blocks.
*/
! if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
info |= XLR_BKP_REMOVABLE;
/*
--- 1002,1008 ----
* defining it like this leaves the info bit free for some potential other
* use in records without any backup blocks.
*/
! if ((info & XLR_BKP_BLOCK_MASK) && !XLogCtl->Backup.forcePageWrites)
info |= XLR_BKP_REMOVABLE;
/*
***************
*** 8018,8024 **** CreateRestartPoint(int flags)
}
/*
! * Update the shared RedoRecPtr so that the startup process can calculate
* the number of segments replayed since last restartpoint, and request a
* restartpoint if it exceeds checkpoint_segments.
*
--- 8046,8052 ----
}
/*
! * Update both shared RedoRecPtr so that the startup process can calculate
* the number of segments replayed since last restartpoint, and request a
* restartpoint if it exceeds checkpoint_segments.
*
***************
*** 8853,8859 **** do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
if (exclusive)
{
! if (XLogCtl->Insert.exclusiveBackup)
{
LWLockRelease(WALInsertLock);
ereport(ERROR,
--- 8881,8887 ----
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
if (exclusive)
{
! if (XLogCtl->Backup.exclusiveBackup)
{
LWLockRelease(WALInsertLock);
ereport(ERROR,
***************
*** 8861,8871 **** do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
errmsg("a backup is already in progress"),
errhint("Run pg_stop_backup() and try again.")));
}
! XLogCtl->Insert.exclusiveBackup = true;
}
else
! XLogCtl->Insert.nonExclusiveBackups++;
! XLogCtl->Insert.forcePageWrites = true;
LWLockRelease(WALInsertLock);
/* Ensure we release forcePageWrites if fail below */
--- 8889,8899 ----
errmsg("a backup is already in progress"),
errhint("Run pg_stop_backup() and try again.")));
}
! XLogCtl->Backup.exclusiveBackup = true;
}
else
! XLogCtl->Backup.nonExclusiveBackups++;
! XLogCtl->Backup.forcePageWrites = true;
LWLockRelease(WALInsertLock);
/* Ensure we release forcePageWrites if fail below */
***************
*** 8910,8918 **** do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
* either because only few buffers have been dirtied yet.
*/
LWLockAcquire(WALInsertLock, LW_SHARED);
! if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
{
! XLogCtl->Insert.lastBackupStart = startpoint;
gotUniqueStartpoint = true;
}
LWLockRelease(WALInsertLock);
--- 8938,8946 ----
* either because only few buffers have been dirtied yet.
*/
LWLockAcquire(WALInsertLock, LW_SHARED);
! if (XLByteLT(XLogCtl->Backup.lastBackupStart, startpoint))
{
! XLogCtl->Backup.lastBackupStart = startpoint;
gotUniqueStartpoint = true;
}
LWLockRelease(WALInsertLock);
***************
*** 9003,9021 **** pg_start_backup_callback(int code, Datum arg)
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
if (exclusive)
{
! Assert(XLogCtl->Insert.exclusiveBackup);
! XLogCtl->Insert.exclusiveBackup = false;
}
else
{
! Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
! XLogCtl->Insert.nonExclusiveBackups--;
}
! if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0)
{
! XLogCtl->Insert.forcePageWrites = false;
}
LWLockRelease(WALInsertLock);
}
--- 9031,9049 ----
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
if (exclusive)
{
! Assert(XLogCtl->Backup.exclusiveBackup);
! XLogCtl->Backup.exclusiveBackup = false;
}
else
{
! Assert(XLogCtl->Backup.nonExclusiveBackups > 0);
! XLogCtl->Backup.nonExclusiveBackups--;
}
! if (!XLogCtl->Backup.exclusiveBackup &&
! XLogCtl->Backup.nonExclusiveBackups == 0)
{
! XLogCtl->Backup.forcePageWrites = false;
}
LWLockRelease(WALInsertLock);
}
***************
*** 9073,9079 **** do_pg_stop_backup(char *labelfile, bool waitforarchive)
*/
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
if (exclusive)
! XLogCtl->Insert.exclusiveBackup = false;
else
{
/*
--- 9101,9107 ----
*/
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
if (exclusive)
! XLogCtl->Backup.exclusiveBackup = false;
else
{
/*
***************
*** 9082,9095 **** do_pg_stop_backup(char *labelfile, bool waitforarchive)
* backups, it is expected that each do_pg_start_backup() call is
* matched by exactly one do_pg_stop_backup() call.
*/
! Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
! XLogCtl->Insert.nonExclusiveBackups--;
}
! if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0)
{
! XLogCtl->Insert.forcePageWrites = false;
}
LWLockRelease(WALInsertLock);
--- 9110,9123 ----
* backups, it is expected that each do_pg_start_backup() call is
* matched by exactly one do_pg_stop_backup() call.
*/
! Assert(XLogCtl->Backup.nonExclusiveBackups > 0);
! XLogCtl->Backup.nonExclusiveBackups--;
}
! if (!XLogCtl->Backup.exclusiveBackup &&
! XLogCtl->Backup.nonExclusiveBackups == 0)
{
! XLogCtl->Backup.forcePageWrites = false;
}
LWLockRelease(WALInsertLock);
***************
*** 9294,9306 **** void
do_pg_abort_backup(void)
{
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
! Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
! XLogCtl->Insert.nonExclusiveBackups--;
! if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0)
{
! XLogCtl->Insert.forcePageWrites = false;
}
LWLockRelease(WALInsertLock);
}
--- 9322,9334 ----
do_pg_abort_backup(void)
{
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
! Assert(XLogCtl->Backup.nonExclusiveBackups > 0);
! XLogCtl->Backup.nonExclusiveBackups--;
! if (!XLogCtl->Backup.exclusiveBackup &&
! XLogCtl->Backup.nonExclusiveBackups == 0)
{
! XLogCtl->Backup.forcePageWrites = false;
}
LWLockRelease(WALInsertLock);
}
On Tue, Nov 15, 2011 at 6:24 AM, Simon Riggs <simon@2ndquadrant.com> wrote:
Patch adds cacheline padding around parts of XLogCtl.
Idea from way back, but never got performance tested in a situation
where WALInsertLock was the bottleneck. So this is speculative, in need of testing to investigate value.
I kicked off a round of benchmarking around this. I'll post results
in a few hours when the run finishes.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Wed, Nov 16, 2011 at 9:33 AM, Robert Haas <robertmhaas@gmail.com> wrote:
On Tue, Nov 15, 2011 at 6:24 AM, Simon Riggs <simon@2ndquadrant.com> wrote:
Patch adds cacheline padding around parts of XLogCtl.
Idea from way back, but never got performance tested in a situation
where WALInsertLock was the bottleneck. So this is speculative, in need of testing to investigate value.
I kicked off a round of benchmarking around this. I'll post results
in a few hours when the run finishes.
I thought that the best way to see any possible benefit of this patch
would be to use permanent tables (since unlogged tables won't
generate enough XLogInsert traffic to throttle the system) and lots
of clients. So I ran tests at 32 clients and at 80 clients. There
appeared to be a very small speedup.
resultswp.master.32:tps = 10275.238580 (including connections establishing)
resultswp.master.32:tps = 10231.634195 (including connections establishing)
resultswp.master.32:tps = 10220.907852 (including connections establishing)
resultswp.master.32:tps = 10115.534399 (including connections establishing)
resultswp.master.32:tps = 10048.268430 (including connections establishing)
resultswp.xloginsert-padding.32:tps = 10310.046371 (including connections establishing)
resultswp.xloginsert-padding.32:tps = 10269.839056 (including connections establishing)
resultswp.xloginsert-padding.32:tps = 10259.268030 (including connections establishing)
resultswp.xloginsert-padding.32:tps = 10242.861923 (including connections establishing)
resultswp.xloginsert-padding.32:tps = 10167.947496 (including connections establishing)
Taking the median of those five results, the patch seems to have sped
things up by 0.3%. At 80 clients:
resultswp.master.80:tps = 7540.388586 (including connections establishing)
resultswp.master.80:tps = 7502.803369 (including connections establishing)
resultswp.master.80:tps = 7469.338925 (including connections establishing)
resultswp.master.80:tps = 7505.377256 (including connections establishing)
resultswp.master.80:tps = 7509.402704 (including connections establishing)
resultswp.xloginsert-padding.80:tps = 7541.305973 (including connections establishing)
resultswp.xloginsert-padding.80:tps = 7568.942936 (including connections establishing)
resultswp.xloginsert-padding.80:tps = 7533.128618 (including connections establishing)
resultswp.xloginsert-padding.80:tps = 7558.258839 (including connections establishing)
resultswp.xloginsert-padding.80:tps = 7562.585635 (including connections establishing)
Again taking the median of the five results, that's about a 0.7% speedup.
I also tried this over top of my flexlock patch. I figured that the
benefit ought to be greater there, because with ProcArrayLock
contention reduced, WALInsertLock contention should be the whole
banana. But:
resultswp.flexlock.32:tps = 11628.279679 (including connections establishing)
resultswp.flexlock.32:tps = 11556.254524 (including connections establishing)
resultswp.flexlock.32:tps = 11606.840931 (including connections establishing)
resultswp.flexlock-xloginsert-padding.32:tps = 11357.904459 (including connections establishing)
resultswp.flexlock-xloginsert-padding.32:tps = 11226.538104 (including connections establishing)
resultswp.flexlock-xloginsert-padding.32:tps = 11187.932415 (including connections establishing)
That's about a 3.3% slowdown.
resultswp.flexlock.80:tps = 11560.508772 (including connections establishing)
resultswp.flexlock.80:tps = 11673.255674 (including connections establishing)
resultswp.flexlock.80:tps = 11597.229252 (including connections establishing)
resultswp.flexlock-xloginsert-padding.80:tps = 11092.549726 (including connections establishing)
resultswp.flexlock-xloginsert-padding.80:tps = 11179.927207 (including connections establishing)
resultswp.flexlock-xloginsert-padding.80:tps = 11112.771332 (including connections establishing)
That's even a little worse.
One possible explanation is that I don't believe that what you've done
here is actually sufficient to guarantee alignment on a cache-line
boundary - ShmemAlloc() only guarantees MAXALIGN alignment unless the
allocation is at least BLCKSZ bytes. So depending on exactly how much
shared memory gets allocated before XLogCtlData gets allocated, it's
possible that the change could actually end up splitting all of the
things you're trying to put on their own cache line across two cache
lines. Might be worth fixing that and running this through its paces
again.
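For concreteness, one way to do that for XLogCtl - untested, and assuming
the existing ShmemAlloc() and TYPEALIGN() interfaces - is to over-allocate
and round the start of the struct up to a cache-line boundary:

#define CACHELINE_SIZE 256

static XLogCtlData *
AllocAlignedXLogCtl(Size size)
{
    /* ask for enough slack to be able to round the pointer up */
    char   *raw = ShmemAlloc(size + CACHELINE_SIZE - 1);

    /* TYPEALIGN rounds up to the requested power-of-two boundary */
    return (XLogCtlData *) TYPEALIGN(CACHELINE_SIZE, raw);
}

The shared-memory size estimate for xlog would of course have to report
the extra CACHELINE_SIZE - 1 bytes as well.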
(I wonder if we shouldn't just align every shared memory allocation to
64 or 128 bytes. It wouldn't waste much memory and it would make us
much more resistant to performance changes caused by minor
modifications to the shared memory layout.)
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Wed, Nov 16, 2011 at 8:42 PM, Robert Haas <robertmhaas@gmail.com> wrote:
Taking the median of those five results, the patch seems to have sped
things up by 0.3%. At 80 clients:
Thanks for doing that. Even if we fix it as you suggest it seems to
indicate that the WALInsertLock contention is real rather than false
contention. Which lends some clues as to how to proceed.
One possible explanation is that I don't believe that what you've done
here is actually sufficient to guarantee alignment on a cache-line
boundary - ShmemAlloc() only guarantees MAXALIGN alignment unless the
allocation is at least BLCKSZ bytes. So depending on exactly how much
shared memory gets allocated before XLogCtlData gets allocated, it's
possible that the change could actually end up splitting all of the
things you're trying to put on their own cache line across two cache
lines. Might be worth fixing that and running this through its paces
again.
(I wonder if we shouldn't just align every shared memory allocation to
64 or 128 bytes. It wouldn't waste much memory and it would make us
much more resistant to performance changes caused by minor
modifications to the shared memory layout.)
I'll look at that, thanks for the suggestion.
--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
On Wed, Nov 16, 2011 at 5:15 PM, Simon Riggs <simon@2ndquadrant.com> wrote:
On Wed, Nov 16, 2011 at 8:42 PM, Robert Haas <robertmhaas@gmail.com> wrote:
Taking the median of those five results, the patch seems to have sped
things up by 0.3%. At 80 clients:
Thanks for doing that. Even if we fix it as you suggest it seems to
indicate that the WALInsertLock contention is real rather than false
contention. Which lends some clues as to how to proceed.
Sure thing.
My general impression from playing around with this over the last 6-7
months is that cache line sharing is just not that big a problem for
us. In a case like WALInsertLock, I'm fairly certain that we're
actually putting processes to sleep on a regular basis - and the
overhead of a system call to go to sleep and another one to wake up
and the associated context switches dwarfs the cost of passing the
cache line around. It's far more important to get rid of the sleeping
(which, sadly, is far harder than padding out the data structures).
There are some cases where the cache line really does seem to matter -
e.g. Pavan's PGPROC_MINIMAL patch delivers excellent results on
Itanium - but those seem to be fairly rare, FWICT.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Robert Haas <robertmhaas@gmail.com> writes:
(I wonder if we shouldn't just align every shared memory allocation to
64 or 128 bytes. It wouldn't waste much memory and it would make us
much more resistant to performance changes caused by minor
modifications to the shared memory layout.)
I could get behind this idea if we had a reasonably clear idea of the
hardware's cache line width, but AFAIK there is no portable way to
identify that. (This is a pretty fatal objection to Simon's original
patch as well...)
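(To illustrate the problem: on Linux with glibc you can ask at run time,
as in the sketch below, but that sysconf name is a GNU extension, and
every other platform needs its own incantation or a compile-time guess.
Sketch only, not a proposal.)

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
#ifdef _SC_LEVEL1_DCACHE_LINESIZE
    /* GNU extension; may also return 0 if the kernel doesn't know */
    long    linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);

    printf("L1 dcache line size: %ld\n", linesize);
#else
    printf("no standard way to ask; we'd have to guess at compile time\n");
#endif
    return 0;
}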
regards, tom lane
On Wed, Nov 16, 2011 at 11:11 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
Robert Haas <robertmhaas@gmail.com> writes:
(I wonder if we shouldn't just align every shared memory allocation to
64 or 128 bytes. It wouldn't waste much memory and it would make us
much more resistant to performance changes caused by minor
modifications to the shared memory layout.)
I could get behind this idea if we had a reasonably clear idea of the
hardware's cache line width, but AFAIK there is no portable way to
identify that. (This is a pretty fatal objection to Simon's original
patch as well...)
I don't think it's very important to know the exact size. As far as I
can tell, x64 is 64 bytes and Itanium and Power are 128 bytes. If you
optimize for those, you'll also handle any smaller size (that's a
power of two, without which a lot of things we do are wrongheaded)
without wasting much memory. If you run into hardware with a giant
256-byte or larger cache line, you might have some sharing, but you
can't win 'em all.
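A sketch of the arithmetic, with a made-up macro name (not an existing
definition), to show why picking the largest plausible line size is enough:

#include <assert.h>
#include <stdint.h>

#define MAX_CACHELINE_SIZE 128      /* assume the biggest common case */
#define CACHELINE_ALIGN(LEN) \
    (((uintptr_t) (LEN) + MAX_CACHELINE_SIZE - 1) & \
     ~((uintptr_t) (MAX_CACHELINE_SIZE - 1)))

int
main(void)
{
    uintptr_t   off = CACHELINE_ALIGN(1000);    /* rounds up to 1024 */

    assert(off % 128 == 0);
    assert(off % 64 == 0);      /* smaller power-of-two sizes follow */
    return 0;
}

Anything aligned to 128 is automatically aligned to 64, 32, and so on,
since they all divide 128; only a hypothetical line bigger than 128 bytes
could still end up shared.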
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company