Speed up COMMIT PREPARED
Hi,
I noticed that COMMIT PREPARED command is slow in the discussion [1]/messages/by-id/20191206.173215.1818665441859410805.horikyota.ntt@gmail.com.
First, I made the following simple script for pgbench.
``` prepare.pgbench
\set id random(1, 1000000)
BEGIN;
UPDATE test_table SET md5 = md5(clock_timestamp()::text) WHERE id = :id;
PREPARE TRANSACTION 'prep_:client_id';
COMMIT PREPARED 'prep_:client_id';
```
I run the pgbench as follows.
```
pgbench -f prepare.pgbench -c 1 -j 1 -T 60 -d postgres -r
```
The result is following.
<Result in master branch>
tps: 287.259
Latency:
UPDATE 0.207ms
PREPARE TRANSACTION 0.212ms
COMMIT PREPARED 2.982ms
Next, I analyzed the bottleneck using pstack and strace.
I noticed that the open() during COMMIT PREPARED takes 2.7ms.
Furthermore, I noticed that the backend process almost always open the same wal segment file.
When COMMIT PREPARED command, there are two ways to find 2PC state data.
- If it is stored in wal segment file, open and read wal segment file.
- If not, read 2PC state file
The above script runs COMMIT PREPARED command just after PREPARE TRANSACTION command.
I think it also won't take long time for XA transaction to run COMMIT PREPARED command after running PREPARE TRANSACTION command.
Therefore, I think that the wal segment file which is opened during COMMIT PREPARED probably be the current wal segment file.
To speed up COMMIT PREPARED command, I made two patches for test.
(1) Hold_xlogreader.patch
Skip closing wal segment file after COMMIT PREPARED command completed.
If the next COMMIT PREPARED command use the same wal segment file, it is fast since the process need not to open wal segment file.
However, I don't know when we should close the wal segment file.
Moreover, it might not be useful when COMMIT PREPARED command is run not so often and use different wal segment file each time.
<Result in Hold_xlogreader.patch>
tps: 1750.81
Latency:
UPDATE 0.156ms
PREPARE TRANSACTION 0.184ms
COMMIT PREPARED 0.179ms
(2) Read_from_walbuffer.patch
Read the data from wal buffer if there are still in wal buffer.
If COMMIT PREPARED command is run just after PREPARE TRANSACTION command, the wal may be in the wal buffer.
However, the period which the wal is in the wal buffer is not so long since wal writer recycle the wal buffer soon.
Moreover, it may affect the other performance such as UPDATE since it needs to take lock on wal buffer.
<Result in Read_from_walbuffer.patch>
tps: 446.371
Latency:
UPDATE 0.187ms
PREPARE TRANSACTION 0.196ms
COMMIT PREPARED 1.974ms
Which approach do you think better?
[1]: /messages/by-id/20191206.173215.1818665441859410805.horikyota.ntt@gmail.com
Regards,
Ryohei Takahashi
Attachments:
Hold_xlogreader.patchapplication/octet-stream; name=Hold_xlogreader.patchDownload
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 29980d56ac..f67244183b 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1355,15 +1355,25 @@ static void
XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
{
XLogRecord *record;
- XLogReaderState *xlogreader;
+ static XLogReaderState *xlogreader = NULL;
char *errormsg;
TimeLineID save_currtli = ThisTimeLineID;
- xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
- XL_ROUTINE(.page_read = &read_local_xlog_page,
- .segment_open = &wal_segment_open,
- .segment_close = &wal_segment_close),
- NULL);
+ if (!xlogreader)
+ {
+ /*
+ * Create xlogreader on TopMemoryContext to prevent open the same
+ * wal segment many times.
+ */
+ MemoryContext oldctx = MemoryContextSwitchTo(TopMemoryContext);
+ xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
+ XL_ROUTINE(.page_read = &read_local_xlog_page,
+ .segment_open = &wal_segment_open,
+ .segment_close = &wal_segment_close),
+ NULL);
+ MemoryContextSwitchTo(oldctx);
+ }
+
if (!xlogreader)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -1398,8 +1408,6 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
*buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader));
memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader));
-
- XLogReaderFree(xlogreader);
}
Read_from_walbuffer.patchapplication/octet-stream; name=Read_from_walbuffer.patchDownload
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c7c928f50b..72868779e9 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -12989,3 +12989,46 @@ XLogRequestWalReceiverReply(void)
{
doRequestWalReceiverReply = true;
}
+
+/*
+ * Read the data from WAL buffer
+ * If the page we want is already recycled, return false
+ */
+bool
+XLogReadBuffer(char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
+{
+ int idx;
+ uint32 offset;
+ XLogPageHeader TargetPage;
+
+ /*
+ * First, we should prevent AdvanceXLInsertBuffer
+ */
+ LWLockAcquire(WALBufMappingLock, LW_SHARED);
+
+ /*
+ * Check wheter the page we want is still on WAL buffer
+ */
+ idx = XLogRecPtrToBufIdx(startptr);
+ TargetPage = (XLogPageHeader) (XLogCtl->pages + idx * (Size) XLOG_BLCKSZ);
+
+ if (TargetPage->xlp_tli != tli || TargetPage->xlp_pageaddr > startptr
+ || TargetPage->xlp_pageaddr + (Size) XLOG_BLCKSZ < startptr + count)
+ {
+ LWLockRelease(WALBufMappingLock);
+ return false;
+ }
+
+ offset = startptr % XLOG_BLCKSZ;
+
+ /*
+ * Acquire lock to prevent the other process from write WAL buffer anymore,
+ * then read
+ */
+ WALInsertLockAcquireExclusive();
+ memcpy(buf, TargetPage + offset, count);
+ WALInsertLockRelease();
+
+ LWLockRelease(WALBufMappingLock);
+ return true;
+}
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 5cf74e181a..6b55fbc880 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -33,6 +33,7 @@
#ifndef FRONTEND
#include "miscadmin.h"
#include "pgstat.h"
+#include "access/xlog.h"
#include "utils/memutils.h"
#endif
@@ -1054,9 +1055,6 @@ err:
*
* Returns true if succeeded, false if an error occurs, in which case
* 'errinfo' receives error details.
- *
- * XXX probably this should be improved to suck data directly from the
- * WAL buffers when possible.
*/
bool
WALRead(XLogReaderState *state,
@@ -1067,6 +1065,14 @@ WALRead(XLogReaderState *state,
XLogRecPtr recptr;
Size nbytes;
+#ifndef FRONTEND
+ /*
+ * If the data we want seems to be in one page, we first try to read from WAL buffer
+ */
+ if (count < (Size) XLOG_BLCKSZ && XLogReadBuffer(buf, startptr, count, tli))
+ return true;
+#endif
+
p = buf;
recptr = startptr;
nbytes = count;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index ccfcf43d62..328c380d71 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -369,6 +369,8 @@ extern void XLogRequestWalReceiverReply(void);
extern void assign_max_wal_size(int newval, void *extra);
extern void assign_checkpoint_completion_target(double newval, void *extra);
+extern bool XLogReadBuffer(char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
+
/*
* Routines to start, stop, and get status of a base backup.
*/
I noticed that the previous Read_from_walbuffer.patch has a mistake in xlogreader.c.
Could you please use the attached v2 patch?
The performance result of the previous mail is the result of v2 patch.
Regards,
Ryohei Takahashi
Attachments:
v2-Read_from_walbuffer.patchapplication/octet-stream; name=v2-Read_from_walbuffer.patchDownload
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c7c928f50b..72868779e9 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -12989,3 +12989,46 @@ XLogRequestWalReceiverReply(void)
{
doRequestWalReceiverReply = true;
}
+
+/*
+ * Read the data from WAL buffer
+ * If the page we want is already recycled, return false
+ */
+bool
+XLogReadBuffer(char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
+{
+ int idx;
+ uint32 offset;
+ XLogPageHeader TargetPage;
+
+ /*
+ * First, we should prevent AdvanceXLInsertBuffer
+ */
+ LWLockAcquire(WALBufMappingLock, LW_SHARED);
+
+ /*
+ * Check wheter the page we want is still on WAL buffer
+ */
+ idx = XLogRecPtrToBufIdx(startptr);
+ TargetPage = (XLogPageHeader) (XLogCtl->pages + idx * (Size) XLOG_BLCKSZ);
+
+ if (TargetPage->xlp_tli != tli || TargetPage->xlp_pageaddr > startptr
+ || TargetPage->xlp_pageaddr + (Size) XLOG_BLCKSZ < startptr + count)
+ {
+ LWLockRelease(WALBufMappingLock);
+ return false;
+ }
+
+ offset = startptr % XLOG_BLCKSZ;
+
+ /*
+ * Acquire lock to prevent the other process from write WAL buffer anymore,
+ * then read
+ */
+ WALInsertLockAcquireExclusive();
+ memcpy(buf, TargetPage + offset, count);
+ WALInsertLockRelease();
+
+ LWLockRelease(WALBufMappingLock);
+ return true;
+}
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 5cf74e181a..28d584409e 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -33,6 +33,7 @@
#ifndef FRONTEND
#include "miscadmin.h"
#include "pgstat.h"
+#include "access/xlog.h"
#include "utils/memutils.h"
#endif
@@ -1054,9 +1055,6 @@ err:
*
* Returns true if succeeded, false if an error occurs, in which case
* 'errinfo' receives error details.
- *
- * XXX probably this should be improved to suck data directly from the
- * WAL buffers when possible.
*/
bool
WALRead(XLogReaderState *state,
@@ -1067,6 +1065,14 @@ WALRead(XLogReaderState *state,
XLogRecPtr recptr;
Size nbytes;
+#ifndef FRONTEND
+ /*
+ * If the data we want seems to be in one page, we first try to read from WAL buffer
+ */
+ if (count <= (Size) XLOG_BLCKSZ && XLogReadBuffer(buf, startptr, count, tli))
+ return true;
+#endif
+
p = buf;
recptr = startptr;
nbytes = count;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index ccfcf43d62..328c380d71 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -369,6 +369,8 @@ extern void XLogRequestWalReceiverReply(void);
extern void assign_max_wal_size(int newval, void *extra);
extern void assign_checkpoint_completion_target(double newval, void *extra);
+extern bool XLogReadBuffer(char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
+
/*
* Routines to start, stop, and get status of a base backup.
*/
Hi,
I noticed that anti-virus software slow down the open().
I stopped the anti-virus software and re-run the test.
(Average of 10 times)
master: 1924tps
Hold_xlogreader.patch: 1993tps (+3.5%)
Read_from_walbuffer.patch: 1954tps(+1.5%)
Therefore, the effect of my patch is limited.
I'm sorry for the confusion.
Regards,
Ryohei Takahashi