From d8b13074e674a0b61c65e7d0bf61c3ff95df8f55 Mon Sep 17 00:00:00 2001 From: Melih Mutlu Date: Wed, 24 Jul 2024 17:19:07 +0300 Subject: [PATCH v1] Use pg_pwritev() in XLogWrite() XLogWrite() had to write() whenever it reached the last buffer in the circular WAL buffers before circling back to the first buffer, because the pages on either side of the wraparound are not contiguous in memory. Vectored I/O allows us to write pages that are not contiguous. This patch uses pg_pwritev() in XLogWrite() so that it can circle back to the first buffer without having to write() anything unless necessary. --- src/backend/access/transam/xlog.c | 44 ++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ee0fb0e28f..05d46b9d14 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2315,6 +2315,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) bool ispartialpage; bool last_iteration; bool finishing_seg; + bool full_cycle; int curridx; int npages; int startidx; @@ -2407,17 +2408,22 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) /* * Dump the set if this will be the last loop iteration, or if we are - * at the last page of the cache area (since the next page won't be - * contiguous in memory), or if we are at the end of the logfile - * segment. + * completed a full cycle in our circular wal buffers, or if we are at + * the end of the logfile segment. */ last_iteration = WriteRqst.Write <= LogwrtResult.Write; finishing_seg = !ispartialpage && (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size; + /* + * Reaching the buffer right before the start buffer means that we + * completed a full cycle in our circular wal buffers. 
+ */ + full_cycle = curridx == (startidx - 1); + if (last_iteration || - curridx == XLogCtl->XLogCacheBlck || + full_cycle || finishing_seg) { char *from; @@ -2425,9 +2431,35 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) Size nleft; ssize_t written; instr_time start; + struct iovec iov[2]; + int iovcnt; + + if (curridx < startidx) + { + Assert(curridx + 1 + XLogCtl->XLogCacheBlck - startidx + 1 == npages); + + /* + * From startidx to the end until the next page is not contiguous + * in memory anymore. + */ + iov[0].iov_base = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; + iov[0].iov_len = (XLogCtl->XLogCacheBlck - startidx + 1) * (Size) XLOG_BLCKSZ; + + /* From first wal buffer to the current idx */ + iov[1].iov_base = XLogCtl->pages; + iov[1].iov_len = (curridx + 1) * (Size) XLOG_BLCKSZ; + + iovcnt = 2; + } + else + { + /* Contiguous case */ + iov[0].iov_base = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;; + iov[0].iov_len = npages * (Size) XLOG_BLCKSZ; + iovcnt = 1; + } /* OK to write the page(s) */ - from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; nbytes = npages * (Size) XLOG_BLCKSZ; nleft = nbytes; do @@ -2441,7 +2473,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) INSTR_TIME_SET_ZERO(start); pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - written = pg_pwrite(openLogFile, from, nleft, startoffset); + written = pg_pwritev(openLogFile, iov, iovcnt, startoffset); pgstat_report_wait_end(); /* -- 2.34.1