Removing a few more lseek() calls
Hello,
Patch 0001 gets rid of the unconditional lseek() calls for SLRU I/O,
as a small follow-up to commit c24dcd0c. Patch 0002 gets rid of a few
places that usually do a good job of avoiding lseek() calls while
reading and writing WAL, but it seems better to have no code at all.
--
Thomas Munro
https://enterprisedb.com
Attachments:
0001-Use-pg_pread-and-pg_pwrite-in-slru.c.patchtext/x-patch; charset=US-ASCII; name=0001-Use-pg_pread-and-pg_pwrite-in-slru.c.patchDownload
From 68c1ec0531f802d39c80b8fa5c310c3c70d795d9 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 30 Mar 2019 17:04:08 +1300
Subject: [PATCH 1/2] Use pg_pread() and pg_pwrite() in slru.c.
This avoids lseek() system calls at every SLRU I/O, as was
done for relation files in commit c24dcd0c.
Author: Thomas Munro
Discussion:
---
src/backend/access/transam/slru.c | 25 ++++---------------------
1 file changed, 4 insertions(+), 21 deletions(-)
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 974d42fc86..6527cfce3b 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -647,7 +647,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
SlruShared shared = ctl->shared;
int segno = pageno / SLRU_PAGES_PER_SEGMENT;
int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
- int offset = rpageno * BLCKSZ;
+ off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd;
@@ -677,17 +677,9 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
return true;
}
- if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
- {
- slru_errcause = SLRU_SEEK_FAILED;
- slru_errno = errno;
- CloseTransientFile(fd);
- return false;
- }
-
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
- if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
+ if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
{
pgstat_report_wait_end();
slru_errcause = SLRU_READ_FAILED;
@@ -727,7 +719,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
SlruShared shared = ctl->shared;
int segno = pageno / SLRU_PAGES_PER_SEGMENT;
int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
- int offset = rpageno * BLCKSZ;
+ off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
@@ -837,18 +829,9 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
}
}
- if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
- {
- slru_errcause = SLRU_SEEK_FAILED;
- slru_errno = errno;
- if (!fdata)
- CloseTransientFile(fd);
- return false;
- }
-
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
- if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
+ if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
{
pgstat_report_wait_end();
/* if write didn't set errno, assume problem is no disk space */
--
2.21.0
0002-Use-pg_pread-and-pg_pwrite-in-various-places.patchtext/x-patch; charset=US-ASCII; name=0002-Use-pg_pread-and-pg_pwrite-in-various-places.patchDownload
From 446007aed7fb1bafb125eaca5569f62acdf41069 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 30 Mar 2019 17:33:30 +1300
Subject: [PATCH 2/2] Use pg_pread() and pg_pwrite() in various places.
Remove several copies of some code that usually does a good job of
avoiding lseek() calls, but it seems better to have no code at all.
Also remove a simple case of an unconditional lseek() call.
Author: Thomas Munro
Discussion:
---
src/backend/access/heap/rewriteheap.c | 9 +--------
src/backend/access/transam/xlogutils.c | 25 ++-----------------------
src/backend/replication/walreceiver.c | 21 +++------------------
src/backend/replication/walsender.c | 23 ++++-------------------
src/bin/pg_waldump/pg_waldump.c | 25 +++----------------------
5 files changed, 13 insertions(+), 90 deletions(-)
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index bce4274362..c121e44719 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -1165,13 +1165,6 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
path, (uint32) xlrec->offset)));
pgstat_report_wait_end();
- /* now seek to the position we want to write our data to */
- if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek to end of file \"%s\": %m",
- path)));
-
data = XLogRecGetData(r) + sizeof(*xlrec);
len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData);
@@ -1179,7 +1172,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
/* write out tail end of mapping file (again) */
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE);
- if (write(fd, data, len) != len)
+ if (pg_pwrite(fd, data, len, xlrec->offset) != len)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 10a663bae6..a63bfed3a6 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -664,7 +664,6 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr,
static int sendFile = -1;
static XLogSegNo sendSegNo = 0;
static TimeLineID sendTLI = 0;
- static uint32 sendOff = 0;
Assert(segsize == wal_segment_size);
@@ -708,28 +707,9 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr,
errmsg("could not open file \"%s\": %m",
path)));
}
- sendOff = 0;
sendTLI = tli;
}
- /* Need to seek in the file? */
- if (sendOff != startoff)
- {
- if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
- {
- char path[MAXPGPATH];
- int save_errno = errno;
-
- XLogFilePath(path, tli, sendSegNo, segsize);
- errno = save_errno;
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek in log segment %s to offset %u: %m",
- path, startoff)));
- }
- sendOff = startoff;
- }
-
/* How many bytes are within this segment? */
if (nbytes > (segsize - startoff))
segbytes = segsize - startoff;
@@ -737,7 +717,7 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr,
segbytes = nbytes;
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
- readbytes = read(sendFile, p, segbytes);
+ readbytes = pg_pread(sendFile, p, segbytes, (off_t) startoff);
pgstat_report_wait_end();
if (readbytes <= 0)
{
@@ -749,13 +729,12 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr,
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read from log segment %s, offset %u, length %lu: %m",
- path, sendOff, (unsigned long) segbytes)));
+ path, startoff, (unsigned long) segbytes)));
}
/* Update state for read */
recptr += readbytes;
- sendOff += readbytes;
nbytes -= readbytes;
p += readbytes;
}
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index f32cf91ffb..d2c0f964d0 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -83,14 +83,13 @@ WalReceiverFunctionsType *WalReceiverFunctions = NULL;
#define NAPTIME_PER_CYCLE 100 /* max sleep time between cycles (100ms) */
/*
- * These variables are used similarly to openLogFile/SegNo/Off,
+ * These variables are used similarly to openLogFile/SegNo,
* but for walreceiver to write the XLOG. recvFileTLI is the TimeLineID
* corresponding the filename of recvFile.
*/
static int recvFile = -1;
static TimeLineID recvFileTLI = 0;
static XLogSegNo recvSegNo = 0;
-static uint32 recvOff = 0;
/*
* Flags set by interrupt handlers of walreceiver for later service in the
@@ -971,7 +970,6 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
use_existent = true;
recvFile = XLogFileInit(recvSegNo, &use_existent, true);
recvFileTLI = ThisTimeLineID;
- recvOff = 0;
}
/* Calculate the start offset of the received logs */
@@ -982,22 +980,10 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
else
segbytes = nbytes;
- /* Need to seek in the file? */
- if (recvOff != startoff)
- {
- if (lseek(recvFile, (off_t) startoff, SEEK_SET) < 0)
- ereport(PANIC,
- (errcode_for_file_access(),
- errmsg("could not seek in log segment %s to offset %u: %m",
- XLogFileNameP(recvFileTLI, recvSegNo),
- startoff)));
- recvOff = startoff;
- }
-
/* OK to write the logs */
errno = 0;
- byteswritten = write(recvFile, buf, segbytes);
+ byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff);
if (byteswritten <= 0)
{
/* if write didn't set errno, assume no disk space */
@@ -1008,13 +994,12 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
errmsg("could not write to log segment %s "
"at offset %u, length %lu: %m",
XLogFileNameP(recvFileTLI, recvSegNo),
- recvOff, (unsigned long) segbytes)));
+ startoff, (unsigned long) segbytes)));
}
/* Update state for write */
recptr += byteswritten;
- recvOff += byteswritten;
nbytes -= byteswritten;
buf += byteswritten;
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 21f5c868f1..bc6d4bc500 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -129,12 +129,11 @@ bool log_replication_commands = false;
bool wake_wal_senders = false;
/*
- * These variables are used similarly to openLogFile/SegNo/Off,
+ * These variables are used similarly to openLogFile/SegNo,
* but for walsender to read the XLOG.
*/
static int sendFile = -1;
static XLogSegNo sendSegNo = 0;
-static uint32 sendOff = 0;
/* Timeline ID of the currently open file */
static TimeLineID curFileTimeLine = 0;
@@ -2441,19 +2440,6 @@ retry:
errmsg("could not open file \"%s\": %m",
path)));
}
- sendOff = 0;
- }
-
- /* Need to seek in the file? */
- if (sendOff != startoff)
- {
- if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek in log segment %s to offset %u: %m",
- XLogFileNameP(curFileTimeLine, sendSegNo),
- startoff)));
- sendOff = startoff;
}
/* How many bytes are within this segment? */
@@ -2463,7 +2449,7 @@ retry:
segbytes = nbytes;
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
- readbytes = read(sendFile, p, segbytes);
+ readbytes = pg_pread(sendFile, p, segbytes, (off_t) startoff);
pgstat_report_wait_end();
if (readbytes < 0)
{
@@ -2471,7 +2457,7 @@ retry:
(errcode_for_file_access(),
errmsg("could not read from log segment %s, offset %u, length %zu: %m",
XLogFileNameP(curFileTimeLine, sendSegNo),
- sendOff, (Size) segbytes)));
+ startoff, (Size) segbytes)));
}
else if (readbytes == 0)
{
@@ -2479,13 +2465,12 @@ retry:
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("could not read from log segment %s, offset %u: read %d of %zu",
XLogFileNameP(curFileTimeLine, sendSegNo),
- sendOff, readbytes, (Size) segbytes)));
+ startoff, readbytes, (Size) segbytes)));
}
/* Update state for read */
recptr += readbytes;
- sendOff += readbytes;
nbytes -= readbytes;
p += readbytes;
}
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index 1e5379eb3e..35216b4550 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -329,7 +329,6 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id,
static int sendFile = -1;
static XLogSegNo sendSegNo = 0;
- static uint32 sendOff = 0;
p = buf;
recptr = startptr;
@@ -384,23 +383,6 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id,
if (sendFile < 0)
fatal_error("could not find file \"%s\": %s",
fname, strerror(errno));
- sendOff = 0;
- }
-
- /* Need to seek in the file? */
- if (sendOff != startoff)
- {
- if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
- {
- int err = errno;
- char fname[MAXPGPATH];
-
- XLogFileName(fname, timeline_id, sendSegNo, WalSegSz);
-
- fatal_error("could not seek in log file %s to offset %u: %s",
- fname, startoff, strerror(err));
- }
- sendOff = startoff;
}
/* How many bytes are within this segment? */
@@ -409,7 +391,7 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id,
else
segbytes = nbytes;
- readbytes = read(sendFile, p, segbytes);
+ readbytes = pg_pread(sendFile, p, segbytes, (off_t) startoff);
if (readbytes <= 0)
{
int err = errno;
@@ -421,16 +403,15 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id,
if (readbytes < 0)
fatal_error("could not read from log file %s, offset %u, length %d: %s",
- fname, sendOff, segbytes, strerror(err));
+ fname, startoff, segbytes, strerror(err));
else if (readbytes == 0)
fatal_error("could not read from log file %s, offset %u: read %d of %zu",
- fname, sendOff, readbytes, (Size) segbytes);
+ fname, startoff, readbytes, (Size) segbytes);
}
/* Update state for read */
recptr += readbytes;
- sendOff += readbytes;
nbytes -= readbytes;
p += readbytes;
}
--
2.21.0
On Sat, Mar 30, 2019 at 2:14 AM Thomas Munro <thomas.munro@gmail.com> wrote:
Hello,
Patch 0001 gets rid of the unconditional lseek() calls for SLRU I/O,
as a small follow-up to commit c24dcd0c. Patch 0002 gets rid of a few
places that usually do a good job of avoiding lseek() calls while
reading and writing WAL, but it seems better to have no code at all.
I reviewed the changes and they look good to me. Code looks much cleaner
after 2nd patch.
After these changes, only one usage of SLRU_SEEK_FAILED remains in
SimpleLruDoesPhysicalPageExist().