A patch for xlog.c

Started by Matthew Kirkwood almost 25 years ago, 23 messages
#1Matthew Kirkwood
matthew@hairy.beasts.org

Hi,

Here is a patch against 7.1beta5 to use mmap(), and thus a
single write, to initialise xlogs. It may well improve
performance of this on platforms/filesystems which write
metadata synchronously.

It needs a configure test, but certainly builds and runs
OK.

It also wraps the file reopening in an "ifdef WIN32", since
it certainly isn't needed for UNIX-like platforms (which I
assume includes BeOS).

Matthew.

diff -ruN postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c postgresql-7.1beta5/src/backend/access/transam/xlog.c
--- postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c	Fri Feb 23 18:12:00 2001
+++ postgresql-7.1beta5/src/backend/access/transam/xlog.c	Sat Feb 24 15:23:41 2001
@@ -24,6 +24,10 @@
 #include <locale.h>
 #endif
+#ifdef	_HAVE_MMAP
+#include <sys/mman.h>
+#endif
+
 #include "access/transam.h"
 #include "access/xact.h"
 #include "catalog/catversion.h"
@@ -36,6 +40,7 @@
 #include "access/xlogutils.h"
 #include "utils/builtins.h"
 #include "utils/relcache.h"
+#include "utils/pfile.h"

#include "miscadmin.h"

@@ -53,6 +58,10 @@
StartUpID ThisStartUpID = 0;
XLogRecPtr RedoRecPtr;

+#ifdef	_HAVE_MMAP
+void		*zmmap = NULL;
+#endif
+
 int			XLOG_DEBUG = 0;

/* To read/update control file and create new log file */
@@ -955,7 +964,6 @@
{
char path[MAXPGPATH];
char tpath[MAXPGPATH];
- char zbuffer[BLCKSZ];
int fd;
int nbytes;

@@ -987,28 +995,36 @@
elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
logId, logSeg);

-	/*
-	 * Zero-fill the file.  We have to do this the hard way to ensure that
-	 * all the file space has really been allocated --- on platforms that
-	 * allow "holes" in files, just seeking to the end doesn't allocate
-	 * intermediate space.  This way, we know that we have all the space
-	 * and (after the fsync below) that all the indirect blocks are down
-	 * on disk.  Therefore, fdatasync(2) will be sufficient to sync future
-	 * writes to the log file.
-	 */
-	MemSet(zbuffer, 0, sizeof(zbuffer));
-	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
+#ifdef	_HAVE_MMAP
+	if (!zmmap || (write(fd, zmmap, XLogSegSize) != XLogSegSize))
+#endif
 	{
-		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
-			elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
-				 logId, logSeg);
+		/*
+	 	* Zero-fill the file.  We have to do this the hard way to ensure that
+	 	* all the file space has really been allocated --- on platforms that
+	 	* allow "holes" in files, just seeking to the end doesn't allocate
+	 	* intermediate space.  This way, we know that we have all the space
+	 	* and (after the fsync below) that all the indirect blocks are down
+	 	* on disk.  Therefore, fdatasync(2) will be sufficient to sync future
+	 	* writes to the log file.
+	 	*/
+		char		zbuffer[BLCKSZ];
+		MemSet(zbuffer, 0, sizeof(zbuffer));
+		for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
+		{
+			if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
+				elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
+				 	logId, logSeg);
+		}
 	}

if (pg_fsync(fd) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);

+#ifdef	WIN32
 	close(fd);
+#endif

/*
* Prefer link() to rename() here just to be sure that we don't overwrite
@@ -1026,10 +1042,12 @@
logId, logSeg);
#endif

+#ifdef WIN32
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
elog(STOP, "InitReopen(logfile %u seg %u) failed: %m",
logId, logSeg);
+#endif

 	return (fd);
 }
@@ -1255,11 +1273,8 @@
 	if (noBlck || readOff != (RecPtr->xrecoff % XLogSegSize) / BLCKSZ)
 	{
 		readOff = (RecPtr->xrecoff % XLogSegSize) / BLCKSZ;
-		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-				 readId, readSeg, readOff);
-		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
-			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
+		if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+			elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
 				 readId, readSeg, readOff);
 		if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
 		{
@@ -1415,19 +1430,13 @@
 		elog(LOG, "Formatting logfile %u seg %u block %u at offset %u",
 			 readId, readSeg, readOff, EndRecPtr.xrecoff % BLCKSZ);
 		readFile = XLogFileOpen(readId, readSeg, false);
-		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-				 readId, readSeg, readOff);
-		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
-			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
+		if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+			elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
 				 readId, readSeg, readOff);
 		memset(readBuf + EndRecPtr.xrecoff % BLCKSZ, 0,
 			   BLCKSZ - EndRecPtr.xrecoff % BLCKSZ);
-		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-				 readId, readSeg, readOff);
-		if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
-			elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %m",
+		if (pg_pwrite(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+			elog(STOP, "ReadRecord: pg_pwrite(logfile %u seg %u off %u) failed: %m",
 				 readId, readSeg, readOff);
 		readOff++;
 	}
@@ -1797,6 +1806,28 @@
 	return buf;
 }
+
+#ifdef	_HAVE_MMAP
+static void
+ZeroMapInit(void)
+{
+	int zfd;
+
+	zfd = BasicOpenFile("/dev/zero", O_RDONLY, 0);
+	if (zfd < 0) {
+		elog(LOG, "Can't open /dev/zero: %m");
+		return;
+	}
+	zmmap = mmap(NULL, XLogSegSize, PROT_READ, MAP_SHARED, zfd, 0);
+	if (!zmmap)
+		elog(LOG, "Can't mmap /dev/zero: %m");
+	close(zfd);
+}
+#else
+#define	ZeroMapInit()
+#endif
+
+
 /*
  * This func must be called ONCE on system startup
  */
@@ -1811,6 +1842,9 @@
 	char		buffer[_INTL_MAXLOGRECSZ + SizeOfXLogRecord];
 	elog(LOG, "starting up");
+
+	ZeroMapInit();
+
 	CritSectionCount++;

XLogCtl->xlblocks = (XLogRecPtr *) (((char *) XLogCtl) + sizeof(XLogCtlData));

#2Tom Lane
tgl@sss.pgh.pa.us
In reply to: Matthew Kirkwood (#1)
Re: A patch for xlog.c

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

Here is a patch against 7.1beta5 to use mmap(), and thus a
single write, to initialise xlogs. It may well improve
performance of this on platforms/filesystems which write
metadata synchronously.

Have you *demonstrated* any actual performance improvement from this?
How much? On what platforms?

I don't believe in adding unportable alternative implementations without
pretty darn compelling reasons ...

regards, tom lane

#3Matthew Kirkwood
matthew@hairy.beasts.org
In reply to: Tom Lane (#2)
1 attachment(s)
Re: A patch for xlog.c

On Sat, 24 Feb 2001, Tom Lane wrote:

Here is a patch against 7.1beta5 to use mmap(), and thus a
single write, to initialise xlogs. It may well improve
performance of this on platforms/filesystems which write
metadata synchronously.

Have you *demonstrated* any actual performance improvement from this?
How much? On what platforms?

Forgive me if I posted it to the wrong place -- I was far from
proposing this for inclusion. It is but a small step on the
way to my plan of mmap()ifying all of the WAL stuff (which may
also prove a waste of effort).

On Linux 2.4 w/asynchronous ext2, it's good for about 5%, which
certainly wouldn't alone be worth the effort. I tried synchronous
ext2, but the numbers were so poor with both that nobody who cared
about performance would be using it (1.2 sec per file, vs. over a
minute).

I don't have access to any kind of machine running UFS/FFS. Perhaps
someone on the list might do me the favour of trying the attached
test on such a platform with synchronous metadata writes (see top
of file for #ifdefs).

I don't believe in adding unportable alternative implementations
without pretty darn compelling reasons ...

mmap() is hardly unportable. From a quick look, all the current
names in include/port/ (which must surely make up a vast majority
of deployed recent postgresql versions) except QNX and Win32 can
support POSIX mmap.

Thanks for the reply,

Matthew.

Attachments:

writetest.c (text/plain; charset=US-ASCII; name=writetest.c) Download
#4Bruce Momjian
pgman@candle.pha.pa.us
In reply to: Matthew Kirkwood (#1)
Re: A patch for xlog.c

I am confused why mmap() is better than writing to a real file. Don't
we need to write to a real file so it is available for database
recovery?

Hi,

Here is a patch against 7.1beta5 to use mmap(), and thus a
single write, to initialise xlogs. It may well improve
performance of this on platforms/filesystems which write
metadata synchronously.

It needs a configure test, but certainly builds and runs
OK.

It also wraps the file reopening in an "ifdef WIN32", since
it certainly isn't needed for UNIX-like platforms (which I
assume includes BeOS).

Matthew.

diff -ruN postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c postgresql-7.1beta5/src/backend/access/transam/xlog.c
--- postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c	Fri Feb 23 18:12:00 2001
+++ postgresql-7.1beta5/src/backend/access/transam/xlog.c	Sat Feb 24 15:23:41 2001
@@ -24,6 +24,10 @@
#include <locale.h>
#endif
+#ifdef	_HAVE_MMAP
+#include <sys/mman.h>
+#endif
+
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/catversion.h"
@@ -36,6 +40,7 @@
#include "access/xlogutils.h"
#include "utils/builtins.h"
#include "utils/relcache.h"
+#include "utils/pfile.h"

#include "miscadmin.h"

@@ -53,6 +58,10 @@
StartUpID ThisStartUpID = 0;
XLogRecPtr RedoRecPtr;

+#ifdef	_HAVE_MMAP
+void		*zmmap = NULL;
+#endif
+
int			XLOG_DEBUG = 0;

/* To read/update control file and create new log file */
@@ -955,7 +964,6 @@
{
char path[MAXPGPATH];
char tpath[MAXPGPATH];
- char zbuffer[BLCKSZ];
int fd;
int nbytes;

@@ -987,28 +995,36 @@
elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
logId, logSeg);

-	/*
-	 * Zero-fill the file.  We have to do this the hard way to ensure that
-	 * all the file space has really been allocated --- on platforms that
-	 * allow "holes" in files, just seeking to the end doesn't allocate
-	 * intermediate space.  This way, we know that we have all the space
-	 * and (after the fsync below) that all the indirect blocks are down
-	 * on disk.  Therefore, fdatasync(2) will be sufficient to sync future
-	 * writes to the log file.
-	 */
-	MemSet(zbuffer, 0, sizeof(zbuffer));
-	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
+#ifdef	_HAVE_MMAP
+	if (!zmmap || (write(fd, zmmap, XLogSegSize) != XLogSegSize))
+#endif
{
-		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
-			elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
-				 logId, logSeg);
+		/*
+	 	* Zero-fill the file.  We have to do this the hard way to ensure that
+	 	* all the file space has really been allocated --- on platforms that
+	 	* allow "holes" in files, just seeking to the end doesn't allocate
+	 	* intermediate space.  This way, we know that we have all the space
+	 	* and (after the fsync below) that all the indirect blocks are down
+	 	* on disk.  Therefore, fdatasync(2) will be sufficient to sync future
+	 	* writes to the log file.
+	 	*/
+		char		zbuffer[BLCKSZ];
+		MemSet(zbuffer, 0, sizeof(zbuffer));
+		for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
+		{
+			if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
+				elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
+				 	logId, logSeg);
+		}
}

if (pg_fsync(fd) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);

+#ifdef	WIN32
close(fd);
+#endif

/*
* Prefer link() to rename() here just to be sure that we don't overwrite
@@ -1026,10 +1042,12 @@
logId, logSeg);
#endif

+#ifdef WIN32
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
elog(STOP, "InitReopen(logfile %u seg %u) failed: %m",
logId, logSeg);
+#endif

return (fd);
}
@@ -1255,11 +1273,8 @@
if (noBlck || readOff != (RecPtr->xrecoff % XLogSegSize) / BLCKSZ)
{
readOff = (RecPtr->xrecoff % XLogSegSize) / BLCKSZ;
-		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-				 readId, readSeg, readOff);
-		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
-			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
+		if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+			elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
readId, readSeg, readOff);
if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
{
@@ -1415,19 +1430,13 @@
elog(LOG, "Formatting logfile %u seg %u block %u at offset %u",
readId, readSeg, readOff, EndRecPtr.xrecoff % BLCKSZ);
readFile = XLogFileOpen(readId, readSeg, false);
-		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-				 readId, readSeg, readOff);
-		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
-			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
+		if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+			elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
readId, readSeg, readOff);
memset(readBuf + EndRecPtr.xrecoff % BLCKSZ, 0,
BLCKSZ - EndRecPtr.xrecoff % BLCKSZ);
-		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-				 readId, readSeg, readOff);
-		if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
-			elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %m",
+		if (pg_pwrite(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+			elog(STOP, "ReadRecord: pg_pwrite(logfile %u seg %u off %u) failed: %m",
readId, readSeg, readOff);
readOff++;
}
@@ -1797,6 +1806,28 @@
return buf;
}
+
+#ifdef	_HAVE_MMAP
+static void
+ZeroMapInit(void)
+{
+	int zfd;
+
+	zfd = BasicOpenFile("/dev/zero", O_RDONLY, 0);
+	if (zfd < 0) {
+		elog(LOG, "Can't open /dev/zero: %m");
+		return;
+	}
+	zmmap = mmap(NULL, XLogSegSize, PROT_READ, MAP_SHARED, zfd, 0);
+	if (!zmmap)
+		elog(LOG, "Can't mmap /dev/zero: %m");
+	close(zfd);
+}
+#else
+#define	ZeroMapInit()
+#endif
+
+
/*
* This func must be called ONCE on system startup
*/
@@ -1811,6 +1842,9 @@
char		buffer[_INTL_MAXLOGRECSZ + SizeOfXLogRecord];
elog(LOG, "starting up");
+
+	ZeroMapInit();
+
CritSectionCount++;

XLogCtl->xlblocks = (XLogRecPtr *) (((char *) XLogCtl) + sizeof(XLogCtlData));

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
#5Tom Lane
tgl@sss.pgh.pa.us
In reply to: Matthew Kirkwood (#3)
Re: A patch for xlog.c

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

Forgive me if I posted it to the wrong place -- I was far from
proposing this for inclusion.

Diffs posted to pgsql-patches are generally considered to be requests
for application of a patch. If this is only an experiment it had best
be clearly labeled as such.

It is but a small step on the way to my plan of mmap()ifying all of
the WAL stuff (which may also prove a waste of effort).

Very probably. What are your grounds for thinking that's a good idea?
I can't see any reason to think that mmap is more efficient than write
for simple sequential writes, which is what we need to do.

regards, tom lane

#6Matthew Kirkwood
matthew@hairy.beasts.org
In reply to: Bruce Momjian (#4)
Re: A patch for xlog.c

On Sat, 24 Feb 2001, Bruce Momjian wrote:

I am confused why mmap() is better than writing to a real file.

It isn't, except that it allows to initialise the logfile in
one syscall, without first allocating and zeroing (and hence
dirtying) 16Mb of memory.

Don't we need to write to a real file so it is available for database
recovery?

The mmap isn't used for the destination, but for the source;
it's just a cheap way to get your hands on 16Mb of zeroes.

Matthew.

#7Matthew Kirkwood
matthew@hairy.beasts.org
In reply to: Tom Lane (#5)
Re: A patch for xlog.c

On Sat, 24 Feb 2001, Tom Lane wrote:

Forgive me if I posted it to the wrong place -- I was far from
proposing this for inclusion.

Diffs posted to pgsql-patches are generally considered to be requests
for application of a patch. If this is only an experiment it had best
be clearly labeled as such.

OK. Is there a better place for discussion of such?

It is but a small step on the way to my plan of mmap()ifying all
of the WAL stuff (which may also prove a waste of effort).

Very probably. What are your grounds for thinking that's a good idea?
I can't see any reason to think that mmap is more efficient than write
for simple sequential writes, which is what we need to do.

Potential pros:

a. msync(MS_ASYNC) seems to be exactly
b. Potential to reduce contention
c. Removing syscalls is rarely a bad thing
d. Fewer copies, better cache behaviour

Potential cons:

a. Portability
b. A bad pointer can cause a scribble on the log

Matthew.

#8Tom Lane
tgl@sss.pgh.pa.us
In reply to: Matthew Kirkwood (#7)
Re: A patch for xlog.c

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

Diffs posted to pgsql-patches are generally considered to be requests
for application of a patch. If this is only an experiment it had best
be clearly labeled as such.

OK. Is there a better place for discussion of such?

pgsql-hackers is the place to discuss anything that's experimental or
otherwise concerned with future development.

[ possible merits of mmap ]

Let's take up that discussion in pghackers.

regards, tom lane

#9Bruce Momjian
pgman@candle.pha.pa.us
In reply to: Tom Lane (#8)
Re: A patch for xlog.c

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

Diffs posted to pgsql-patches are generally considered to be requests
for application of a patch. If this is only an experiment it had best
be clearly labeled as such.

OK. Is there a better place for discussion of such?

pgsql-hackers is the place to discuss anything that's experimental or
otherwise concerned with future development.

[ possible merits of mmap ]

Let's take up that discussion in pghackers.

I always felt the real benefit of mmap() would be to remove use of SysV
shared memory and use anon mmap() to prevent problems with SysV shared
memory limits.

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
#10Tom Lane
tgl@sss.pgh.pa.us
In reply to: Matthew Kirkwood (#6)
mmap for zeroing WAL log

[ redirected to pgsql-hackers instead of -patches ]

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

On Sat, 24 Feb 2001, Bruce Momjian wrote:

I am confused why mmap() is better than writing to a real file.

It isn't, except that it allows to initialise the logfile in
one syscall, without first allocating and zeroing (and hence
dirtying) 16Mb of memory.

Uh, the existing code does not zero 16Mb of memory... it zeroes
8K and then writes that block repeatedly. It's possible that the
overhead of a syscall for each 8K block is significant, but on the
other hand writing a block at a time is a heavily used and heavily
optimized path in all Unixen. It's at least as plausible that the
mmap-as-source-of-zeroes path will be slower!

I think this is worth looking into, but I'm very far from being
sold on it...

regards, tom lane

#11The Hermit Hacker
scrappy@hub.org
In reply to: Bruce Momjian (#9)
Re: A patch for xlog.c

On Sat, 24 Feb 2001, Bruce Momjian wrote:

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

Diffs posted to pgsql-patches are generally considered to be requests
for application of a patch. If this is only an experiment it had best
be clearly labeled as such.

OK. Is there a better place for discussion of such?

pgsql-hackers is the place to discuss anything that's experimental or
otherwise concerned with future development.

[ possible merits of mmap ]

Let's take up that discussion in pghackers.

I always felt the real benefit of mmap() would be to remove use of SysV
shared memory and use anon mmap() to prevent problems with SysV shared
memory limits.

You'll still have memory limits to overcome ... per user memory limits
being one ... there is no such thing as a 'cure-all' ...

#12Bruce Momjian
pgman@candle.pha.pa.us
In reply to: The Hermit Hacker (#11)
Re: A patch for xlog.c

pgsql-hackers is the place to discuss anything that's experimental or
otherwise concerned with future development.

[ possible merits of mmap ]

Let's take up that discussion in pghackers.

I always felt the real benefit of mmap() would be to remove use of SysV
shared memory and use anon mmap() to prevent problems with SysV shared
memory limits.

You'll still have memory limits to overcome ... per user memory limits
being one ... there is no such thing as a 'cure-all' ...

Yes, but typical SysV shared memory limits are much lower than
per-process limits.

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
#13The Hermit Hacker
scrappy@hub.org
In reply to: Bruce Momjian (#12)
Re: A patch for xlog.c

On Sun, 25 Feb 2001, Bruce Momjian wrote:

pgsql-hackers is the place to discuss anything that's experimental or
otherwise concerned with future development.

[ possible merits of mmap ]

Let's take up that discussion in pghackers.

I always felt the real benefit of mmap() would be to remove use of SysV
shared memory and use anon mmap() to prevent problems with SysV shared
memory limits.

You'll still have memory limits to overcome ... per user memory limits
being one ... there is no such thing as a 'cure-all' ...

Yes, but typical SysV shared memory limits are much lower than
per-process limits.

well, come up with suitable patches for v7.2 and we can see where it goes
... you seem to think mmap() will do what we require, but, so far, have
been unable to convince anyone to dedicate the time to converting to using
it. "having to raise/set SysV limits", IMHO, isn't worth the overhaul
that I see having to happen, but, if you can show us the benefits of doing
it other than removing a 'one time administrative config' of an OS, I
imagine that nobody will be able to argue it ...

#14Bruce Momjian
pgman@candle.pha.pa.us
In reply to: The Hermit Hacker (#13)
Re: A patch for xlog.c

Yes, but typical SysV shared memory limits are much lower than
per-process limits.

well, come up with suitable patches for v7.2 and we can see where it goes
... you seem to think mmap() will do what we require, but, so far, have
been unable to convince anyone to dedicate the time to converting to using
it. "having to raise/set SysV limits", IMHO, isn't worth the overhaul
that I see having to happen, but, if you can show us the benefits of doing
it other than removing a 'one time administrative config' of an OS, I
imagine that nobody will be able to argue it ...

Yea, it is pretty low priority, especially since most OS's don't support
ANON mmap(). Most BSD's support it, but I don't think Linux or others
do.

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
#15The Hermit Hacker
scrappy@hub.org
In reply to: Bruce Momjian (#14)
Re: A patch for xlog.c

On Sun, 25 Feb 2001, Bruce Momjian wrote:

Yes, but typical SysV shared memory limits are much lower than
per-process limits.

well, come up with suitable patches for v7.2 and we can see where it goes
... you seem to think mmap() will do what we require, but, so far, have
been unable to convince anyone to dedicate the time to converting to using
it. "having to raise/set SysV limits", IMHO, isn't worth the overhaul
that I see having to happen, but, if you can show us the benefits of doing
it other than removing a 'one time administrative config' of an OS, I
imagine that nobody will be able to argue it ...

Yea, it is pretty low priority, especially since most OS's don't support
ANON mmap(). Most BSD's support it, but I don't think Linux or others
do.

ah, then not a low priority, a non-starter, period ... maybe when all the
OSs we support move to supporting ANON mmap() :(

#16Bruce Momjian
pgman@candle.pha.pa.us
In reply to: The Hermit Hacker (#15)
Re: A patch for xlog.c

On Sun, 25 Feb 2001, Bruce Momjian wrote:

Yes, but typical SysV shared memory limits are much lower than
per-process limits.

well, come up with suitable patches for v7.2 and we can see where it goes
... you seem to think mmap() will do what we require, but, so far, have
been unable to convince anyone to dedicate the time to converting to using
it. "having to raise/set SysV limits", IMHO, isn't worth the overhaul
that I see having to happen, but, if you can show us the benefits of doing
it other than removing a 'one time administrative config' of an OS, I
imagine that nobody will be able to argue it ...

Yea, it is pretty low priority, especially since most OS's don't support
ANON mmap(). Most BSD's support it, but I don't think Linux or others
do.

ah, then not a low priority, a non-starter, period ... maybe when all the
OSs we support move to supporting ANON mmap() :(

Yea, we would have to take a poll to see if the majority support it.
Right now, I think it is clearly a minority, and not worth the added
confusion for a few platforms.

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
#17Peter Eisentraut
peter_e@gmx.net
In reply to: The Hermit Hacker (#15)
Re: A patch for xlog.c

The Hermit Hacker writes:

Yea, it is pretty low priority, especially since most OS's don't support
ANON mmap(). Most BSD's support it, but I don't think Linux or others
do.

ah, then not a low priority, a non-starter, period ... maybe when all the
OSs we support move to supporting ANON mmap() :(

It would be worthwhile for those operating systems that don't have SysV
shared memory but do have mmap(). But I don't have one of those, so I
ain't gonna do it. ;-)

--
Peter Eisentraut peter_e@gmx.net http://yi.org/peter-e/

#18Bruce Momjian
pgman@candle.pha.pa.us
In reply to: Peter Eisentraut (#17)
Re: A patch for xlog.c

The Hermit Hacker writes:

Yea, it is pretty low priority, especially since most OS's don't support
ANON mmap(). Most BSD's support it, but I don't think Linux or others
do.

ah, then not a low priority, a non-starter, period ... maybe when all the
OSs we support move to supporting ANON mmap() :(

It would be worthwhile for those operating systems that don't have SysV
shared memory but do have mmap(). But I don't have one of those, so I
ain't gonna do it. ;-)

All have SysV memory. mmap() usage is only useful in enabling larger
buffers without kernel changes.

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
#19Peter Eisentraut
peter_e@gmx.net
In reply to: Bruce Momjian (#18)
Re: A patch for xlog.c

Bruce Momjian writes:

All have SysV memory.

All that we currently support...

--
Peter Eisentraut peter_e@gmx.net http://yi.org/peter-e/

#20Matthew Kirkwood
matthew@hairy.beasts.org
In reply to: Bruce Momjian (#18)
Re: A patch for xlog.c

On Tue, 27 Feb 2001, Bruce Momjian wrote:

mmap() usage is only useful in enabling larger
buffers without kernel changes.

My plan was not to replace the shared buffer pool with an
mmap()ed area, but rather to use mmap() on the data files
themselves to eliminate it.

Clearly this is rather controversial, since it may have
safety implications, but it should allow the kernel better
to choose what to cache.

Matthew.

#21Matthew Kirkwood
matthew@hairy.beasts.org
In reply to: Tom Lane (#10)
Re: mmap for zeroing WAL log

On Sat, 24 Feb 2001, Tom Lane wrote:

I am confused why mmap() is better than writing to a real file.

It isn't, except that it allows to initialise the logfile in
one syscall, without first allocating and zeroing (and hence
dirtying) 16Mb of memory.

Uh, the existing code does not zero 16Mb of memory... it zeroes
8K and then writes that block repeatedly.

See the "one syscall" bit above.

It's possible that the overhead of a syscall for each 8K block is
significant,

I had assumed that the overhead would come from synchronous
metadata incurring writes of at least the inode, block bitmap
and probably an indirect block for each syscall.

but on the other hand writing a block at a time is a heavily used and
heavily optimized path in all Unixen. It's at least as plausible that
the mmap-as-source-of-zeroes path will be slower!

Results:

On Linux/ext2, it appears good for a gain of 3-5% for log
creations (via a fairly minimal test program).

On FreeBSD 4.1-RELEASE/ffs (with all of sync/async/softupdates)
it is a couple of percent worse in elapsed time, but consumes
around a third more system CPU time (12sec vs 9sec on one test
system).

I am awaiting numbers from reiserfs but, for now, it looks like
I am far from vindicated.

Matthew.

#22Tom Lane
tgl@sss.pgh.pa.us
In reply to: Matthew Kirkwood (#21)
Re: mmap for zeroing WAL log

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

I had assumed that the overhead would come from synchronous
metadata incurring writes of at least the inode, block bitmap
and probably an indirect block for each syscall.

No Unix that I've ever heard of forces metadata to disk after each
"write" call; anyone who tried it would have abysmal performance.
That's what fsync and the syncer daemon are for.

regards, tom lane

#23Matthew Kirkwood
matthew@hairy.beasts.org
In reply to: Tom Lane (#22)
Re: mmap for zeroing WAL log

On Tue, 27 Feb 2001, Tom Lane wrote:

Matthew Kirkwood <matthew@hairy.beasts.org> writes:

I had assumed that the overhead would come from synchronous
metadata incurring writes of at least the inode, block bitmap
and probably an indirect block for each syscall.

No Unix that I've ever heard of forces metadata to disk after each
"write" call; anyone who tried it would have abysmal performance.
That's what fsync and the syncer daemon are for.

My understanding was that that's exactly what ffs' synchronous
metadata writes do.

Am I missing something here? Do they just schedule I/O, but
return without waiting for its completion?

Matthew.