From 79a42dcc29592d25d84aa2d20e72994d238bf9fe Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 7 Jun 2021 14:51:09 +0000 Subject: [PATCH v1 1/1] wal segment pre-allocation --- doc/src/sgml/config.sgml | 23 ++ src/backend/access/transam/xlog.c | 228 ++++++++++--------- src/backend/bootstrap/bootstrap.c | 8 + src/backend/postmaster/Makefile | 1 + src/backend/postmaster/postmaster.c | 44 ++++ src/backend/postmaster/wal_allocator.c | 315 ++++++++++++++++++++++++++ src/backend/replication/basebackup.c | 6 +- src/backend/storage/file/fd.c | 115 ++++++++++ src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/lmgr/proc.c | 1 + src/backend/utils/activity/wait_event.c | 3 + src/backend/utils/init/miscinit.c | 3 + src/backend/utils/misc/guc.c | 12 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/bin/initdb/initdb.c | 1 + src/bin/pg_basebackup/pg_basebackup.c | 19 +- src/bin/pg_basebackup/t/010_pg_basebackup.pl | 4 +- src/bin/pg_resetwal/pg_resetwal.c | 48 ++++ src/include/access/xlog.h | 5 + src/include/miscadmin.h | 3 + src/include/postmaster/wal_allocator.h | 21 ++ src/include/storage/fd.h | 1 + src/include/storage/proc.h | 2 + src/include/utils/wait_event.h | 3 +- 24 files changed, 761 insertions(+), 107 deletions(-) create mode 100644 src/backend/postmaster/wal_allocator.c create mode 100644 src/include/postmaster/wal_allocator.h diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index d8c0fd3315..df30d9411f 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3186,6 +3186,29 @@ include_dir 'conf.d' + + wal_allocator_max_size (integer) + + wal_allocator_max_size configuration parameter + + + + + The maximum amount of WAL to pre-allocate for use when a new WAL segment + must be initialized. Setting this parameter to a size less than the WAL + segment size disables this behavior. The default value is 64 megabytes + (64MB). 
This parameter can only be set in the + postgresql.conf file or on the server command line. + + + + WAL pre-allocation may improve performance in scenarios where new WAL + segments must be created (e.g., during checkpoints when WAL segments + cannot be recycled). + + + + wal_buffers (integer) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 441a9124cd..b8fa237412 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -51,6 +51,7 @@ #include "port/pg_iovec.h" #include "postmaster/bgwriter.h" #include "postmaster/startup.h" +#include "postmaster/wal_allocator.h" #include "postmaster/walwriter.h" #include "replication/basebackup.h" #include "replication/logical.h" @@ -732,6 +733,13 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* + * number of pre-allocated WAL segments in pg_wal/preallocated_segments + * + * Protected by WALPreallocationLock. + */ + int num_prealloc_segs; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -3283,11 +3291,10 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; - PGAlignedXLogBlock zbuffer; XLogSegNo installed_segno; XLogSegNo max_segno; int fd; - int save_errno; + bool found = false; XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size); @@ -3309,114 +3316,44 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) } /* - * Initialize an empty (all zeroes) segment. NOTE: it is possible that - * another process is doing the same thing. If so, we will end up - * pre-creating an extra log segment. That seems OK, and better than - * holding the lock throughout this lengthy process. + * Try to use a pre-allocated segment, if one exists. If none are + * available, we fall back to creating a new segment on our own. 
+ * + * Note that we still look for a pre-allocated segment even if the pre- + * allocation functionality is disabled via the GUCs. This ensures that any + * pre-allocated segments left over after turning off the pre-allocation + * functionality are eventually used up. */ - elog(DEBUG2, "creating and filling new WAL file"); - - snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); - - unlink(tmppath); - - /* do not use get_sync_bit() here --- want to fsync only at end of fill */ - fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); - if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", tmppath))); - - memset(zbuffer.data, 0, XLOG_BLCKSZ); - - pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); - save_errno = 0; - if (wal_init_zero) + LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE); + if (XLogCtl->num_prealloc_segs > 0) { - struct iovec iov[PG_IOV_MAX]; - int blocks; - - /* - * Zero-fill the file. With this setting, we do this the hard way to - * ensure that all the file space has really been allocated. On - * platforms that allow "holes" in files, just seeking to the end - * doesn't allocate intermediate space. This way, we know that we - * have all the space and (after the fsync below) that all the - * indirect blocks are down on disk. Therefore, fdatasync(2) or - * O_DSYNC will be sufficient to sync future writes to the log file. - */ - - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (int i = 0; i < lengthof(iov); ++i) - { - iov[i].iov_base = zbuffer.data; - iov[i].iov_len = XLOG_BLCKSZ; - } - - /* Loop, writing as many blocks as we can for each system call. 
*/ - blocks = wal_segment_size / XLOG_BLCKSZ; - for (int i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * XLOG_BLCKSZ; - - if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) - { - save_errno = errno; - break; - } + elog(DEBUG2, "using pre-allocated WAL file"); - i += iovcnt; - } + found = true; + XLogCtl->num_prealloc_segs--; + snprintf(tmppath, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d", + XLOGDIR, XLogCtl->num_prealloc_segs); } - else + + if (!found) { /* - * Otherwise, seeking to the end and writing a solitary byte is - * enough. + * We're not using a pre-allocated segment, so there's no need to keep + * holding the WALPreallocationLock. */ - errno = 0; - if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) - { - /* if write didn't set errno, assume no disk space */ - save_errno = errno ? errno : ENOSPC; - } - } - pgstat_report_wait_end(); + LWLockRelease(WALPreallocationLock); - if (save_errno) - { /* - * If we fail to make the file, delete it to release disk space + * Initialize an empty (all zeroes) segment. NOTE: it is possible that + * another process is doing the same thing. If so, we will end up + * pre-creating an extra log segment. That seems OK, and better than + * holding the lock throughout this lengthy process. 
*/ - unlink(tmppath); - - close(fd); + elog(DEBUG2, "creating and filling new WAL file"); - errno = save_errno; - - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", tmppath))); - } - - pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); - if (pg_fsync(fd) != 0) - { - int save_errno = errno; - - close(fd); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", tmppath))); + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + CreateEmptyWalSegment(tmppath); } - pgstat_report_wait_end(); - - if (close(fd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", tmppath))); /* * Now move the segment into place with its final name. @@ -3450,6 +3387,20 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) unlink(tmppath); } + /* + * If we are using a pre-allocated segment, we've been holding onto the + * WALPreallocationLock all this time so that the WAL Allocator process + * can't overwrite the file before we've installed it. + * + * While we're at it, also try to wake up the WAL allocator early so that it + * can get a jump start on allocating new segments if possible. + * + * XXX: Check that this works for all the failure scenarios. + */ + if (found) + LWLockRelease(WALPreallocationLock); + RequestWalPreallocation(); + /* Set flag to tell caller there was no existent file */ *use_existent = false; @@ -3460,7 +3411,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); - elog(DEBUG2, "done creating and filling new WAL file"); + elog(DEBUG2, "done installing new WAL file"); return fd; } @@ -4280,8 +4231,8 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, } /* - * Verify whether pg_wal and pg_wal/archive_status exist. - * If the latter does not exist, recreate it. 
+ * Verify whether pg_wal, pg_wal/archive_status and pg_wal/preallocated_segments + * exist. If either of the latter two does not exist, recreate it. * * It is not the goal of this function to verify the contents of these * directories, but to help in cases where someone has performed a cluster @@ -4324,6 +4275,26 @@ ValidateXLOGDirectoryStructure(void) (errmsg("could not create missing directory \"%s\": %m", path))); } + + /* Check for preallocated_segments */ + snprintf(path, MAXPGPATH, XLOGDIR "/preallocated_segments"); + if (stat(path, &stat_buf) == 0) + { + /* Check for weird cases where it exists but isn't a directory */ + if (!S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + path))); + } + else + { + ereport(LOG, + (errmsg("creating missing WAL directory \"%s\"", path))); + if (MakePGDirectory(path) < 0) + ereport(FATAL, + (errmsg("could not create missing directory \"%s\": %m", + path))); + } } /* @@ -5230,6 +5201,9 @@ XLOGShmemInit(void) SpinLockInit(&XLogCtl->ulsn_lck); InitSharedLatch(&XLogCtl->recoveryWakeupLatch); ConditionVariableInit(&XLogCtl->recoveryNotPausedCV); + + /* protected by WALPreallocationLock */ + XLogCtl->num_prealloc_segs = 0; } /* @@ -12929,3 +12903,55 @@ XLogRequestWalReceiverReply(void) { doRequestWalReceiverReply = true; } + +int +GetNumPreallocatedWalSegs(void) +{ + int ret; + + LWLockAcquire(WALPreallocationLock, LW_SHARED); + ret = XLogCtl->num_prealloc_segs; + LWLockRelease(WALPreallocationLock); + + return ret; +} + +void +SetNumPreallocatedWalSegs(int i) +{ + LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE); + XLogCtl->num_prealloc_segs = i; + LWLockRelease(WALPreallocationLock); +} + +/* + * InstallPreallocatedWalSeg + * + * Renames the file at "path" to the next open pre-allocated segment slot and + * bumps up XLogCtl->num_prealloc_segs. 
+ */ +void +InstallPreallocatedWalSeg(const char *path) +{ + char newpath[MAXPGPATH]; + + LWLockAcquire(WALPreallocationLock, LW_EXCLUSIVE); + + snprintf(newpath, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d", + XLOGDIR, XLogCtl->num_prealloc_segs); + (void) durable_rename(path, newpath, ERROR); + XLogCtl->num_prealloc_segs++; + + LWLockRelease(WALPreallocationLock); +} + +/* + * Request that the WAL allocator wake up early. + */ +void +RequestWalPreallocation(void) +{ + if (ProcGlobal->walAllocatorLatch && + WalPreallocationEnabled()) + SetLatch(ProcGlobal->walAllocatorLatch); +} diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 94ab5ca095..56132337b0 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -36,6 +36,7 @@ #include "pgstat.h" #include "postmaster/bgwriter.h" #include "postmaster/startup.h" +#include "postmaster/wal_allocator.h" #include "postmaster/walwriter.h" #include "replication/walreceiver.h" #include "storage/bufmgr.h" @@ -333,6 +334,9 @@ AuxiliaryProcessMain(int argc, char *argv[]) case WalReceiverProcess: MyBackendType = B_WAL_RECEIVER; break; + case WalAllocatorProcess: + MyBackendType = B_WAL_ALLOCATOR; + break; default: MyBackendType = B_INVALID; } @@ -468,6 +472,10 @@ AuxiliaryProcessMain(int argc, char *argv[]) WalReceiverMain(); proc_exit(1); + case WalAllocatorProcess: + WalAllocatorMain(); + proc_exit(1); + default: elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType); proc_exit(1); diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index bfdf6a833d..4bac1e640f 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -24,6 +24,7 @@ OBJS = \ postmaster.o \ startup.o \ syslogger.o \ + wal_allocator.o \ walwriter.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 5a050898fe..d67de4b4a9 100644 --- 
a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -251,6 +251,7 @@ static pid_t StartupPID = 0, CheckpointerPID = 0, WalWriterPID = 0, WalReceiverPID = 0, + WalAllocatorPID = 0, AutoVacPID = 0, PgArchPID = 0, PgStatPID = 0, @@ -557,6 +558,7 @@ static void ShmemBackendArrayRemove(Backend *bn); #define StartCheckpointer() StartChildProcess(CheckpointerProcess) #define StartWalWriter() StartChildProcess(WalWriterProcess) #define StartWalReceiver() StartChildProcess(WalReceiverProcess) +#define StartWalAllocator() StartChildProcess(WalAllocatorProcess) /* Macros to check exit status of a child process */ #define EXIT_STATUS_0(st) ((st) == 0) @@ -1781,6 +1783,12 @@ ServerLoop(void) if (WalWriterPID == 0 && pmState == PM_RUN) WalWriterPID = StartWalWriter(); + /* + * If we have lost the WAL allocator process, try to start a new one. + */ + if (WalAllocatorPID == 0 && pmState == PM_RUN) + WalAllocatorPID = StartWalAllocator(); + /* * If we have lost the autovacuum launcher, try to start a new one. We * don't want autovacuum to run in binary upgrade mode because @@ -2696,6 +2704,8 @@ SIGHUP_handler(SIGNAL_ARGS) signal_child(WalWriterPID, SIGHUP); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGHUP); + if (WalAllocatorPID != 0) + signal_child(WalAllocatorPID, SIGHUP); if (AutoVacPID != 0) signal_child(AutoVacPID, SIGHUP); if (PgArchPID != 0) @@ -3023,6 +3033,8 @@ reaper(SIGNAL_ARGS) BgWriterPID = StartBackgroundWriter(); if (WalWriterPID == 0) WalWriterPID = StartWalWriter(); + if (WalAllocatorPID == 0) + WalAllocatorPID = StartWalAllocator(); /* * Likewise, start other special children as needed. In a restart @@ -3150,6 +3162,20 @@ reaper(SIGNAL_ARGS) continue; } + /* + * Was it the WAL allocator? Normal exit can be ignored; we'll start a + * new one at the next iteration of the postmaster's main loop, if + * necessary. Any other exit condition is treated as a crash. 
+ */ + if (pid == WalAllocatorPID) + { + WalAllocatorPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("WAL Allocator process")); + continue; + } + /* * Was it the autovacuum launcher? Normal exit can be ignored; we'll * start a new one at the next iteration of the postmaster's main @@ -3623,6 +3649,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT)); } + /* Take care of the WAL allocator too */ + if (pid == WalAllocatorPID) + WalAllocatorPID = 0; + else if (WalAllocatorPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) WalAllocatorPID))); + signal_child(WalAllocatorPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + /* Take care of the autovacuum launcher too */ if (pid == AutoVacPID) AutoVacPID = 0; @@ -3810,6 +3848,8 @@ PostmasterStateMachine(void) signal_child(StartupPID, SIGTERM); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGTERM); + if (WalAllocatorPID != 0) + signal_child(WalAllocatorPID, SIGTERM); /* checkpointer, archiver, stats, and syslogger may continue for now */ /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ @@ -3840,6 +3880,7 @@ PostmasterStateMachine(void) (CheckpointerPID == 0 || (!FatalError && Shutdown < ImmediateShutdown)) && WalWriterPID == 0 && + WalAllocatorPID == 0 && AutoVacPID == 0) { if (Shutdown >= ImmediateShutdown || FatalError) @@ -3933,6 +3974,7 @@ PostmasterStateMachine(void) Assert(BgWriterPID == 0); Assert(CheckpointerPID == 0); Assert(WalWriterPID == 0); + Assert(WalAllocatorPID == 0); Assert(AutoVacPID == 0); Assert(PgArchPID == 0); /* syslogger is not considered here */ @@ -4142,6 +4184,8 @@ TerminateChildren(int signal) signal_child(WalWriterPID, signal); if (WalReceiverPID != 0) signal_child(WalReceiverPID, signal); + if (WalAllocatorPID != 0) + signal_child(WalAllocatorPID, signal); if (AutoVacPID != 
0) signal_child(AutoVacPID, signal); if (PgArchPID != 0) diff --git a/src/backend/postmaster/wal_allocator.c b/src/backend/postmaster/wal_allocator.c new file mode 100644 index 0000000000..38cebf7953 --- /dev/null +++ b/src/backend/postmaster/wal_allocator.c @@ -0,0 +1,315 @@ +/*------------------------------------------------------------------------- + * + * wal_allocator.c + * + * The WAL allocator is new as of Postgres 15. It attempts to keep regular + * backends from having to allocate new WAL segments. Even when + * wal_recycle is enabled, pre-allocating WAL segments can yield + * significant performance improvements in certain scenarios. Note that + * regular backends are still empowered to create new WAL segments if the + * WAL allocator fails to generate enough new segments. + * + * The WAL allocator is controlled by one parameter: wal_allocator_max_size. + * wal_allocator_max_size specifies the maximum amount of WAL to pre- + * allocate. If this value is not divisible by the WAL segment size, fewer + * WAL segments will be pre-allocated. + * + * The WAL allocator is started by the postmaster as soon as the startup + * subprocess finishes (it is not launched while recovery is still in + * progress). It remains alive until the postmaster commands it to + * terminate. Normal termination is by SIGTERM, which instructs the + * WAL allocator to exit(0). Emergency termination is by SIGQUIT; like any + * backend, the WAL allocator will simply abort and exit on SIGQUIT. + * + * If the WAL allocator exits unexpectedly, the postmaster treats that the + * same as a backend crash: shared memory may be corrupted, so remaining + * backends should be killed by SIGQUIT and then a recovery cycle started. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/wal_allocator.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "libpq/pqsignal.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "postmaster/wal_allocator.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +#define WAL_ALLOC_TIMEOUT_S (60) + +/* + * GUC parameters + */ +int wal_alloc_max_size_mb = 64; + +static void DoWalPreAllocation(void); +static void ScanForExistingPreallocatedSegments(void); + +/* + * Main entry point for WAL allocator process + * + * This is invoked from AuxiliaryProcessMain, which has already created the + * basic execution environment, but not enabled signals yet. + */ +void +WalAllocatorMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext wal_alloc_context; + + /* + * Properly accept or ignore signals that might be sent to us. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. 
+ */ + wal_alloc_context = AllocSetContextCreate(TopMemoryContext, + "WAL Allocator", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(wal_alloc_context); + + /* + * If an exception is encountered, processing resumes here. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). We don't have very many resources to worry + * about. + */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); + AbortBufferIO(); + UnlockBuffers(); + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. 
+ */ + MemoryContextSwitchTo(wal_alloc_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(wal_alloc_context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. + */ + smgrcloseall(); + + /* Report wait end here, when there is no further possibility of wait */ + pgstat_report_wait_end(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + /* + * Advertise our latch that backends can use to wake us up while we're + * sleeping. + */ + ProcGlobal->walAllocatorLatch = &MyProc->procLatch; + + /* + * Before we go into the main loop, scan the pre-allocated segments + * directory and look for anything that was left over from the last time. + */ + ScanForExistingPreallocatedSegments(); + + /* + * Loop forever + */ + for (;;) + { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + HandleMainLoopInterrupts(); + + DoWalPreAllocation(); + + /* XXX: Send activity statistics to the stats collector */ + + /* Sleep until we are signaled or WAL_ALLOC_TIMEOUT_S has elapsed. */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + WAL_ALLOC_TIMEOUT_S * 1000L /* convert to ms */, + WAIT_EVENT_WAL_ALLOCATOR_MAIN); + } + + pg_unreachable(); +} + +/* + * DoWalPreAllocation + * + * Tries to allocate up to wal_allocator_max_size worth of WAL. 
+ */ +static void +DoWalPreAllocation(void) +{ + int max_prealloc_segs; + + max_prealloc_segs = wal_alloc_max_size_mb / (wal_segment_size / (1024 * 1024)); + + while (!ShutdownRequestPending && + GetNumPreallocatedWalSegs() < max_prealloc_segs) + { + char tmppath[MAXPGPATH]; + + snprintf(tmppath, MAXPGPATH, "%s/preallocated_segments/xlogtemp", XLOGDIR); + CreateEmptyWalSegment(tmppath); + InstallPreallocatedWalSeg(tmppath); + elog(DEBUG2, "pre-allocated WAL segment"); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + max_prealloc_segs = wal_alloc_max_size_mb / (wal_segment_size / (1024 * 1024)); + } + } +} + +/* + * ScanForExistingPreallocatedSegments + * + * This function searches through pg_wal/preallocated_segments for any segments + * that were left over from a previous WAL allocator process and sets the + * tracking variable in shared memory accordingly. + */ +static void +ScanForExistingPreallocatedSegments(void) +{ + int i = 0; + + /* + * fsync the preallocated_segments directory in case any renames have yet to + * be flushed to disk. This is probably not really necessary, but it seems + * nice to know that all the segments we find are really where we think they + * are. + */ + fsync_fname_ext(XLOGDIR "/preallocated_segments", true, false, FATAL); + + /* + * Gather all the preallocated segments we can find. 
+ */ + while (true) + { + FILE *fd; + char path[MAXPGPATH]; + + snprintf(path, MAXPGPATH, "%s/preallocated_segments/xlogtemp.%d", + XLOGDIR, i); + + fd = AllocateFile(path, "r"); + if (fd != NULL) + { + FreeFile(fd); + i++; + } + else + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + break; + } + } + + SetNumPreallocatedWalSegs(i); + elog(DEBUG2, "found %d preallocated segments during startup", i); +} + +bool +WalPreallocationEnabled(void) +{ + return wal_alloc_max_size_mb >= wal_segment_size / (1024 * 1024); +} diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index e09108d0ec..c9a91dc938 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1399,11 +1399,13 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces, size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly); /* - * Also send archive_status directory (by hackishly reusing - * statbuf from above ...). + * Also send archive_status and preallocated_segments (by hackishly + * reusing statbuf from above ...). */ size += _tarWriteHeader("./pg_wal/archive_status", NULL, &statbuf, sizeonly); + size += _tarWriteHeader("./pg_wal/preallocated_segments", NULL, + &statbuf, sizeonly); continue; /* don't recurse into pg_wal */ } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index e8cd7ef088..705c360a27 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #ifndef WIN32 #include @@ -3779,3 +3780,117 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) return sum; } + +/* + * CreateEmptyWalSegment + * + * Create a new file that can be used as a new WAL segment. The caller is + * responsible for installing the new file in pg_wal. 
+ */ +void +CreateEmptyWalSegment(const char *path) +{ + PGAlignedXLogBlock zbuffer; + int fd; + int save_errno; + + unlink(path); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + + memset(zbuffer.data, 0, XLOG_BLCKSZ); + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); + save_errno = 0; + if (wal_init_zero) + { + struct iovec iov[PG_IOV_MAX]; + int blocks; + + /* + * Zero-fill the file. With this setting, we do this the hard way to + * ensure that all the file space has really been allocated. On + * platforms that allow "holes" in files, just seeking to the end + * doesn't allocate intermediate space. This way, we know that we + * have all the space and (after the fsync below) that all the + * indirect blocks are down on disk. Therefore, fdatasync(2) or + * O_DSYNC will be sufficient to sync future writes to the log file. + */ + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (int i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = XLOG_BLCKSZ; + } + + /* Loop, writing as many blocks as we can for each system call. */ + blocks = wal_segment_size / XLOG_BLCKSZ; + for (int i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * XLOG_BLCKSZ; + + if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) + { + save_errno = errno; + break; + } + + i += iovcnt; + } + } + else + { + /* + * Otherwise, seeking to the end and writing a solitary byte is + * enough. + */ + errno = 0; + if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) + { + /* if write didn't set errno, assume no disk space */ + save_errno = errno ? 
errno : ENOSPC; + } + } + pgstat_report_wait_end(); + + if (save_errno) + { + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(path); + + close(fd); + + errno = save_errno; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + } + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c295..212bda8e7d 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +WALPreallocationLock 48 diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 2575ea1ca0..555f4ef53e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -181,6 +181,7 @@ InitProcGlobal(void) ProcGlobal->startupProcPid = 0; ProcGlobal->startupBufferPinWaitBufId = -1; ProcGlobal->walwriterLatch = NULL; + ProcGlobal->walAllocatorLatch = NULL; ProcGlobal->checkpointerLatch = NULL; pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO); pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO); diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 6baf67740c..bb598b5154 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -248,6 +248,9 @@ pgstat_get_wait_activity(WaitEventActivity w) case 
WAIT_EVENT_WAL_WRITER_MAIN: event_name = "WalWriterMain"; break; + case WAIT_EVENT_WAL_ALLOCATOR_MAIN: + event_name = "WalAllocatorMain"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 8b73850d0d..b671dc6fac 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -268,6 +268,9 @@ GetBackendTypeDesc(BackendType backendType) case B_WAL_WRITER: backendDesc = "walwriter"; break; + case B_WAL_ALLOCATOR: + backendDesc = "wal allocator"; + break; case B_ARCHIVER: backendDesc = "archiver"; break; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 68b62d523d..5e2dbd7c56 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -72,6 +72,7 @@ #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" #include "postmaster/syslogger.h" +#include "postmaster/wal_allocator.h" #include "postmaster/walwriter.h" #include "replication/logicallauncher.h" #include "replication/reorderbuffer.h" @@ -2809,6 +2810,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_allocator_max_size", PGC_SIGHUP, WAL_SETTINGS, + gettext_noop("Sets the maximum amount of WAL to pre-allocate."), + NULL, + GUC_UNIT_MB + }, + &wal_alloc_max_size_mb, + 64, 0, 102400, + NULL, NULL, NULL + }, + { {"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS, gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ddbb6dc2be..a1250cf67c 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -221,6 +221,7 @@ #wal_compression = off # enable compression of full-page writes #wal_init_zero = on # zero-fill new WAL files #wal_recycle = on # recycle WAL files +#wal_allocator_max_size = 64MB # amount of WAL to 
pre-allocate #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers # (change requires restart) #wal_writer_delay = 200ms # 1-10000 milliseconds diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 152d21e88b..d24c9cbc38 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -205,6 +205,7 @@ static const char *backend_options = "--single -F -O -j -c search_path=pg_catalo static const char *const subdirs[] = { "global", "pg_wal/archive_status", + "pg_wal/preallocated_segments", "pg_commit_ts", "pg_dynshmem", "pg_notify", diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 16d8929b23..a81d92f8de 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -678,6 +678,22 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier) pg_log_error("could not create directory \"%s\": %m", statusdir); exit(1); } + + /* + * Also create pg_wal/preallocated_segments if necessary. 
+ */ + if (PQserverVersion(conn) >= 150000) + { + char prealloc_dir[MAXPGPATH]; + + snprintf(prealloc_dir, sizeof(prealloc_dir), "%s/pg_wal/preallocated_segments", + basedir); + if (pg_mkdir_p(prealloc_dir, pg_dir_create_mode) != 0 && errno != EEXIST) + { + pg_log_error("could not create directory \"%s\": %m", prealloc_dir); + exit(1); + } + } } /* @@ -1614,7 +1630,8 @@ ReceiveTarAndUnpackCopyChunk(size_t r, char *copybuf, void *callback_data) */ if (!((pg_str_endswith(state->filename, "/pg_wal") || pg_str_endswith(state->filename, "/pg_xlog") || - pg_str_endswith(state->filename, "/archive_status")) && + pg_str_endswith(state->filename, "/archive_status") || + pg_str_endswith(state->filename, "/preallocated_segments")) && errno == EEXIST)) { pg_log_error("could not create directory \"%s\": %m", diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl index 74f8c2c739..61ceda2c6e 100644 --- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl +++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl @@ -120,10 +120,10 @@ SKIP: "check backup dir permissions"); } -# Only archive_status directory should be copied in pg_wal/. +# Only archive_status and preallocated_segments directories should be copied in pg_wal/. is_deeply( [ sort(slurp_dir("$tempdir/backup/pg_wal/")) ], - [ sort qw(. .. archive_status) ], + [ sort qw(. .. archive_status preallocated_segments) ], 'no WAL files copied'); # Contents of these directories should not be copied. 
diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c
index 805dafef07..f5a4d77064 100644
--- a/src/bin/pg_resetwal/pg_resetwal.c
+++ b/src/bin/pg_resetwal/pg_resetwal.c
@@ -84,6 +84,7 @@ static void RewriteControlFile(void);
 static void FindEndOfXLOG(void);
 static void KillExistingXLOG(void);
 static void KillExistingArchiveStatus(void);
+static void KillExistingPreallocatedSegments(void);
 static void WriteEmptyXLOG(void);
 static void usage(void);
 
@@ -513,6 +514,7 @@ main(int argc, char *argv[])
 	RewriteControlFile();
 	KillExistingXLOG();
 	KillExistingArchiveStatus();
+	KillExistingPreallocatedSegments();
 	WriteEmptyXLOG();
 
 	printf(_("Write-ahead log reset\n"));
@@ -1053,6 +1055,66 @@ KillExistingXLOG(void)
 }
 
 
+/*
+ * Remove existing preallocated segments
+ *
+ * Unlinks every file in PREALLOCSEGDIR whose name starts with "xlogtemp";
+ * other directory entries are left untouched.  Any failure is reported with
+ * pg_log_error() and ends the program with exit status 1.
+ *
+ * A missing directory is not an error, since there is then nothing to remove.
+ * main() calls this after RewriteControlFile() and KillExistingXLOG() have
+ * already run and before WriteEmptyXLOG(), so exiting here would abandon the
+ * reset halfway through.
+ */
+static void
+KillExistingPreallocatedSegments(void)
+{
+#define PREALLOCSEGDIR XLOGDIR "/preallocated_segments"
+
+	DIR		   *xldir;
+	struct dirent *xlde;
+	char		path[MAXPGPATH + sizeof(PREALLOCSEGDIR)];
+
+	xldir = opendir(PREALLOCSEGDIR);
+	if (xldir == NULL)
+	{
+		/* no directory means no preallocated segments to remove */
+		if (errno == ENOENT)
+			return;
+
+		pg_log_error("could not open directory \"%s\": %m", PREALLOCSEGDIR);
+		exit(1);
+	}
+
+	/* errno is zeroed per call to tell a readdir() failure from end-of-dir */
+	while (errno = 0, (xlde = readdir(xldir)) != NULL)
+	{
+		if (strncmp(xlde->d_name, "xlogtemp", strlen("xlogtemp")) == 0)
+		{
+			snprintf(path, sizeof(path), "%s/%s", PREALLOCSEGDIR, xlde->d_name);
+			if (unlink(path) < 0)
+			{
+				pg_log_error("could not delete file \"%s\": %m", path);
+				exit(1);
+			}
+		}
+	}
+
+	if (errno)
+	{
+		pg_log_error("could not read directory \"%s\": %m", PREALLOCSEGDIR);
+		exit(1);
+	}
+
+	if (closedir(xldir))
+	{
+		pg_log_error("could not close directory \"%s\": %m", PREALLOCSEGDIR);
+		exit(1);
+	}
+}
+
+
 /*
  * Remove existing archive status files
  */
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 77187c12be..608ec4e74b 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -361,6 +361,11 @@ extern void XLogRequestWalReceiverReply(void);
 extern void assign_max_wal_size(int
newval, void *extra); extern void assign_checkpoint_completion_target(double newval, void *extra); +extern int GetNumPreallocatedWalSegs(void); +extern void InstallPreallocatedWalSeg(const char *path); +extern void SetNumPreallocatedWalSegs(int i); +extern void RequestWalPreallocation(void); + /* * Routines to start, stop, and get status of a base backup. */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 4dc343cbc5..a4599df7ff 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -333,6 +333,7 @@ typedef enum BackendType B_WAL_RECEIVER, B_WAL_SENDER, B_WAL_WRITER, + B_WAL_ALLOCATOR, B_ARCHIVER, B_STATS_COLLECTOR, B_LOGGER, @@ -435,6 +436,7 @@ typedef enum CheckpointerProcess, WalWriterProcess, WalReceiverProcess, + WalAllocatorProcess, NUM_AUXPROCTYPES /* Must be last! */ } AuxProcType; @@ -448,6 +450,7 @@ extern AuxProcType MyAuxProcType; #define AmCheckpointerProcess() (MyAuxProcType == CheckpointerProcess) #define AmWalWriterProcess() (MyAuxProcType == WalWriterProcess) #define AmWalReceiverProcess() (MyAuxProcType == WalReceiverProcess) +#define AmWalAllocatorProcess() (MyAuxProcType == WalAllocatorProcess) /***************************************************************************** diff --git a/src/include/postmaster/wal_allocator.h b/src/include/postmaster/wal_allocator.h new file mode 100644 index 0000000000..c7e139226b --- /dev/null +++ b/src/include/postmaster/wal_allocator.h @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * wal_allocator.h + * Exports from postmaster/wal_allocator.c. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * src/include/postmaster/wal_allocator.h + * + *------------------------------------------------------------------------- + */ +#ifndef _WAL_ALLOCATOR_H +#define _WAL_ALLOCATOR_H + +/* GUC options */ +extern int wal_alloc_max_size_mb; + +extern void WalAllocatorMain(void) pg_attribute_noreturn(); +extern bool WalPreallocationEnabled(void); + +#endif /* _WAL_ALLOCATOR_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 5b3c280dd7..008a54f2ee 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -173,6 +173,7 @@ extern int durable_unlink(const char *fname, int loglevel); extern int durable_rename_excl(const char *oldfile, const char *newfile, int loglevel); extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); +extern void CreateEmptyWalSegment(const char *path); /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index be67d8a861..9b3acfb68e 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -348,6 +348,8 @@ typedef struct PROC_HDR pg_atomic_uint32 clogGroupFirst; /* WALWriter process's latch */ Latch *walwriterLatch; + /* WAL Allocator process's latch */ + Latch *walAllocatorLatch; /* Checkpointer process's latch */ Latch *checkpointerLatch; /* Current shared estimate of appropriate spins_per_delay value */ diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 6c6ec2e711..36324b16c7 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -47,7 +47,8 @@ typedef enum WAIT_EVENT_SYSLOGGER_MAIN, WAIT_EVENT_WAL_RECEIVER_MAIN, WAIT_EVENT_WAL_SENDER_MAIN, - WAIT_EVENT_WAL_WRITER_MAIN + WAIT_EVENT_WAL_WRITER_MAIN, + WAIT_EVENT_WAL_ALLOCATOR_MAIN } WaitEventActivity; /* ---------- -- 2.16.6