*** a/configure --- b/configure *************** *** 850,855 **** with_gnu_ld --- 850,856 ---- enable_largefile enable_float4_byval enable_float8_byval + enable_float8_byval ' ac_precious_vars='build_alias host_alias *************** *** 860,865 **** LDFLAGS --- 861,867 ---- LIBS CPPFLAGS CPP + CPPFLAGS LDFLAGS_EX LDFLAGS_SL DOCBOOKSTYLE' *************** *** 28245,28254 **** fi if test "$PORTNAME" != "win32"; then cat >>confdefs.h <<\_ACEOF ! #define USE_SYSV_SHARED_MEMORY 1 _ACEOF ! SHMEM_IMPLEMENTATION="src/backend/port/sysv_shmem.c" else cat >>confdefs.h <<\_ACEOF --- 28247,28256 ---- if test "$PORTNAME" != "win32"; then cat >>confdefs.h <<\_ACEOF ! #define USE_POSIX_SHARED_MEMORY 1 _ACEOF ! SHMEM_IMPLEMENTATION="src/backend/port/posix_shmem.c" else cat >>confdefs.h <<\_ACEOF *** a/configure.in --- b/configure.in *************** *** 1730,1737 **** fi # Select shared-memory implementation type. if test "$PORTNAME" != "win32"; then ! AC_DEFINE(USE_SYSV_SHARED_MEMORY, 1, [Define to select SysV-style shared memory.]) ! SHMEM_IMPLEMENTATION="src/backend/port/sysv_shmem.c" else AC_DEFINE(USE_WIN32_SHARED_MEMORY, 1, [Define to select Win32-style shared memory.]) SHMEM_IMPLEMENTATION="src/backend/port/win32_shmem.c" --- 1730,1737 ---- # Select shared-memory implementation type. if test "$PORTNAME" != "win32"; then ! AC_DEFINE(USE_POSIX_SHARED_MEMORY, 1, [Define to select POSIX-style shared memory.]) ! SHMEM_IMPLEMENTATION="src/backend/port/posix_shmem.c" else AC_DEFINE(USE_WIN32_SHARED_MEMORY, 1, [Define to select Win32-style shared memory.]) SHMEM_IMPLEMENTATION="src/backend/port/win32_shmem.c" *** a/src/backend/bootstrap/bootstrap.c --- b/src/backend/bootstrap/bootstrap.c *************** *** 352,358 **** AuxiliaryProcessMain(int argc, char *argv[]) /* If standalone, create lockfile for data directory */ if (!IsUnderPostmaster) ! 
CreateDataDirLockFile(false); SetProcessingMode(BootstrapProcessing); IgnoreSystemIndexes = true; --- 352,361 ---- /* If standalone, create lockfile for data directory */ if (!IsUnderPostmaster) ! CreateDataDirLockFile(false,false); ! ! /* Hold on to the lock file for the life of this process. */ ! AcquireDataDirLock(); SetProcessingMode(BootstrapProcessing); IgnoreSystemIndexes = true; *** /dev/null --- b/src/backend/port/posix_shmem.c *************** *** 0 **** --- 1,469 ---- + /*------------------------------------------------------------------------- + * + * posix_shmem.c + * Implement shared memory using POSIX facilities + * + * These routines represent a fairly thin layer on top of POSIX shared + * memory functionality. + * + * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" + + #include + #include + #include + #include + #include + #include + #include + #ifdef HAVE_KERNEL_OS_H + #include + #endif + + #include "miscadmin.h" + #include "libpq/md5.h" + #include "storage/ipc.h" + #include "storage/pg_shmem.h" + + + #define IPCProtection (0600) /* access/modify by user only */ + #define IPCNameLength 31 /* Darwin requires max 30 + '\0' */ + + uint8 UsedShmemInstanceId = 0; + void *UsedShmemSegAddr = NULL; + + static void GenerateIPCName(uint8 instanceId, char destIPCName[IPCNameLength]); + static void *InternalIpcMemoryCreate(const char ipcName[IPCNameLength],uint8 instanceId, Size size); + static void IpcMemoryDetach(int status, Datum shmaddr); + static int POSIXSharedMemoryFD=-1; + + + /* + * GenerateIPCName(instanceId, destIPCName) + * + * Generate a shared memory object key name using the implicit argument + * DataDir's pathname and the current instance id. A hash of the + * canonicalized directory path is used to construct the key name. 
+  * Store the result in destIPCName, which must be IPCNameLength bytes.
+  */
+ static void
+ GenerateIPCName(uint8 instanceId, char destIPCName[IPCNameLength])
+ {
+ 	/*
+ 	 * The name must be 30 characters or less for portability (i.e. Darwin).
+ 	 * POSIX requires shared memory names to begin with a single slash.  The
+ 	 * rest of the name should contain no further slashes and should stick to
+ 	 * characters that are safe in a filename, as that is the broadest
+ 	 * assumption of what is permitted.  Also, case sensitivity should not
+ 	 * be presumed.
+ 	 *
+ 	 * Collisions are averted by the fact that the shared memory region is
+ 	 * immediately unlinked.
+ 	 *
+ 	 * The string is formed starting with a slash, then the identifier 'PG.',
+ 	 * then the pid of the current process, then a dot and the instance id.
+ 	 * Both numbers are zero-padded so that the name never contains blanks
+ 	 * (a space-padded pid would).  Including instanceId gives every retry
+ 	 * made by PGSharedMemoryCreate() a distinct name.
+ 	 *
+ 	 * NOTE(review): this function's header comment describes a name derived
+ 	 * from a hash of DataDir; the code uses the pid instead.  Confirm which
+ 	 * of the two is intended.
+ 	 */
+ 	snprintf(destIPCName, IPCNameLength, "/PG.%06ld.%03u",
+ 			 (long) getpid(), (unsigned int) instanceId);
+ }
+
+ /*
+  * InternalIpcMemoryCreate(ipcName, instanceId, size)
+  *
+  * Attempt to create a new shared memory segment with the specified IPC name.
+  * Returns NULL if shm_open() reports that the name is already taken (EEXIST)
+  * or not accessible to us (EACCES).
+  *
+  * On success the name is unlinked at once, the object is grown to "size"
+  * bytes and mapped into this process, an on_shmem_exit callback is
+  * registered to unmap it, the descriptor is remembered in
+  * POSIXSharedMemoryFD, and the mapped address is returned.
+  *
+  * If we fail with a failure code other than collision-with-existing-segment,
+  * print out an error and abort (FATAL does not return).  Other types of
+  * errors are not recoverable.
+  *
+  * instanceId is not used here; the caller has already encoded whatever it
+  * wants into ipcName.
+  */
+ static void *
+ InternalIpcMemoryCreate(const char ipcName[IPCNameLength], uint8 instanceId, Size size)
+ {
+ 	const int	oflag = O_RDWR | O_CREAT | O_EXCL;
+ 	int			fd;
+ 	void	   *shmaddr;
+ 	struct stat statbuf;
+
+ 	/*
+ 	 * An interrupted shm_open() has not told us anything about the name, so
+ 	 * retry it rather than reporting a collision to the caller.
+ 	 */
+ 	do
+ 	{
+ 		fd = shm_open(ipcName, oflag, IPCProtection);
+ 	} while (fd < 0 && errno == EINTR);
+
+ 	if (fd < 0)
+ 	{
+ 		/*
+ 		 * Fail quietly if error indicates a collision with existing segment.
+ 		 * One would expect EEXIST, given that we said O_EXCL.
+ 		 */
+ 		if (errno == EEXIST || errno == EACCES)
+ 			return NULL;
+
+ 		/*
+ 		 * Else complain and abort.  Report the flags that were really
+ 		 * passed to shm_open().
+ 		 */
+ 		ereport(FATAL,
+ 				(errmsg("could not create shared memory segment: %m"),
+ 				 errdetail("Failed system call was shm_open(name=%s, oflag=%lu, mode=%lu).",
+ 						   ipcName, (unsigned long) oflag,
+ 						   (unsigned long) IPCProtection),
+ 				 (errno == EMFILE) ?
+ 				 errhint("This error means that the process has reached its limit "
+ 						 "for open file descriptors.") : 0,
+ 				 (errno == ENOSPC) ?
+ 				 errhint("This error means the process has ran out of address "
+ 						 "space.") : 0));
+ 	}
+
+ 	/*
+ 	 * Remove the name immediately; from here on the segment is reachable
+ 	 * only through fd and through mappings of it.
+ 	 * the race between creation and unlinking is protected by the shared
+ 	 * memory pid file
+ 	 *
+ 	 * It would be virtually impossible for us to fail to unlink a shared
+ 	 * memory region we just created, but we need to handle this anyway:
+ 	 * refuse to use this shared memory segment.
+ 	 */
+ 	if (shm_unlink(ipcName) < 0)
+ 		ereport(FATAL,
+ 				(errmsg("could not unlink shared memory segment : %m"),
+ 				 errdetail("Failed system call was shm_unlink(name=%s).", ipcName)));
+
+ 	/*
+ 	 * Increase the size of the file descriptor to the desired length.
+ 	 * If this fails so will mmap since it can't map size bytes.
+ 	 */
+ 	if (fstat(fd, &statbuf) < 0)
+ 		ereport(FATAL,
+ 				(errmsg("could not fstat the shared memory segment : %m"),
+ 				 errdetail("Failed system call was fstat(fd=%d,stat=%p).",
+ 						   fd, (void *) &statbuf)));
+
+ 	if ((Size) statbuf.st_size < size)
+ 	{
+ 		if (ftruncate(fd, (off_t) size) < 0)
+ 			ereport(FATAL,
+ 					(errmsg("could not set the proper shared memory segment size : %m"),
+ 					 errdetail("Failed system call was ftruncate(fd=%d,size=%lu).",
+ 							   fd, (unsigned long) size)));
+ 	}
+
+ 	/* OK, should be able to attach to the segment */
+ 	shmaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+ 	if (shmaddr == MAP_FAILED)
+ 		elog(FATAL, "mmap with size=%lu and fd=%d failed: %m",
+ 			 (unsigned long) size, fd);
+
+ 	/* Register on-exit routine to unmap the new segment */
+ 	on_shmem_exit(IpcMemoryDetach, PointerGetDatum(shmaddr));
+
+ 	/* Keep the descriptor; PGSharedMemoryAttach() maps through it. */
+ 	POSIXSharedMemoryFD = fd;
+ 	return shmaddr;
+ }
+
+ /****************************************************************************/
+ /*	IpcMemoryDetach(status, shmaddr)	removes a shared memory segment		*/
+ /*										from process' address space			*/
+ /*	(called as an on_shmem_exit callback, hence funny argument list)		*/
+ /****************************************************************************/
+ static void
+ IpcMemoryDetach(int status, Datum shmaddr)
+ {
+ 	PGShmemHeader *hdr = (PGShmemHeader *) DatumGetPointer(shmaddr);
+
+ 	/* The length to unmap is read from the header stored in the segment. */
+ 	if (munmap((void *) hdr, hdr->totalsize) < 0)
+ 		elog(LOG, "munmap(%p, ...) failed: %m", DatumGetPointer(shmaddr));
+ }
+
+ /*
+  * PGSharedMemoryIsInUse
+  *
+  * Is a previously-existing shmem segment still existing and in use?
+  *
+  * The point of this exercise is to detect the case where a prior postmaster
+  * crashed, but it left child backends that are still running.  Therefore
+  * we only care about shmem segments that are associated with the intended
+  * DataDir.
+  * This is an important consideration since accidental matches of
+  * shmem segment IDs are reasonably common.
+  *
+  * id1 is the instance id used to build the segment name; id2 is ignored.
+  * Returns true if the segment may still be in use, false if it is gone.
+  *
+  * NOTE(review): InternalIpcMemoryCreate() unlinks the name right after
+  * creating the segment, and GenerateIPCName() builds the name from the
+  * calling process's pid, so it looks like shm_open() below will normally
+  * report ENOENT even while old backends are still attached.  Confirm that
+  * the data directory lock is what really guards against that case.
+  */
+ bool
+ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
+ {
+ 	char		ipcName[IPCNameLength];
+ 	int			fd;
+
+ #ifndef WIN32
+ 	PGShmemHeader *hdr;
+ 	struct stat statbuf;
+ 	bool		headerMatches;
+ #endif
+
+ 	/*
+ 	 * We detect whether a shared memory segment is in use by seeing whether
+ 	 * we can open it.  If so, its header is inspected below.
+ 	 */
+ 	GenerateIPCName((uint8) id1, ipcName);
+ 	fd = shm_open(ipcName, O_RDWR, 0);
+ 	if (fd < 0)
+ 	{
+ 		/*
+ 		 * ENOENT means the segment no longer exists.
+ 		 */
+ 		if (errno == ENOENT)
+ 			return false;
+
+ 		/*
+ 		 * EACCES implies that the segment belongs to some other userid, which
+ 		 * means that there is a different account with the same database
+ 		 * open.  Any other failure leaves us unable to tell, so be
+ 		 * conservative in that case too instead of carrying an invalid
+ 		 * descriptor into the calls below.
+ 		 */
+ 		return true;
+ 	}
+
+ 	/*
+ 	 * Try to attach to the segment and see if it matches our data directory,
+ 	 * just as a sanity check.
+ 	 *
+ 	 * On Windows, which doesn't have useful inode numbers, we can't do this
+ 	 * so we punt and assume that the shared memory is valid (which in all
+ 	 * likelihood it is).
+ 	 */
+ #ifdef WIN32
+ 	close(fd);
+ 	return true;
+ #else
+ 	if (stat(DataDir, &statbuf) < 0)
+ 	{
+ 		close(fd);
+ 		return true;			/* if can't stat, be conservative */
+ 	}
+
+ 	hdr = (PGShmemHeader *) mmap(NULL, sizeof(PGShmemHeader), PROT_READ,
+ 								 MAP_SHARED, fd, 0);
+ 	close(fd);
+
+ 	if (hdr == MAP_FAILED)
+ 		return true;			/* if can't attach, be conservative */
+
+ 	headerMatches = (hdr->magic == PGShmemMagic &&
+ 					 hdr->device == statbuf.st_dev &&
+ 					 hdr->inode == statbuf.st_ino);
+ 	munmap((void *) hdr, sizeof(PGShmemHeader));
+
+ 	/*
+ 	 * If false, it's either not a Postgres segment, or not one for my data
+ 	 * directory.  In either case it poses no threat.
+ 	 * If true, trouble -- looks a lot like there are still live backends
+ 	 */
+ 	return headerMatches;
+ #endif
+ }
+
+
+ /*
+  * PGSharedMemoryCreate
+  *
+  * Create a shared memory segment of the given size and initialize its
+  * standard header.  Also, register an on_shmem_exit callback to release
+  * the storage.
+  *
+  * Dead Postgres segments are released when found, but we do not fail upon
+  * collision with non-Postgres shmem segments, although this is astronomically
+  * unlikely.
+  *
+  * makePrivate means to always create a new segment, rather than attach to
+  * or recycle any existing segment.  Currently, this value is ignored as
+  * all segments are newly created (the dead ones are simply released).
+  *
+  * Port is ignored.  (It is leftover from the SysV shared memory routines.)
+  *
+  * Returns a pointer to the initialized header.  Gives up with FATAL if no
+  * segment could be created after trying every uint8 instance id once.
+  */
+ PGShmemHeader *
+ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
+ {
+ 	const int	maxAttempts = 256;	/* one try per possible uint8 instance id */
+ 	int			attempt;
+ 	uint8		instanceId = 0;
+ 	void	   *shmaddr = NULL;
+ 	PGShmemHeader *hdr;
+ 	char		ipcName[IPCNameLength];
+
+ #ifndef WIN32
+ 	struct stat statbuf;
+ #endif
+
+ 	/* Room for a header? */
+ 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
+
+ 	/* Make sure PGSharedMemoryAttach doesn't fail without need */
+ 	UsedShmemSegAddr = NULL;
+
+ 	/*
+ 	 * Loop till we find a free IPC key.  The loop is bounded: an unbounded
+ 	 * uint8 counter would wrap around and spin forever if every attempt
+ 	 * kept colliding.
+ 	 */
+ 	for (attempt = 0; attempt < maxAttempts; attempt++)
+ 	{
+ 		instanceId = (uint8) attempt;
+
+ 		/*
+ 		 * Try to create new segment under the name GenerateIPCName produces
+ 		 * for this instance id.  If this fails, one of three things has
+ 		 * happened:
+ 		 * 1) there is another postmaster still running with the same data directory
+ 		 * 2) the postmaster in this directory crashed or was kill -9'd
+ 		 *	  and there are backends still running.
+ 		 * 3) the postmaster in this directory crashed or was kill -9'd and there
+ 		 *	  are no backends still running, just an orphaned shmem segment
+ 		 *
+ 		 * Case 1 is handled by the postmaster.pid file and doesn't concern us here.
+ 		 * For case 2 & 3 we now should unlink the shmem segment so that it is
+ 		 * cleaned up, either now (case 3) or when the backends terminate (case 2).
+ 		 * Then we should try the next instanceId to create a new segment so this
+ 		 * process can be up and running quickly.
+ 		 */
+ 		GenerateIPCName(instanceId, ipcName);
+ 		shmaddr = InternalIpcMemoryCreate(ipcName, instanceId, size);
+ 		if (shmaddr)
+ 			break;				/* successful create and attach */
+
+ 		/*
+ 		 * The segment appears to be from a dead Postgres process, or from a
+ 		 * previous cycle of life in this same process.  Zap it, if possible.
+ 		 * This shouldn't fail, but if it does, assume the segment
+ 		 * belongs to someone else after all, and continue quietly.
+ 		 */
+ 		shm_unlink(ipcName);
+ 	}
+
+ 	if (shmaddr == NULL)
+ 		ereport(FATAL,
+ 				(errmsg("could not create shared memory segment"),
+ 				 errdetail("Gave up after %d attempts to find an unused shared memory name.",
+ 						   maxAttempts)));
+
+ 	/* OK, we created a new segment.  Mark it as created by this process. */
+ 	hdr = (PGShmemHeader *) shmaddr;
+ 	hdr->creatorPID = getpid();
+ 	hdr->magic = PGShmemMagic;
+
+ 	/*
+ 	 * Initialize space allocation status for segment.  This is done before
+ 	 * the stat() below, which can exit with FATAL: the IpcMemoryDetach
+ 	 * callback reads totalsize to learn how many bytes to unmap.
+ 	 */
+ 	hdr->totalsize = size;
+ 	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+
+ #ifndef WIN32
+ 	/* Fill in the data directory ID info, too */
+ 	if (stat(DataDir, &statbuf) < 0)
+ 		ereport(FATAL,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not stat data directory \"%s\": %m",
+ 						DataDir)));
+ 	hdr->device = statbuf.st_dev;
+ 	hdr->inode = statbuf.st_ino;
+ #endif
+
+ 	/* Save info for possible future use */
+ 	UsedShmemInstanceId = instanceId;
+ 	UsedShmemSegAddr = shmaddr;
+
+ 	return hdr;
+ }
+
+ #ifdef EXEC_BACKEND
+
+ /*
+  * PGSharedMemoryReAttach
+  *
+  * Re-attach to an already existing shared memory segment.  In the non
+  * EXEC_BACKEND case this is not used, because postmaster children inherit
+  * the shared memory segment attachment via fork().
+  *
+  * UsedShmemInstanceId and UsedShmemSegAddr are implicit parameters to this
+  * routine.  The caller must have already restored them to the postmaster's
+  * values.
+  */
+
+ /* Declared here because PGSharedMemoryReAttach calls it before its definition. */
+ static PGShmemHeader *PGSharedMemoryAttach(uint8 instanceId);
+
+ void
+ PGSharedMemoryReAttach(void)
+ {
+ 	void	   *hdr;
+ 	void	   *origUsedShmemSegAddr = UsedShmemSegAddr;
+
+ 	Assert(UsedShmemSegAddr != NULL);
+ 	Assert(IsUnderPostmaster);
+
+ #ifdef __CYGWIN__
+ 	/* cygipc (currently) appears to not detach on exec. */
+ 	PGSharedMemoryDetach();
+ 	UsedShmemSegAddr = origUsedShmemSegAddr;
+ #endif
+
+ 	elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
+ 	hdr = (void *) PGSharedMemoryAttach(UsedShmemInstanceId);
+ 	if (hdr == NULL)
+ 		elog(FATAL, "could not reattach to shared memory (instanceId=%d, addr=%p): %m",
+ 			 (int) UsedShmemInstanceId, UsedShmemSegAddr);
+
+ 	/* The segment is only usable if it landed at the postmaster's address. */
+ 	if (hdr != origUsedShmemSegAddr)
+ 		elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
+ 			 hdr, origUsedShmemSegAddr);
+
+ 	UsedShmemSegAddr = hdr;		/* probably redundant */
+ }
+
+
+ /*
+  * Attach to shared memory and make sure it has a Postgres header
+  *
+  * The segment is mapped through the descriptor saved in POSIXSharedMemoryFD,
+  * with UsedShmemSegAddr passed to mmap() as the requested address.
+  * instanceId is not used.
+  *
+  * Returns attach address if OK, else NULL
+  *
+  * NOTE(review): POSIXSharedMemoryFD is a static that starts out as -1 and
+  * is assigned only in InternalIpcMemoryCreate().  Presumably it has to be
+  * restored in an exec'ed child before this can succeed -- confirm.
+  */
+ static PGShmemHeader *
+ PGSharedMemoryAttach(uint8 instanceId)
+ {
+ 	PGShmemHeader *hdr;
+ 	Size		size;
+ 	int			fd = POSIXSharedMemoryFD;
+
+ 	if (fd < 0)
+ 		return NULL;
+
+ 	/* Map just the header first, to validate it and learn the full size. */
+ 	hdr = (PGShmemHeader *) mmap(UsedShmemSegAddr, sizeof(PGShmemHeader),
+ 								 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+ 	if (hdr == MAP_FAILED)
+ 		return NULL;			/* failed to mmap- unlikely */
+
+ 	if (hdr->magic != PGShmemMagic)
+ 	{
+ 		munmap((void *) hdr, sizeof(PGShmemHeader));
+ 		return NULL;			/* segment belongs to a non-Postgres app */
+ 	}
+
+ 	/* Since the segment has a valid Postgres header, unmap and re-map it with the proper size */
+ 	size = hdr->totalsize;
+ 	munmap((void *) hdr, sizeof(PGShmemHeader));
+ 	hdr = (PGShmemHeader *) mmap(UsedShmemSegAddr, size,
+ 								 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+ 	if (hdr == MAP_FAILED)		/* this shouldn't happen */
+ 		return NULL;
+
+ 	return hdr;
+ }
+ #endif   /* EXEC_BACKEND */
+
+ /*
+  * PGSharedMemoryDetach
+  *
+  * Detach from the shared memory segment, if still attached.
This is not + * intended for use by the process that originally created the segment + * (it will have an on_shmem_exit callback registered to do that). Rather, + * this is for subprocesses that have inherited an attachment and want to + * get rid of it. + */ + void + PGSharedMemoryDetach(void) + { + PGShmemHeader *hdr; + if (UsedShmemSegAddr != NULL) + { + hdr = (PGShmemHeader *) UsedShmemSegAddr; + if (munmap(UsedShmemSegAddr, hdr->totalsize) < 0) + elog(LOG, "munmap(%p) failed: %m", UsedShmemSegAddr); + UsedShmemSegAddr = NULL; + } + } *** a/src/backend/port/sysv_shmem.c --- /dev/null *************** *** 1,550 **** - /*------------------------------------------------------------------------- - * - * sysv_shmem.c - * Implement shared memory using SysV facilities - * - * These routines represent a fairly thin layer on top of SysV shared - * memory functionality. - * - * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * src/backend/port/sysv_shmem.c - * - *------------------------------------------------------------------------- - */ - #include "postgres.h" - - #include - #include - #include - #include - #ifdef HAVE_SYS_IPC_H - #include - #endif - #ifdef HAVE_SYS_SHM_H - #include - #endif - #ifdef HAVE_KERNEL_OS_H - #include - #endif - - #include "miscadmin.h" - #include "storage/ipc.h" - #include "storage/pg_shmem.h" - - - typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ - typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */ - - #define IPCProtection (0600) /* access/modify by user only */ - - #ifdef SHM_SHARE_MMU /* use intimate shared memory on Solaris */ - #define PG_SHMAT_FLAGS SHM_SHARE_MMU - #else - #define PG_SHMAT_FLAGS 0 - #endif - - - unsigned long UsedShmemSegID = 0; - void *UsedShmemSegAddr = NULL; - - static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); - static void 
IpcMemoryDetach(int status, Datum shmaddr); - static void IpcMemoryDelete(int status, Datum shmId); - static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key, - IpcMemoryId *shmid); - - - /* - * InternalIpcMemoryCreate(memKey, size) - * - * Attempt to create a new shared memory segment with the specified key. - * Will fail (return NULL) if such a segment already exists. If successful, - * attach the segment to the current process and return its attached address. - * On success, callbacks are registered with on_shmem_exit to detach and - * delete the segment when on_shmem_exit is called. - * - * If we fail with a failure code other than collision-with-existing-segment, - * print out an error and abort. Other types of errors are not recoverable. - */ - static void * - InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) - { - IpcMemoryId shmid; - void *memAddress; - - shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); - - if (shmid < 0) - { - /* - * Fail quietly if error indicates a collision with existing segment. - * One would expect EEXIST, given that we said IPC_EXCL, but perhaps - * we could get a permission violation instead? Also, EIDRM might - * occur if an old seg is slated for destruction but not gone yet. - */ - if (errno == EEXIST || errno == EACCES - #ifdef EIDRM - || errno == EIDRM - #endif - ) - return NULL; - - /* - * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if - * there is an existing segment but it's smaller than "size" (this is - * a result of poorly-thought-out ordering of error tests). To - * distinguish between collision and invalid size in such cases, we - * make a second try with size = 0. These kernels do not test size - * against SHMMIN in the preexisting-segment case, so we will not get - * EINVAL a second time if there is such a segment. 
- */ - if (errno == EINVAL) - { - int save_errno = errno; - - shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection); - - if (shmid < 0) - { - /* As above, fail quietly if we verify a collision */ - if (errno == EEXIST || errno == EACCES - #ifdef EIDRM - || errno == EIDRM - #endif - ) - return NULL; - /* Otherwise, fall through to report the original error */ - } - else - { - /* - * On most platforms we cannot get here because SHMMIN is - * greater than zero. However, if we do succeed in creating a - * zero-size segment, free it and then fall through to report - * the original error. - */ - if (shmctl(shmid, IPC_RMID, NULL) < 0) - elog(LOG, "shmctl(%d, %d, 0) failed: %m", - (int) shmid, IPC_RMID); - } - - errno = save_errno; - } - - /* - * Else complain and abort. - * - * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX - * is violated. SHMALL violation might be reported as either ENOMEM - * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which - * it should be. SHMMNI violation is ENOSPC, per spec. Just plain - * not-enough-RAM is ENOMEM. - */ - ereport(FATAL, - (errmsg("could not create shared memory segment: %m"), - errdetail("Failed system call was shmget(key=%lu, size=%lu, 0%o).", - (unsigned long) memKey, (unsigned long) size, - IPC_CREAT | IPC_EXCL | IPCProtection), - (errno == EINVAL) ? - errhint("This error usually means that PostgreSQL's request for a shared memory " - "segment exceeded your kernel's SHMMAX parameter. You can either " - "reduce the request size or reconfigure the kernel with larger SHMMAX. 
" - "To reduce the request size (currently %lu bytes), reduce " - "PostgreSQL's shared memory usage, perhaps by reducing shared_buffers " - "or max_connections.\n" - "If the request size is already small, it's possible that it is less than " - "your kernel's SHMMIN parameter, in which case raising the request size or " - "reconfiguring SHMMIN is called for.\n" - "The PostgreSQL documentation contains more information about shared " - "memory configuration.", - (unsigned long) size) : 0, - (errno == ENOMEM) ? - errhint("This error usually means that PostgreSQL's request for a shared " - "memory segment exceeded available memory or swap space, " - "or exceeded your kernel's SHMALL parameter. You can either " - "reduce the request size or reconfigure the kernel with larger SHMALL. " - "To reduce the request size (currently %lu bytes), reduce " - "PostgreSQL's shared memory usage, perhaps by reducing shared_buffers " - "or max_connections.\n" - "The PostgreSQL documentation contains more information about shared " - "memory configuration.", - (unsigned long) size) : 0, - (errno == ENOSPC) ? - errhint("This error does *not* mean that you have run out of disk space. " - "It occurs either if all available shared memory IDs have been taken, " - "in which case you need to raise the SHMMNI parameter in your kernel, " - "or because the system's overall limit for shared memory has been " - "reached. 
If you cannot increase the shared memory limit, " - "reduce PostgreSQL's shared memory request (currently %lu bytes), " - "perhaps by reducing shared_buffers or max_connections.\n" - "The PostgreSQL documentation contains more information about shared " - "memory configuration.", - (unsigned long) size) : 0)); - } - - /* Register on-exit routine to delete the new segment */ - on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid)); - - /* OK, should be able to attach to the segment */ - memAddress = shmat(shmid, NULL, PG_SHMAT_FLAGS); - - if (memAddress == (void *) -1) - elog(FATAL, "shmat(id=%d) failed: %m", shmid); - - /* Register on-exit routine to detach new segment before deleting */ - on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress)); - - /* - * Store shmem key and ID in data directory lockfile. Format to try to - * keep it the same length always (trailing junk in the lockfile won't - * hurt, but might confuse humans). - */ - { - char line[64]; - - sprintf(line, "%9lu %9lu", - (unsigned long) memKey, (unsigned long) shmid); - AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line); - } - - return memAddress; - } - - /****************************************************************************/ - /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ - /* from process' address spaceq */ - /* (called as an on_shmem_exit callback, hence funny argument list) */ - /****************************************************************************/ - static void - IpcMemoryDetach(int status, Datum shmaddr) - { - if (shmdt(DatumGetPointer(shmaddr)) < 0) - elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr)); - } - - /****************************************************************************/ - /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */ - /* (called as an on_shmem_exit callback, hence funny argument list) */ - /****************************************************************************/ - static void - 
IpcMemoryDelete(int status, Datum shmId) - { - if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0) - elog(LOG, "shmctl(%d, %d, 0) failed: %m", - DatumGetInt32(shmId), IPC_RMID); - } - - /* - * PGSharedMemoryIsInUse - * - * Is a previously-existing shmem segment still existing and in use? - * - * The point of this exercise is to detect the case where a prior postmaster - * crashed, but it left child backends that are still running. Therefore - * we only care about shmem segments that are associated with the intended - * DataDir. This is an important consideration since accidental matches of - * shmem segment IDs are reasonably common. - */ - bool - PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) - { - IpcMemoryId shmId = (IpcMemoryId) id2; - struct shmid_ds shmStat; - struct stat statbuf; - PGShmemHeader *hdr; - - /* - * We detect whether a shared memory segment is in use by seeing whether - * it (a) exists and (b) has any processes attached to it. - */ - if (shmctl(shmId, IPC_STAT, &shmStat) < 0) - { - /* - * EINVAL actually has multiple possible causes documented in the - * shmctl man page, but we assume it must mean the segment no longer - * exists. - */ - if (errno == EINVAL) - return false; - - /* - * EACCES implies that the segment belongs to some other userid, which - * means it is not a Postgres shmem segment (or at least, not one that - * is relevant to our data directory). - */ - if (errno == EACCES) - return false; - - /* - * Some Linux kernel versions (in fact, all of them as of July 2007) - * sometimes return EIDRM when EINVAL is correct. The Linux kernel - * actually does not have any internal state that would justify - * returning EIDRM, so we can get away with assuming that EIDRM is - * equivalent to EINVAL on that platform. - */ - #ifdef HAVE_LINUX_EIDRM_BUG - if (errno == EIDRM) - return false; - #endif - - /* - * Otherwise, we had better assume that the segment is in use. 
The - * only likely case is EIDRM, which implies that the segment has been - * IPC_RMID'd but there are still processes attached to it. - */ - return true; - } - - /* If it has no attached processes, it's not in use */ - if (shmStat.shm_nattch == 0) - return false; - - /* - * Try to attach to the segment and see if it matches our data directory. - * This avoids shmid-conflict problems on machines that are running - * several postmasters under the same userid. - */ - if (stat(DataDir, &statbuf) < 0) - return true; /* if can't stat, be conservative */ - - hdr = (PGShmemHeader *) shmat(shmId, NULL, PG_SHMAT_FLAGS); - - if (hdr == (PGShmemHeader *) -1) - return true; /* if can't attach, be conservative */ - - if (hdr->magic != PGShmemMagic || - hdr->device != statbuf.st_dev || - hdr->inode != statbuf.st_ino) - { - /* - * It's either not a Postgres segment, or not one for my data - * directory. In either case it poses no threat. - */ - shmdt((void *) hdr); - return false; - } - - /* Trouble --- looks a lot like there's still live backends */ - shmdt((void *) hdr); - - return true; - } - - - /* - * PGSharedMemoryCreate - * - * Create a shared memory segment of the given size and initialize its - * standard header. Also, register an on_shmem_exit callback to release - * the storage. - * - * Dead Postgres segments are recycled if found, but we do not fail upon - * collision with non-Postgres shmem segments. The idea here is to detect and - * re-use keys that may have been assigned by a crashed postmaster or backend. - * - * makePrivate means to always create a new segment, rather than attach to - * or recycle any existing segment. - * - * The port number is passed for possible use as a key (for SysV, we use - * it to generate the starting shmem key). In a standalone backend, - * zero will be passed. 
- */ - PGShmemHeader * - PGSharedMemoryCreate(Size size, bool makePrivate, int port) - { - IpcMemoryKey NextShmemSegID; - void *memAddress; - PGShmemHeader *hdr; - IpcMemoryId shmid; - struct stat statbuf; - - /* Room for a header? */ - Assert(size > MAXALIGN(sizeof(PGShmemHeader))); - - /* Make sure PGSharedMemoryAttach doesn't fail without need */ - UsedShmemSegAddr = NULL; - - /* Loop till we find a free IPC key */ - NextShmemSegID = port * 1000; - - for (NextShmemSegID++;; NextShmemSegID++) - { - /* Try to create new segment */ - memAddress = InternalIpcMemoryCreate(NextShmemSegID, size); - if (memAddress) - break; /* successful create and attach */ - - /* Check shared memory and possibly remove and recreate */ - - if (makePrivate) /* a standalone backend shouldn't do this */ - continue; - - if ((memAddress = PGSharedMemoryAttach(NextShmemSegID, &shmid)) == NULL) - continue; /* can't attach, not one of mine */ - - /* - * If I am not the creator and it belongs to an extant process, - * continue. - */ - hdr = (PGShmemHeader *) memAddress; - if (hdr->creatorPID != getpid()) - { - if (kill(hdr->creatorPID, 0) == 0 || errno != ESRCH) - { - shmdt(memAddress); - continue; /* segment belongs to a live process */ - } - } - - /* - * The segment appears to be from a dead Postgres process, or from a - * previous cycle of life in this same process. Zap it, if possible. - * This probably shouldn't fail, but if it does, assume the segment - * belongs to someone else after all, and continue quietly. - */ - shmdt(memAddress); - if (shmctl(shmid, IPC_RMID, NULL) < 0) - continue; - - /* - * Now try again to create the segment. - */ - memAddress = InternalIpcMemoryCreate(NextShmemSegID, size); - if (memAddress) - break; /* successful create and attach */ - - /* - * Can only get here if some other process managed to create the same - * shmem key before we did. Let him have that one, loop around to try - * next key. - */ - } - - /* - * OK, we created a new segment. 
Mark it as created by this process. The - * order of assignments here is critical so that another Postgres process - * can't see the header as valid but belonging to an invalid PID! - */ - hdr = (PGShmemHeader *) memAddress; - hdr->creatorPID = getpid(); - hdr->magic = PGShmemMagic; - - /* Fill in the data directory ID info, too */ - if (stat(DataDir, &statbuf) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not stat data directory \"%s\": %m", - DataDir))); - hdr->device = statbuf.st_dev; - hdr->inode = statbuf.st_ino; - - /* - * Initialize space allocation status for segment. - */ - hdr->totalsize = size; - hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); - - /* Save info for possible future use */ - UsedShmemSegAddr = memAddress; - UsedShmemSegID = (unsigned long) NextShmemSegID; - - return hdr; - } - - #ifdef EXEC_BACKEND - - /* - * PGSharedMemoryReAttach - * - * Re-attach to an already existing shared memory segment. In the non - * EXEC_BACKEND case this is not used, because postmaster children inherit - * the shared memory segment attachment via fork(). - * - * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this - * routine. The caller must have already restored them to the postmaster's - * values. - */ - void - PGSharedMemoryReAttach(void) - { - IpcMemoryId shmid; - void *hdr; - void *origUsedShmemSegAddr = UsedShmemSegAddr; - - Assert(UsedShmemSegAddr != NULL); - Assert(IsUnderPostmaster); - - #ifdef __CYGWIN__ - /* cygipc (currently) appears to not detach on exec. 
*/ - PGSharedMemoryDetach(); - UsedShmemSegAddr = origUsedShmemSegAddr; - #endif - - elog(DEBUG3, "attaching to %p", UsedShmemSegAddr); - hdr = (void *) PGSharedMemoryAttach((IpcMemoryKey) UsedShmemSegID, &shmid); - if (hdr == NULL) - elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m", - (int) UsedShmemSegID, UsedShmemSegAddr); - if (hdr != origUsedShmemSegAddr) - elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)", - hdr, origUsedShmemSegAddr); - - UsedShmemSegAddr = hdr; /* probably redundant */ - } - #endif /* EXEC_BACKEND */ - - /* - * PGSharedMemoryDetach - * - * Detach from the shared memory segment, if still attached. This is not - * intended for use by the process that originally created the segment - * (it will have an on_shmem_exit callback registered to do that). Rather, - * this is for subprocesses that have inherited an attachment and want to - * get rid of it. - */ - void - PGSharedMemoryDetach(void) - { - if (UsedShmemSegAddr != NULL) - { - if ((shmdt(UsedShmemSegAddr) < 0) - #if defined(EXEC_BACKEND) && defined(__CYGWIN__) - /* Work-around for cygipc exec bug */ - && shmdt(NULL) < 0 - #endif - ) - elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr); - UsedShmemSegAddr = NULL; - } - } - - - /* - * Attach to shared memory and make sure it has a Postgres header - * - * Returns attach address if OK, else NULL - */ - static PGShmemHeader * - PGSharedMemoryAttach(IpcMemoryKey key, IpcMemoryId *shmid) - { - PGShmemHeader *hdr; - - if ((*shmid = shmget(key, sizeof(PGShmemHeader), 0)) < 0) - return NULL; - - hdr = (PGShmemHeader *) shmat(*shmid, UsedShmemSegAddr, PG_SHMAT_FLAGS); - - if (hdr == (PGShmemHeader *) -1) - return NULL; /* failed: must be some other app's */ - - if (hdr->magic != PGShmemMagic) - { - shmdt((void *) hdr); - return NULL; /* segment belongs to a non-Postgres app */ - } - - return hdr; - } --- 0 ---- *** a/src/backend/postmaster/autovacuum.c --- 
b/src/backend/postmaster/autovacuum.c *************** *** 368,373 **** StartAutoVacLauncher(void) --- 368,376 ---- /* Lose the postmaster's on-exit routines */ on_exit_reset(); + /* Hold on to the data directory lock until this process dies. */ + AcquireDataDirLock(); + AutoVacLauncherMain(0, NULL); break; #endif *** a/src/backend/postmaster/pgstat.c --- b/src/backend/postmaster/pgstat.c *************** *** 632,637 **** pgstat_start(void) --- 632,640 ---- /* Lose the postmaster's on-exit routines */ on_exit_reset(); + /* Hold on to the data directory lock for as long as we live. */ + AcquireDataDirLock(); + /* Drop our connection to postmaster's shared memory, as well */ PGSharedMemoryDetach(); *** a/src/backend/postmaster/postmaster.c --- b/src/backend/postmaster/postmaster.c *************** *** 484,489 **** PostmasterMain(int argc, char *argv[]) --- 484,490 ---- char *userDoption = NULL; bool listen_addr_saved = false; int i; + bool blockOnStartupLockOption = false; MyProcPid = PostmasterPid = getpid(); *************** *** 529,535 **** PostmasterMain(int argc, char *argv[]) * tcop/postgres.c (the option sets should not conflict) and with the * common help() function in main/main.c. */ ! while ((opt = getopt(argc, argv, "A:B:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1) { switch (opt) { --- 530,536 ---- * tcop/postgres.c (the option sets should not conflict) and with the * common help() function in main/main.c. */ !
while ((opt = getopt(argc, argv, "A:bB:c:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1) { switch (opt) { *************** *** 537,542 **** PostmasterMain(int argc, char *argv[]) --- 538,546 ---- SetConfigOption("debug_assertions", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; + case 'b': + blockOnStartupLockOption = true; + break; case 'B': SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; *************** *** 790,796 **** PostmasterMain(int argc, char *argv[]) * For the same reason, it's best to grab the TCP socket(s) before the * Unix socket. */ ! CreateDataDirLockFile(true); /* * If timezone is not set, determine what the OS uses. (In theory this --- 794,800 ---- * For the same reason, it's best to grab the TCP socket(s) before the * Unix socket. */ ! CreateDataDirLockFile(true,blockOnStartupLockOption); /* * If timezone is not set, determine what the OS uses. (In theory this *** a/src/backend/tcop/postgres.c --- b/src/backend/tcop/postgres.c *************** *** 3600,3606 **** PostgresMain(int argc, char *argv[], const char *username) /* * Create lockfile for data directory. */ ! CreateDataDirLockFile(false); } /* Early initialization */ --- 3600,3606 ---- /* * Create lockfile for data directory. */ ! 
CreateDataDirLockFile(false,false); } /* Early initialization */ *************** *** 3618,3623 **** PostgresMain(int argc, char *argv[], const char *username) --- 3618,3633 ---- #else InitProcess(); #endif + if(IsUnderPostmaster) + { + /* acquire the lock file advisory lock (to eliminate multiple-postmaster race conditions) + * postgresql backends (postmaster children) must acquire the read lock to signify that there are backends operating in the specific data directory + * this needs to be done after InitProcess because the function needs access to the shared memory proc array + */ + AcquireDataDirLock(); + } + + /* We need to allow SIGINT, etc during the initial transaction */ PG_SETMASK(&UnBlockSig); *** a/src/backend/utils/init/miscinit.c --- b/src/backend/utils/init/miscinit.c *************** *** 44,57 **** #include "utils/memutils.h" #include "utils/syscache.h" #define DIRECTORY_LOCK_FILE "postmaster.pid" ProcessingMode Mode = InitProcessing; - /* Note: we rely on this to initialize as zeroes */ - static char socketLockFile[MAXPGPATH]; - /* ---------------------------------------------------------------- * ignoring system indexes support stuff --- 44,75 ---- #include "utils/memutils.h" #include "utils/syscache.h" + /* Note: we rely on this to initialize as zeroes */ + static char socketLockFile[MAXPGPATH]; #define DIRECTORY_LOCK_FILE "postmaster.pid" + static pid_t GetPIDHoldingLock(int fileDescriptor,bool exclusiveLockFlag); + static int AcquireLock(int fileDescriptor,bool exclusiveLockFlag,bool waitForLock); + static int ReleaseLock(int fileDescriptor); + void AcquireDataDirLock(); + pid_t GetPIDHoldingDataDirLock(); + static void WriteLockFileContents(char *lockFilePath,int lockFileFD,bool isPostmasterFlag,pid_t processPid,char *dataDirectoryPath,long startTime,int portNumber,char * socketDirectory); + + /* enum used by CreateLockFile to report its success or error condition */ + typedef enum + { + /* positive numbers refer to the PID of a conflicting 
lock-holding process, but not necessarily a postmaster */ + CreateLockFileNoError=0, + CreateLockFileFileError=-1, /*advises the caller to check the errno for the error */ + CreateLockFileSharedLockAcquisitionError=-2, + CreateLockFileExclusiveLockCheckError=-3, + } CreateLockFileValue; + + + static CreateLockFileValue CreateLockFile(char *lockFilePath,bool amPostmaster,int *lockFileRetDescriptor,bool blockOnLockFlag); ProcessingMode Mode = InitProcessing; /* ---------------------------------------------------------------- * ignoring system indexes support stuff *************** *** 627,635 **** GetUserNameFromId(Oid roleid) /*------------------------------------------------------------------------- * Interlock-file support * ! * These routines are used to create both a data-directory lockfile ! * ($DATADIR/postmaster.pid) and a Unix-socket-file lockfile ($SOCKFILE.lock). ! * Both kinds of files contain the same info: * * Owning process' PID * Data directory path --- 645,653 ---- /*------------------------------------------------------------------------- * Interlock-file support * ! * These routines are used to create a data-directory lockfile ! * ($DATADIR/postmaster.pid). ! * The file contains the info: * * Owning process' PID * Data directory path *************** *** 642,963 **** GetUserNameFromId(Oid roleid) * A data-directory lockfile can optionally contain a third line, containing * the key and ID for the shared memory block used by this postmaster. * - * On successful lockfile creation, a proc_exit callback to remove the - * lockfile is automatically created. *------------------------------------------------------------------------- */ ! /* ! * proc_exit callback to remove a lockfile. ! */ ! static void ! UnlinkLockFile(int status, Datum filename) ! { ! char *fname = (char *) DatumGetPointer(filename); ! ! if (fname != NULL) ! { ! if (unlink(fname) != 0) ! { ! /* Should we complain if the unlink fails? */ ! } ! free(fname); ! } ! } /* ! * Create a lockfile. 
* ! * filename is the name of the lockfile to create. ! * amPostmaster is used to determine how to encode the output PID. ! * isDDLock and refName are used to determine what error message to produce. */ ! static void ! CreateLockFile(const char *filename, bool amPostmaster, ! bool isDDLock, const char *refName) { ! int fd; ! char buffer[MAXPGPATH * 2 + 256]; ! int ntries; ! int len; ! int encoded_pid; ! pid_t other_pid; ! pid_t my_pid, ! my_p_pid, ! my_gp_pid; ! const char *envvar; ! ! /* ! * If the PID in the lockfile is our own PID or our parent's or ! * grandparent's PID, then the file must be stale (probably left over from ! * a previous system boot cycle). We need to check this because of the ! * likelihood that a reboot will assign exactly the same PID as we had in ! * the previous reboot, or one that's only one or two counts larger and ! * hence the lockfile's PID now refers to an ancestor shell process. We ! * allow pg_ctl to pass down its parent shell PID (our grandparent PID) ! * via the environment variable PG_GRANDPARENT_PID; this is so that ! * launching the postmaster via pg_ctl can be just as reliable as ! * launching it directly. There is no provision for detecting ! * further-removed ancestor processes, but if the init script is written ! * carefully then all but the immediate parent shell will be root-owned ! * processes and so the kill test will fail with EPERM. Note that we ! * cannot get a false negative this way, because an existing postmaster ! * would surely never launch a competing postmaster or pg_ctl process ! * directly. ! */ ! my_pid = getpid(); ! ! #ifndef WIN32 ! my_p_pid = getppid(); ! #else ! ! /* ! * Windows hasn't got getppid(), but doesn't need it since it's not using ! * real kill() either... ! */ ! my_p_pid = 0; ! #endif ! ! envvar = getenv("PG_GRANDPARENT_PID"); ! if (envvar) ! my_gp_pid = atoi(envvar); ! else ! my_gp_pid = 0; ! ! /* ! * We need a loop here because of race conditions. But don't loop forever ! 
* (for example, a non-writable $PGDATA directory might cause a failure ! * that won't go away). 100 tries seems like plenty. ! */ ! for (ntries = 0;; ntries++) ! { ! /* ! * Try to create the lock file --- O_EXCL makes this atomic. ! * ! * Think not to make the file protection weaker than 0600. See ! * comments below. ! */ ! fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0600); ! if (fd >= 0) ! break; /* Success; exit the retry loop */ ! ! /* ! * Couldn't create the pid file. Probably it already exists. ! */ ! if ((errno != EEXIST && errno != EACCES) || ntries > 100) ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not create lock file \"%s\": %m", ! filename))); ! ! /* ! * Read the file to get the old owner's PID. Note race condition ! * here: file might have been deleted since we tried to create it. ! */ ! fd = open(filename, O_RDONLY, 0600); ! if (fd < 0) ! { ! if (errno == ENOENT) ! continue; /* race condition; try again */ ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not open lock file \"%s\": %m", ! filename))); ! } ! if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0) ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not read lock file \"%s\": %m", ! filename))); ! close(fd); ! ! buffer[len] = '\0'; ! encoded_pid = atoi(buffer); ! ! /* if pid < 0, the pid is for postgres, not postmaster */ ! other_pid = (pid_t) (encoded_pid < 0 ? -encoded_pid : encoded_pid); ! ! if (other_pid <= 0) ! elog(FATAL, "bogus data in lock file \"%s\": \"%s\"", ! filename, buffer); ! ! /* ! * Check to see if the other process still exists ! * ! * Per discussion above, my_pid, my_p_pid, and my_gp_pid can be ! * ignored as false matches. ! * ! * Normally kill() will fail with ESRCH if the given PID doesn't ! * exist. ! * ! * We can treat the EPERM-error case as okay because that error ! * implies that the existing process has a different userid than we ! * do, which means it cannot be a competing postmaster. A postmaster ! 
* cannot successfully attach to a data directory owned by a userid ! * other than its own. (This is now checked directly in ! * checkDataDir(), but has been true for a long time because of the ! * restriction that the data directory isn't group- or ! * world-accessible.) Also, since we create the lockfiles mode 600, ! * we'd have failed above if the lockfile belonged to another userid ! * --- which means that whatever process kill() is reporting about ! * isn't the one that made the lockfile. (NOTE: this last ! * consideration is the only one that keeps us from blowing away a ! * Unix socket file belonging to an instance of Postgres being run by ! * someone else, at least on machines where /tmp hasn't got a ! * stickybit.) ! */ ! if (other_pid != my_pid && other_pid != my_p_pid && ! other_pid != my_gp_pid) ! { ! if (kill(other_pid, 0) == 0 || ! (errno != ESRCH && errno != EPERM)) ! { ! /* lockfile belongs to a live process */ ! ereport(FATAL, ! (errcode(ERRCODE_LOCK_FILE_EXISTS), ! errmsg("lock file \"%s\" already exists", ! filename), ! isDDLock ? ! (encoded_pid < 0 ? ! errhint("Is another postgres (PID %d) running in data directory \"%s\"?", ! (int) other_pid, refName) : ! errhint("Is another postmaster (PID %d) running in data directory \"%s\"?", ! (int) other_pid, refName)) : ! (encoded_pid < 0 ? ! errhint("Is another postgres (PID %d) using socket file \"%s\"?", ! (int) other_pid, refName) : ! errhint("Is another postmaster (PID %d) using socket file \"%s\"?", ! (int) other_pid, refName)))); ! } ! } ! ! /* ! * No, the creating process did not exist. However, it could be that ! * the postmaster crashed (or more likely was kill -9'd by a clueless ! * admin) but has left orphan backends behind. Check for this by ! * looking to see if there is an associated shmem segment that is ! * still in use. ! * ! * Note: because postmaster.pid is written in multiple steps, we might ! * not find the shmem ID values in it; we can't treat that as an ! * error. ! */ ! 
if (isDDLock) ! { ! char *ptr = buffer; ! unsigned long id1, ! id2; ! int lineno; ! ! for (lineno = 1; lineno < LOCK_FILE_LINE_SHMEM_KEY; lineno++) ! { ! if ((ptr = strchr(ptr, '\n')) == NULL) ! break; ! ptr++; ! } ! ! if (ptr != NULL && ! sscanf(ptr, "%lu %lu", &id1, &id2) == 2) ! { ! if (PGSharedMemoryIsInUse(id1, id2)) ! ereport(FATAL, ! (errcode(ERRCODE_LOCK_FILE_EXISTS), ! errmsg("pre-existing shared memory block " ! "(key %lu, ID %lu) is still in use", ! id1, id2), ! errhint("If you're sure there are no old " ! "server processes still running, remove " ! "the shared memory block " ! "or just delete the file \"%s\".", ! filename))); ! } ! } ! ! /* ! * Looks like nobody's home. Unlink the file and try again to create ! * it. Need a loop because of possible race condition against other ! * would-be creators. ! */ ! if (unlink(filename) < 0) ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not remove old lock file \"%s\": %m", ! filename), ! errhint("The file seems accidentally left over, but " ! "it could not be removed. Please remove the file " ! "by hand and try again."))); ! } ! /* ! * Successfully created the file, now fill it. See comment in miscadmin.h ! * about the contents. Note that we write the same info into both datadir ! * and socket lockfiles; although more stuff may get added to the datadir ! * lockfile later. ! */ ! snprintf(buffer, sizeof(buffer), "%d\n%s\n%ld\n%d\n%s\n", ! amPostmaster ? (int) my_pid : -((int) my_pid), ! DataDir, ! (long) MyStartTime, ! PostPortNumber, #ifdef HAVE_UNIX_SOCKETS ! (*UnixSocketDir != '\0') ? UnixSocketDir : DEFAULT_PGSOCKET_DIR #else ! "" #endif ! ); ! errno = 0; ! if (write(fd, buffer, strlen(buffer)) != strlen(buffer)) ! { ! int save_errno = errno; ! ! close(fd); ! unlink(filename); ! /* if write didn't set errno, assume problem is no disk space */ ! errno = save_errno ? save_errno : ENOSPC; ! ereport(FATAL, ! (errcode_for_file_access(), ! 
errmsg("could not write lock file \"%s\": %m", filename))); ! } ! if (pg_fsync(fd) != 0) ! { ! int save_errno = errno; ! ! close(fd); ! unlink(filename); ! errno = save_errno; ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not write lock file \"%s\": %m", filename))); ! } ! if (close(fd) != 0) ! { ! int save_errno = errno; ! unlink(filename); ! errno = save_errno; ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not write lock file \"%s\": %m", filename))); ! } ! /* ! * Arrange for automatic removal of lockfile at proc_exit. ! */ ! on_proc_exit(UnlinkLockFile, PointerGetDatum(strdup(filename))); } /* ! * Create the data directory lockfile. ! * ! * When this is called, we must have already switched the working ! * directory to DataDir, so we can just use a relative path. This ! * helps ensure that we are locking the directory we should be. */ ! void ! CreateDataDirLockFile(bool amPostmaster) { ! CreateLockFile(DIRECTORY_LOCK_FILE, amPostmaster, true, DataDir); } /* --- 660,888 ---- * A data-directory lockfile can optionally contain a third line, containing * the key and ID for the shared memory block used by this postmaster. * *------------------------------------------------------------------------- */ ! /* We hold onto the lockFile for the life of the process to hold onto the advisory locks. */ ! static int DataDirLockFileFD = 0; /* ! * Create the data directory lockfile. * ! * When this is called, we must have already switched the working ! * directory to DataDir, so we can just use a relative path. This ! * helps ensure that we are locking the directory we should be. */ ! void ! CreateDataDirLockFile(bool amPostmaster,bool blockOptionFlag) { ! char *lockFilePath=DIRECTORY_LOCK_FILE; ! CreateLockFileValue error = CreateLockFile(lockFilePath,amPostmaster,&DataDirLockFileFD,blockOptionFlag); ! if(error==CreateLockFileNoError) ! { ! return; ! } ! else if(error==CreateLockFileFileError) ! { ! ereport(FATAL, ! 
(errmsg("failed operation on lock file at \"%s\": %m",lockFilePath))); ! } ! else if(error==CreateLockFileSharedLockAcquisitionError) ! { ! ereport(FATAL, ! (errmsg("failed to acquire shared lock on file \"%s\": %m",lockFilePath))); ! } ! else if(error==CreateLockFileExclusiveLockCheckError) ! { ! ereport(FATAL, ! (errmsg("failed to check for exclusive lock on file \"%s\": %m", lockFilePath))); ! ! } ! else if(error>0) ! { ! /* error holds the pid of the conflicting process */ ! ereport(FATAL, ! (errmsg("another postgresql process is running in the data directory \"%s\" with pid %d",DataDir,error), ! errhint("kill the other server processes to start a new postgresql server"))); ! } ! else ! { ! ereport(FATAL, ! (errmsg("an unhandled locking error occurred"))); ! } ! } ! static CreateLockFileValue ! CreateLockFile(char *lockFilePath,bool amPostmaster,int *lockFileRetDescriptor,bool blockOnLockFlag) ! { ! int success = 0; ! pid_t pidHoldingLock; ! int lockFileFD = 0; ! ! /* open the directory lock file- whether or not it already exists is irrelevant because we will check for file locks */ ! lockFileFD = open(lockFilePath, O_RDWR | O_CREAT, 0600); ! if(lockFileFD<0) ! { ! return CreateLockFileFileError; ! } ! ! if(!blockOnLockFlag) ! { ! /* Acquire the shared advisory lock without blocking*/ ! ! success = AcquireLock(lockFileFD,false,false); ! if(success < 0) ! { ! /* We failed to acquire the read lock, which is unlikely because no one should be holding an exclusive lock on it. */ ! return CreateLockFileSharedLockAcquisitionError; ! } ! } ! else ! { ! /* loop until we get the exclusive lock and the subsequent shared lock- waiting until the data directory is not being serviced is data directory postgresql hot standby mode*/ ! while(1) ! { ! success = AcquireLock(lockFileFD,true,true); ! if(success < 0) ! return CreateLockFileExclusiveLockCheckError; ! !
/* now we hold the exclusive lock, so demote to read lock, but be wary of the race condition whereby a different postmaster could also be waiting to grab the read lock too */ ! success = ReleaseLock(lockFileFD); ! if(success < 0) ! return CreateLockFileExclusiveLockCheckError; ! ! success = AcquireLock(lockFileFD,false,false); ! if(success < 0) ! { ! /* d'oh- some other postmaster grabbed the exclusive lock in the meantime, so try again later */ ! pg_usleep(500L); ! continue; ! } ! else ! break; ! } ! } ! ! ! /* Determine if acquiring an exclusive (write) lock would be denied. If so, there is another postmaster or postgres child process running, so abort. */ ! pidHoldingLock = GetPIDHoldingLock(lockFileFD,true); ! if(pidHoldingLock < 0) ! { ! /* checking for a lock failed */ ! return CreateLockFileExclusiveLockCheckError; ! } ! else if(pidHoldingLock > 0) ! { ! /* there is another process holding the lock, so we must abort starting a new postmaster */ ! return pidHoldingLock; ! } ! /*no process would block the lock, so we are cleared for starting a new postmaster*/ ! WriteLockFileContents(lockFilePath,lockFileFD, ! true, ! getpid(), ! DataDir, ! (long)MyStartTime, ! PostPortNumber, #ifdef HAVE_UNIX_SOCKETS ! (*UnixSocketDir != '\0') ? UnixSocketDir : DEFAULT_PGSOCKET_DIR #else ! "" #endif ! ); ! /* There is no need to remove the lock file because the locks synchronize access, not the existence of the file. */ ! if(lockFileRetDescriptor != NULL) ! *lockFileRetDescriptor = lockFileFD; ! return CreateLockFileNoError; ! } ! static void WriteLockFileContents(char *lockFilePath,int lockFileFD,bool isPostmasterFlag,pid_t processPid,char *dataDirectoryPath,long startTime,int portNumber,char * socketDirectoryPath) ! { ! char writeBuffer[MAXPGPATH * 2 + 256]; ! snprintf(writeBuffer,sizeof(writeBuffer),"%d\n%s\n%ld\n%d\n%s\n", ! isPostmasterFlag ? (int) processPid : -((int) processPid), ! dataDirectoryPath, ! startTime, ! portNumber, ! socketDirectoryPath ! ); ! errno = 0; ! 
if (write(lockFileFD, writeBuffer, strlen(writeBuffer)) != strlen(writeBuffer)) ! { ! int save_errno = errno; ! unlink(lockFilePath); ! /* if write didn't set errno, assume problem is no disk space */ ! errno = save_errno ? save_errno : ENOSPC; ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not write lock file \"%s\": %m", lockFilePath))); ! } ! if (pg_fsync(lockFileFD) != 0) ! { ! int save_errno = errno; ! ! unlink(lockFilePath); ! errno = save_errno; ! ereport(FATAL, ! (errcode_for_file_access(), ! errmsg("could not write lock file \"%s\": %m", lockFilePath))); ! } ! return; ! } ! /* Called by pg_ctl to determine when the postmaster is shutdown. */ ! pid_t GetPIDHoldingDataDirLock(void) ! { ! return GetPIDHoldingLock(DataDirLockFileFD,true); } + /* ! * Called by backends when they startup to signify that the data directory is in use */ ! void AcquireDataDirLock(void) { ! int success; ! int exclusiveLockViolatingPID = 0; ! /* get the read lock */ ! success = AcquireLock(DataDirLockFileFD,false,false); ! if(success < 0) ! { ! /* Failed to acquire read lock, bomb out */ ! ereport(FATAL, ! (errmsg("failed to acquire lock on \"%s\": %m",DIRECTORY_LOCK_FILE))); ! ! } ! /* verify that grabbing an exclusive lock would complain that the parent or PROC_ARRAY sibling process would cause exclusive lock acquisition to fail- otherwise a separate postmaster is holding the lock (eliminates a possible race condition when the postmaster spawns a backend, immediately dies and new postmaster takes over) */ ! exclusiveLockViolatingPID = GetPIDHoldingLock(DataDirLockFileFD,true); ! if(exclusiveLockViolatingPID < 0) ! { ! /* error testing for the lock, very unlikely, but fatal */ ! ereport(FATAL, ! (errmsg("failed to test for lock on \"%s\": %m",DIRECTORY_LOCK_FILE))); ! } ! else if(exclusiveLockViolatingPID == 0) ! { ! 
/* the postmaster should be holding the lock- in this case it is not (and this is the only backend running), so don't bother running the backend because the postmaster just died */ ! ereport(FATAL, ! (errmsg("failed to initialize backend because the postmaster exited"))); ! } ! else ! { ! /* the PID is valid, so we should check that the PID refers either to the postmaster or its children */ ! /* NOTE TO REVIEWER: is this too early to call BackendPidGetProc? */ ! PGPROC *violatingProc = NULL; ! violatingProc = BackendPidGetProc(exclusiveLockViolatingPID); ! ! if(exclusiveLockViolatingPID != getppid() && ! violatingProc == NULL) ! { ! /* the violating lock is neither the postmaster nor a sibling child- data directory conflict detected! */ ! ereport(FATAL, ! (errmsg("backend startup race condition detected- another postmaster is running in this data directory"))); ! } ! } ! return; } /* *************** *** 966,977 **** CreateDataDirLockFile(bool amPostmaster) void CreateSocketLockFile(const char *socketfile, bool amPostmaster) { ! char lockfile[MAXPGPATH]; ! ! snprintf(lockfile, sizeof(lockfile), "%s.lock", socketfile); ! CreateLockFile(lockfile, amPostmaster, false, socketfile); ! /* Save name of lockfile for TouchSocketLockFile */ ! strcpy(socketLockFile, lockfile); } /* --- 891,939 ---- void CreateSocketLockFile(const char *socketfile, bool amPostmaster) { ! char lockFilePath[MAXPGPATH]; ! CreateLockFileValue error; ! ! snprintf(lockFilePath, sizeof(lockFilePath), "%s.lock", socketfile); ! ! /* This intentionally leaks the socket file descriptor- we hold onto it so that the lock is held until the process is exited */ ! error = CreateLockFile(lockFilePath, amPostmaster,NULL,false); ! ! if(error==CreateLockFileNoError) ! { ! return; ! } ! else if(error==CreateLockFileFileError) ! { ! ereport(FATAL, ! (errmsg("failed operation on lock file at \"%s\": %m",lockFilePath))); ! } ! else if(error==CreateLockFileSharedLockAcquisitionError) ! { ! ereport(FATAL, ! 
(errmsg("failed to acquire shared lock on file \"%s\": %m",lockFilePath))); ! } ! else if(error==CreateLockFileExclusiveLockCheckError) ! { ! ereport(FATAL, ! (errmsg("failed to check for exclusive lock on file \"%s\": %m", lockFilePath))); ! ! } ! else if(error>0) ! { ! /* error holds the pid of the conflicting process */ ! ereport(FATAL, ! (errmsg("another postgresql process with pid %d is bound to the socket file at \"%s\"",error,lockFilePath), ! errhint("configure a different socket file path in postgresql.conf or kill the conflicting postgresql server"))); ! } ! else ! { ! ereport(FATAL, ! (errmsg("an unhandled locking error occurred"))); ! } ! ! /* Save name of lockfile for TouchSocketLockFile */ ! strcpy(socketLockFile, lockFilePath); } /* *************** *** 985,1020 **** CreateSocketLockFile(const char *socketfile, bool amPostmaster) void TouchSocketLockFile(void) { ! /* Do nothing if we did not create a socket... */ ! if (socketLockFile[0] != '\0') ! { ! /* ! * utime() is POSIX standard, utimes() is a common alternative; if we ! * have neither, fall back to actually reading the file (which only ! * sets the access time not mod time, but that should be enough in ! * most cases). In all paths, we ignore errors. ! */ #ifdef HAVE_UTIME ! utime(socketLockFile, NULL); ! #else /* !HAVE_UTIME */ #ifdef HAVE_UTIMES ! utimes(socketLockFile, NULL); ! #else /* !HAVE_UTIMES */ ! int fd; ! char buffer[1]; ! ! fd = open(socketLockFile, O_RDONLY | PG_BINARY, 0); ! if (fd >= 0) ! { ! read(fd, buffer, sizeof(buffer)); ! close(fd); ! } #endif /* HAVE_UTIMES */ #endif /* HAVE_UTIME */ ! } } - /* * Add (or replace) a line in the data directory lock file. * The given string should not include a trailing newline. --- 947,980 ---- void TouchSocketLockFile(void) { ! /* Do nothing if we did not create a socket... */ ! if (socketLockFile[0] != '\0') ! { ! /* ! * utime() is POSIX standard, utimes() is a common alternative; if we ! 
* have neither, fall back to actually reading the file (which only ! * sets the access time not mod time, but that should be enough in ! * most cases). In all paths, we ignore errors. ! */ #ifdef HAVE_UTIME ! utime(socketLockFile, NULL); ! #else /* !HAVE_UTIME */ #ifdef HAVE_UTIMES ! utimes(socketLockFile, NULL); ! #else /* !HAVE_UTIMES */ ! int fd; ! char buffer[1]; ! ! fd = open(socketLockFile, O_RDONLY | PG_BINARY, 0); ! if (fd >= 0) ! { ! read(fd, buffer, sizeof(buffer)); ! } #endif /* HAVE_UTIMES */ #endif /* HAVE_UTIME */ ! } } /* * Add (or replace) a line in the data directory lock file. * The given string should not include a trailing newline. *************** *** 1030,1043 **** AddToDataDirLockFile(int target_line, const char *str) int lineno; char *ptr; char buffer[BLCKSZ]; ! fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0); ! if (fd < 0) { ereport(LOG, (errcode_for_file_access(), ! errmsg("could not open file \"%s\": %m", ! DIRECTORY_LOCK_FILE))); return; } len = read(fd, buffer, sizeof(buffer) - 1); --- 990,1006 ---- int lineno; char *ptr; char buffer[BLCKSZ]; + int success; ! fd = DataDirLockFileFD; ! /* rewind the file handle to rewrite it */ ! success = lseek(fd,0,SEEK_SET); ! if (success < 0) { ereport(LOG, (errcode_for_file_access(), ! errmsg("could not seek lock file \"%s\": %m", !
DIRECTORY_LOCK_FILE))); return; } len = read(fd, buffer, sizeof(buffer) - 1); *************** *** 1047,1053 **** AddToDataDirLockFile(int target_line, const char *str) (errcode_for_file_access(), errmsg("could not read from file \"%s\": %m", DIRECTORY_LOCK_FILE))); - close(fd); return; } buffer[len] = '\0'; --- 1010,1015 ---- *************** *** 1061,1067 **** AddToDataDirLockFile(int target_line, const char *str) if ((ptr = strchr(ptr, '\n')) == NULL) { elog(LOG, "bogus data in \"%s\"", DIRECTORY_LOCK_FILE); - close(fd); return; } ptr++; --- 1023,1028 ---- *************** *** 1088,1094 **** AddToDataDirLockFile(int target_line, const char *str) (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", DIRECTORY_LOCK_FILE))); - close(fd); return; } if (pg_fsync(fd) != 0) --- 1049,1054 ---- *************** *** 1098,1110 **** AddToDataDirLockFile(int target_line, const char *str) errmsg("could not write to file \"%s\": %m", DIRECTORY_LOCK_FILE))); } - if (close(fd) != 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", - DIRECTORY_LOCK_FILE))); - } } --- 1058,1063 ---- *************** *** 1300,1302 **** pg_bindtextdomain(const char *domain) --- 1253,1300 ---- } #endif } + + /* We can also offer the option to block until the other postmaster is cleared away using F_SETLKW */ + static int AcquireLock(int fileDescriptor,bool exclusiveLockFlag,bool waitForLock) + { + struct flock lock = { + .l_type = exclusiveLockFlag ? F_WRLCK : F_RDLCK, + .l_start = 0, + .l_whence = SEEK_SET, + .l_len = 100 + }; + return fcntl(fileDescriptor , waitForLock ? 
F_SETLKW : F_SETLK, &lock); + } + + static int ReleaseLock(int fileDescriptor) + { + struct flock lock = { + .l_type = F_UNLCK, + .l_start = 0, + .l_whence = SEEK_SET, + .l_len = 100 + }; + return fcntl(fileDescriptor, F_SETLK, &lock); + } + + static pid_t GetPIDHoldingLock(int fileDescriptor,bool exclusiveLockFlag) + { + struct flock lock = { + .l_type = exclusiveLockFlag ? F_WRLCK : F_RDLCK, + .l_start = 0, + .l_whence = SEEK_SET, + .l_len = 100, + .l_pid = 0 + }; + int success; + + success = fcntl(fileDescriptor,F_GETLK,&lock); + if(success < 0) + { + return (pid_t)success; + } + if(lock.l_whence == SEEK_SET) + return lock.l_pid; + else + return -1; + } *** a/src/bin/pg_ctl/pg_ctl.c --- b/src/bin/pg_ctl/pg_ctl.c *************** *** 26,31 **** --- 26,32 ---- #include #include #include + #include #ifdef HAVE_SYS_RESOURCE_H #include *************** *** 271,297 **** get_pgpid(void) { FILE *pidf; long pid; pidf = fopen(pid_file, "r"); ! if (pidf == NULL) ! { ! /* No pid file, not an error on startup */ ! if (errno == ENOENT) ! return 0; ! else ! { ! write_stderr(_("%s: could not open PID file \"%s\": %s\n"), ! progname, pid_file, strerror(errno)); ! exit(1); ! } ! } ! if (fscanf(pidf, "%ld", &pid) != 1) ! { ! write_stderr(_("%s: invalid data in PID file \"%s\"\n"), ! progname, pid_file); ! exit(1); ! } fclose(pidf); return (pgpid_t) pid; } --- 272,320 ---- { FILE *pidf; long pid; + struct flock lock = { + .l_type = F_WRLCK, + .l_start = 0, + .l_whence = SEEK_SET, + .l_len = 100, + .l_pid = 0 + }; + int success; pidf = fopen(pid_file, "r"); ! ! /* Attempt to acquire an exclusive lock. If that fails, we know that an existing backend is still holding a read lock. See src/backend/utils/init/miscinit.c for more details. */ ! if(pidf == NULL) ! { ! if(errno == ENOENT) ! { ! /* No lock file found. */ ! return 0; ! } ! else ! { ! write_stderr(_("%s: could not open PID file \"%s\": %s\n"), ! progname,pid_file,strerror(errno)); ! exit(1); ! } ! } ! 
success = fcntl(fileno(pidf),F_GETLK,&lock); fclose(pidf); + if(success < 0) + { + /* Failed syscall */ + write_stderr(_("%s: failed to test lock status: %s"),progname,strerror(errno)); + exit(1); + } + if(lock.l_whence == SEEK_SET) + /* There is a pid holding a lock. */ + return lock.l_pid; + else if(lock.l_type == F_UNLCK) + /* No lock would block exclusive access. */ + return 0; + else + return -1; + return (pgpid_t) pid; } *** a/src/include/miscadmin.h --- b/src/include/miscadmin.h *************** *** 373,379 **** extern char *local_preload_libraries_string; #define LOCK_FILE_LINE_LISTEN_ADDR 6 #define LOCK_FILE_LINE_SHMEM_KEY 7 ! extern void CreateDataDirLockFile(bool amPostmaster); extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster); extern void TouchSocketLockFile(void); extern void AddToDataDirLockFile(int target_line, const char *str); --- 373,381 ---- #define LOCK_FILE_LINE_LISTEN_ADDR 6 #define LOCK_FILE_LINE_SHMEM_KEY 7 ! extern void CreateDataDirLockFile(bool amPostmaster,bool blockOptionFlag); ! extern void AcquireDataDirLock(void); ! extern pid_t GetPIDHoldingDataDirLock(void); extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster); extern void TouchSocketLockFile(void); extern void AddToDataDirLockFile(int target_line, const char *str);