COMMIT NOWAIT Performance Option (patch)
A prototype patch is posted to -patches, which is WORK IN PROGRESS.
[This patch matches discussion thread on -hackers.]
The following TODO items remain
1. discuss which process will issue regular XLogFlush(). If agreed,
implement WALWriter process to perform this task. (Yes, the patch isn't
fully implemented, yet).
2. remove fsync parameter
3. Prevent COMMIT NOWAIT when commit_delay = 0
4. Discuss whether commit_delay is OK to usurp; it was just an earlier
suggestion from someone else, can go either way.
5. docs
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
Attachments:
commit_nowait.wip1.patch (text/x-patch; charset=UTF-8; name=commit_nowait.wip1.patch) [Download]
Index: src/backend/access/transam/xact.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/xact.c,v
retrieving revision 1.234
diff -c -r1.234 xact.c
*** src/backend/access/transam/xact.c 9 Feb 2007 03:35:33 -0000 1.234
--- src/backend/access/transam/xact.c 26 Feb 2007 22:04:44 -0000
***************
*** 58,63 ****
--- 58,66 ----
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
+ bool DefaultXactCommitWait = true;
+ bool XactCommitWait = true;
+
/*
* transaction states - transaction state from server perspective
***************
*** 789,795 ****
* Note: if we generated a commit record above, MyXactMadeXLogEntry
* will certainly be set now.
*/
! if (MyXactMadeXLogEntry)
{
/*
* Sleep before flush! So we can flush more than one commit
--- 792,798 ----
* Note: if we generated a commit record above, MyXactMadeXLogEntry
* will certainly be set now.
*/
! if (MyXactMadeXLogEntry && XactCommitWait)
{
/*
* Sleep before flush! So we can flush more than one commit
Index: src/backend/parser/gram.y
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/parser/gram.y,v
retrieving revision 2.580
diff -c -r2.580 gram.y
*** src/backend/parser/gram.y 20 Feb 2007 17:32:16 -0000 2.580
--- src/backend/parser/gram.y 26 Feb 2007 22:05:01 -0000
***************
*** 4813,4818 ****
--- 4813,4819 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_ROLLBACK;
n->options = NIL;
+ n->wait = true;
$$ = (Node *)n;
}
| BEGIN_P opt_transaction transaction_mode_list_or_empty
***************
*** 4820,4825 ****
--- 4821,4827 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_BEGIN;
n->options = $3;
+ n->wait = true;
$$ = (Node *)n;
}
| START TRANSACTION transaction_mode_list_or_empty
***************
*** 4827,4832 ****
--- 4829,4835 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_START;
n->options = $3;
+ n->wait = true;
$$ = (Node *)n;
}
| COMMIT opt_transaction
***************
*** 4834,4839 ****
--- 4837,4851 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_COMMIT;
n->options = NIL;
+ n->wait = true;
+ $$ = (Node *)n;
+ }
+ | COMMIT opt_transaction_write NOWAIT opt_transaction_immed
+ {
+ TransactionStmt *n = makeNode(TransactionStmt);
+ n->kind = TRANS_STMT_COMMIT;
+ n->options = NIL;
+ n->wait = false;
$$ = (Node *)n;
}
| END_P opt_transaction
***************
*** 4841,4846 ****
--- 4853,4859 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_COMMIT;
n->options = NIL;
+ n->wait = true;
$$ = (Node *)n;
}
| ROLLBACK opt_transaction
***************
*** 4848,4853 ****
--- 4861,4867 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_ROLLBACK;
n->options = NIL;
+ n->wait = true;
$$ = (Node *)n;
}
| SAVEPOINT ColId
***************
*** 4856,4861 ****
--- 4870,4876 ----
n->kind = TRANS_STMT_SAVEPOINT;
n->options = list_make1(makeDefElem("savepoint_name",
(Node *)makeString($2)));
+ n->wait = true;
$$ = (Node *)n;
}
| RELEASE SAVEPOINT ColId
***************
*** 4864,4869 ****
--- 4879,4885 ----
n->kind = TRANS_STMT_RELEASE;
n->options = list_make1(makeDefElem("savepoint_name",
(Node *)makeString($3)));
+ n->wait = true;
$$ = (Node *)n;
}
| RELEASE ColId
***************
*** 4872,4877 ****
--- 4888,4894 ----
n->kind = TRANS_STMT_RELEASE;
n->options = list_make1(makeDefElem("savepoint_name",
(Node *)makeString($2)));
+ n->wait = true;
$$ = (Node *)n;
}
| ROLLBACK opt_transaction TO SAVEPOINT ColId
***************
*** 4880,4885 ****
--- 4897,4903 ----
n->kind = TRANS_STMT_ROLLBACK_TO;
n->options = list_make1(makeDefElem("savepoint_name",
(Node *)makeString($5)));
+ n->wait = true;
$$ = (Node *)n;
}
| ROLLBACK opt_transaction TO ColId
***************
*** 4888,4893 ****
--- 4906,4912 ----
n->kind = TRANS_STMT_ROLLBACK_TO;
n->options = list_make1(makeDefElem("savepoint_name",
(Node *)makeString($4)));
+ n->wait = true;
$$ = (Node *)n;
}
| PREPARE TRANSACTION Sconst
***************
*** 4895,4900 ****
--- 4914,4920 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_PREPARE;
n->gid = $3;
+ n->wait = true;
$$ = (Node *)n;
}
| COMMIT PREPARED Sconst
***************
*** 4902,4907 ****
--- 4922,4928 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_COMMIT_PREPARED;
n->gid = $3;
+ n->wait = true;
$$ = (Node *)n;
}
| ROLLBACK PREPARED Sconst
***************
*** 4909,4914 ****
--- 4930,4936 ----
TransactionStmt *n = makeNode(TransactionStmt);
n->kind = TRANS_STMT_ROLLBACK_PREPARED;
n->gid = $3;
+ n->wait = true;
$$ = (Node *)n;
}
;
***************
*** 4918,4923 ****
--- 4940,4955 ----
| /*EMPTY*/ {}
;
+ opt_transaction_write:
+ WRITE {}
+ | /* EMPTY */ {}
+ ;
+
+ opt_transaction_immed:
+ IMMEDIATE {}
+ | /* EMPTY */ {}
+ ;
+
transaction_mode_item:
ISOLATION LEVEL iso_level
{ $$ = makeDefElem("transaction_isolation",
Index: src/backend/tcop/utility.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/tcop/utility.c,v
retrieving revision 1.273
diff -c -r1.273 utility.c
*** src/backend/tcop/utility.c 20 Feb 2007 17:32:16 -0000 1.273
--- src/backend/tcop/utility.c 26 Feb 2007 22:05:12 -0000
***************
*** 432,437 ****
--- 432,444 ----
if (completionTag)
strcpy(completionTag, "ROLLBACK");
}
+ else
+ {
+ if (stmt->wait)
+ XactCommitWait = DefaultXactCommitWait;
+ else
+ XactCommitWait = false;
+ }
break;
case TRANS_STMT_PREPARE:
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.377
diff -c -r1.377 guc.c
*** src/backend/utils/misc/guc.c 23 Feb 2007 21:36:18 -0000 1.377
--- src/backend/utils/misc/guc.c 26 Feb 2007 22:05:21 -0000
***************
*** 871,876 ****
--- 871,884 ----
true, assign_phony_autocommit, NULL
},
{
+ {"commit_wait_default", PGC_USERSET, WAL_SETTINGS,
+ gettext_noop("Sets the default of wait-for-commit."),
+ NULL
+ },
+ &DefaultXactCommitWait,
+ true, NULL, NULL
+ },
+ {
{"default_transaction_read_only", PGC_USERSET, CLIENT_CONN_STATEMENT,
gettext_noop("Sets the default read-only status of new transactions."),
NULL
Index: src/backend/utils/misc/postgresql.conf.sample
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.209
diff -c -r1.209 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample 16 Feb 2007 17:07:00 -0000 1.209
--- src/backend/utils/misc/postgresql.conf.sample 26 Feb 2007 22:05:22 -0000
***************
*** 161,166 ****
--- 161,167 ----
#full_page_writes = on # recover from partial page writes
#wal_buffers = 64kB # min 32kB
# (change requires restart)
+ #commit_wait_default = on # default wait-at-commit
#commit_delay = 0 # range 0-100000, in microseconds
#commit_siblings = 5 # range 1-1000
Index: src/include/access/xact.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/xact.h,v
retrieving revision 1.84
diff -c -r1.84 xact.h
*** src/include/access/xact.h 5 Jan 2007 22:19:51 -0000 1.84
--- src/include/access/xact.h 26 Feb 2007 22:05:28 -0000
***************
*** 41,46 ****
--- 41,49 ----
extern bool DefaultXactReadOnly;
extern bool XactReadOnly;
+ extern bool DefaultXactCommitWait;
+ extern bool XactCommitWait;
+
/*
* start- and end-of-transaction callbacks for dynamically loaded modules
*/
Index: src/include/nodes/parsenodes.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/nodes/parsenodes.h,v
retrieving revision 1.341
diff -c -r1.341 parsenodes.h
*** src/include/nodes/parsenodes.h 20 Feb 2007 17:32:17 -0000 1.341
--- src/include/nodes/parsenodes.h 26 Feb 2007 22:05:30 -0000
***************
*** 1689,1694 ****
--- 1689,1695 ----
NodeTag type;
TransactionStmtKind kind; /* see above */
List *options; /* for BEGIN/START and savepoint commands */
+ bool wait; /* explicit NOWAIT or use default wait-at-commit */
char *gid; /* for two-phase-commit related commands */
} TransactionStmt;
"Simon Riggs" <simon@2ndquadrant.com> writes:
A prototype patch is posted to -patches, which is WORK IN PROGRESS.
[This patch matches discussion thread on -hackers.]
What does this accomplish other than adding syntactic sugar over a
feature that really doesn't work well anyway? I don't see any point
in encouraging people to use commit_delay in its present form. If we
had a portable solution for millisecond-or-so waits then maybe it would
work ...
regards, tom lane
On Mon, 2007-02-26 at 18:14 -0500, Tom Lane wrote:
"Simon Riggs" <simon@2ndquadrant.com> writes:
A prototype patch is posted to -patches, which is WORK IN PROGRESS.
[This patch matches discussion thread on -hackers.]
What does this accomplish other than adding syntactic sugar over a
feature that really doesn't work well anyway? I don't see any point
in encouraging people to use commit_delay in its present form. If we
had a portable solution for millisecond-or-so waits then maybe it would
work ...
This patch doesn't intend to implement group commit. I've changed the
meaning of commit_delay, sorry if that confuses.
You and I discussed this in Toronto actually, IIRC. The best way to
describe this proposal is deferred fsync, so perhaps a different
parameter commit_fsync_delay would be more appropriate.
Bruce has requested this feature many times from me, so I thought it
about time to publish.
The key point is that COMMIT NOWAIT doesn't wait for group commit to
return, it just doesn't wait at all - leaving someone else to flush WAL.
It's much better than fsync=off.
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
"Simon Riggs" <simon@2ndquadrant.com> writes:
On Mon, 2007-02-26 at 18:14 -0500, Tom Lane wrote:
What does this accomplish other than adding syntactic sugar over a
feature that really doesn't work well anyway?
This patch doesn't intend to implement group commit. I've changed the
meaning of commit_delay, sorry if that confuses.
Ah. The patch was pretty much unintelligible without the discussion
(which got here considerably later :-(). I've still got misgivings
about how safe it really is, but at least this is better than what
commit_delay wishes it could do.
regards, tom lane
On Mon, 2007-02-26 at 23:07 -0500, Tom Lane wrote:
"Simon Riggs" <simon@2ndquadrant.com> writes:
On Mon, 2007-02-26 at 18:14 -0500, Tom Lane wrote:
What does this accomplish other than adding syntactic sugar over a
feature that really doesn't work well anyway?
This patch doesn't intend to implement group commit. I've changed the
meaning of commit_delay, sorry if that confuses.
Ah. The patch was pretty much unintelligible without the discussion
(which got here considerably later :-(). I've still got misgivings
about how safe it really is, but at least this is better than what
commit_delay wishes it could do.
Latest WIP version of patch now ready for performance testing.
Applies cleanly to CVS HEAD, with two additional files:
src/backend/postmaster/walwriter.c
src/include/postmaster/walwriter.h
Patch passes make installcheck in these cases
- no options set
- wal_writer_delay = 100000
- wal_writer_delay = 100000 and transaction_guarantee = off for all
transactions by default in postgresql.conf.
Normal checkpoints and restarts work without problem after these runs.
What this patch does
--------------------
Implements unguaranteed transactions, which skip the XLogFlush step when
they commit. The flush point is updated in shared memory so that a
separate WAL writer process will perform the flush each time it cycles.
These parameters control this behaviour
transaction_guarantee = on (default) | off USERSET
wal_writer_delay = 0 (default, ==off) SIGHUP
log_transaction_guarantee = on (default) | off SIGHUP
(the default for this would be off in later production version)
WAL writer will start/stop when wal_writer_delay is non-zero/zero.
Unguaranteed transactions are only allowed for
- Execute message
- Fastpath message
- Sync message
- simple query implicit-commit-at-end and explicit COMMITs
All other transaction commits will always use guaranteed commit path.
These include things like VACUUM, various DDL and about a dozen other
places that execute commits. The abort path is never fast in any case.
In addition, any transaction that is deleting files follows guaranteed
commit path, however it was requested.
The interlock between commits and checkpoints is maintained. After the
CheckpointStartLock has been gained by bgwriter, all unguaranteed
transactions are flushed.
(In addition the fsync GUC has been removed from postgresql.conf.sample,
but not actually removed. If this patch goes ahead, I suggest we
deprecate it for one release then remove it next...)
What this patch doesn't do yet
------------------------------
Crash recovery does not yet work, but can be made to do so with TODO
items (1) and (2) below.
1. The interlock between buffer manager and WAL is maintained, but not
sufficiently to avoid problems in all cases. Specifically, commit hint
bits must not be written to disk ahead of a transaction commit.
Two approaches are possible
1. avoid setting the hint bits for unguaranteed transactions
2. set the hint bits *and* update the LSN of the page to be the LSN of
the unguaranteed transaction for which we are setting the hint bits.
Either way, we need to maintain a list of unguaranteed transactions in
shared memory that can be accessed when hint bits are set. The list
would need to contain the Xid and the LSN of each unguaranteed
transaction. This would necessitate keeping the list of unguaranteed
transactions fairly small, so some care is required to ensure this. That
can be achieved by keeping commit_fsync_delay small or putting in a
trigger point at which a wannabe unguaranteed transaction is forced to
flush WAL instead. Some testing has shown that committing every 8
transactions gives a considerable leap in performance in many cases.
2. As originally discussed, during crash recovery any in-flight
transactions would need to be explicitly aborted in clog, to override
the possibility that an unguaranteed transaction would have been marked
committed. An alternative would be to flush all unguaranteed
transactions prior to flushing dirty clog and multitrans pages. That
could be achieved by keeping the LSN of the last write to those pages
and performing XLogFlush up to that LSN when we write dirty pages. I'm
leaning towards the new alternative version now, since it's cleaner and
it fits better with the way the rest of the server works.
3. WAL Writer could be used for various additional tasks, such as doing
the WAL cache-half-filled check. Those options have been ignored until
now, to avoid complicating discussion and review.
4. We probably need more padding in XLogCtlData to ensure that data
protected by WALInsertLock, WALWriteLock and info_lck are in separate
cache lines to avoid CPU false sharing. That should be done whether or
not this patch goes ahead.
Tests, reviews and comments please?
--
Simon Riggs
EnterpriseDB http://www.enterprisedb.com
Attachments:
transaction_guarantee.v6.patch (text/x-patch; charset=UTF-8; name=transaction_guarantee.v6.patch) [Download]
Index: src/backend/access/transam/xact.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/xact.c,v
retrieving revision 1.234
diff -c -r1.234 xact.c
*** src/backend/access/transam/xact.c 9 Feb 2007 03:35:33 -0000 1.234
--- src/backend/access/transam/xact.c 11 Mar 2007 22:37:09 -0000
***************
*** 58,63 ****
--- 58,66 ----
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
+ bool DefaultXactCommitGuarantee = true; /* USERSET GUC: what user wants */
+ static bool XactCommitGuarantee = true; /* the guarantee for this Xid? */
+ bool log_transaction_guarantee = true;
/*
* transaction states - transaction state from server perspective
***************
*** 710,715 ****
--- 713,719 ----
TransactionId xid = GetCurrentTransactionId();
bool madeTCentries;
XLogRecPtr recptr;
+ bool unsafe = false;
/* Tell bufmgr and smgr to prepare for commit */
BufmgrCommit();
***************
*** 792,812 ****
if (MyXactMadeXLogEntry)
{
/*
! * Sleep before flush! So we can flush more than one commit
! * records per single fsync. (The idea is some other backend may
! * do the XLogFlush while we're sleeping. This needs work still,
! * because on most Unixen, the minimum select() delay is 10msec or
! * more, which is way too long.)
! *
! * We do not sleep if enableFsync is not turned on, nor if there
! * are fewer than CommitSiblings other backends with active
! * transactions.
! */
! if (CommitDelay > 0 && enableFsync &&
! CountActiveBackends() >= CommitSiblings)
! pg_usleep(CommitDelay);
! XLogFlush(recptr);
}
/*
--- 796,830 ----
if (MyXactMadeXLogEntry)
{
/*
! * If we have chosen to use unguaranteed transactions and we're
! * not doing cleanup of any rels, then we can defer fsync.
! * The WAL writer acts to minimise the window of data loss,
! * and we rely on it to flush WAL soon, but not precisely now.
! */
! if (XactCommitGuarantee || nrels > 0)
! {
! /*
! * Sleep before flush! So we can flush more than one commit
! * records per single fsync. (The idea is some other backend may
! * do the XLogFlush while we're sleeping. This needs work still,
! * because on most Unixen, the minimum select() delay is 10msec or
! * more, which is way too long.)
! *
! * We do not sleep if enableFsync is not turned on, nor if there
! * are fewer than CommitSiblings other backends with active
! * transactions.
! */
! if (CommitDelay > 0 && enableFsync &&
! CountActiveBackends() >= CommitSiblings)
! pg_usleep(CommitDelay);
! XLogFlush(recptr);
! }
! else
! {
! unsafe = true;
! XLogDeferredFlush(recptr);
! }
}
/*
***************
*** 830,835 ****
--- 848,858 ----
LWLockRelease(CheckpointStartLock);
END_CRIT_SECTION();
+
+ if (log_transaction_guarantee && madeTCentries && WALWriterActive())
+ elog(LOG,"COMMIT %s insert %X/%X",
+ (XactCommitGuarantee ? " safe" : "unsafe"),
+ recptr.xlogid, recptr.xrecoff);
}
/* Break the chain of back-links in the XLOG records I output */
***************
*** 1388,1393 ****
--- 1411,1417 ----
FreeXactSnapshot();
XactIsoLevel = DefaultXactIsoLevel;
XactReadOnly = DefaultXactReadOnly;
+ SetXactCommitGuarantee(true);
/*
* reinitialize within-transaction counters
***************
*** 4092,4097 ****
--- 4116,4127 ----
return "UNRECOGNIZED";
}
+ void
+ SetXactCommitGuarantee(bool RequestedXactCommitGuarantee)
+ {
+ XactCommitGuarantee = RequestedXactCommitGuarantee; /* file-local flag read at commit: true => XLogFlush(), false => deferred flush unless nrels > 0 */
+ }
+
/*
* xactGetCommittedChildren
*
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.265
diff -c -r1.265 xlog.c
*** src/backend/access/transam/xlog.c 3 Mar 2007 20:02:26 -0000 1.265
--- src/backend/access/transam/xlog.c 11 Mar 2007 22:37:15 -0000
***************
*** 301,306 ****
--- 301,309 ----
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
+ /* Protected by commit_lck: */
+ XLogwrtRqst CommitLogwrtRqst;
+
/*
* These values do not change after startup, although the pointed-to pages
* and xlblocks values certainly do. Permission to read/write the pages
***************
*** 313,318 ****
--- 316,322 ----
TimeLineID ThisTimeLineID;
slock_t info_lck; /* locks shared variables shown above */
+ slock_t commit_lck; /* deferred commit lock */
} XLogCtlData;
static XLogCtlData *XLogCtl = NULL;
***************
*** 1787,1792 ****
--- 1791,1851 ----
}
/*
+ * XLogDeferredFlush
+ *
+ * Keep track of deferred flush requests by unguaranteed transaction commits
+ *
+ * RecPtr is the WAL position that the caller wants flushed eventually.
+ * The shared deferred commit request pointer is only ever advanced, never
+ * moved backwards, so it always covers the furthest position requested
+ * so far. Nothing is written or flushed by this function itself.
+ */
+ void
+ XLogDeferredFlush(XLogRecPtr RecPtr)
+ {
+ /*
+ * Update the deferred commit request pointer, if required, then
+ * return quickly so we can do some other useful work
+ */
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->commit_lck);
+ /* advance only: act when RecPtr is strictly beyond the stored pointer */
+ if (!XLByteLE(RecPtr, xlogctl->CommitLogwrtRqst.Write))
+ xlogctl->CommitLogwrtRqst.Write = RecPtr;
+ SpinLockRelease(&xlogctl->commit_lck);
+ }
+
+ /* Note that there is *no* XLogFlush() here, by design */
+ }
+
+ /*
+ * XLogBackgroundFlush
+ *
+ * Flush as far as the deferred commit request pointer, so that all
+ * unguaranteed commits whose positions were recorded in that pointer
+ * before we read it are known flushed after this returns.
+ *
+ * Takes no arguments and returns nothing; the target position is read
+ * from shared memory (XLogCtl->CommitLogwrtRqst.Write) under commit_lck.
+ *
+ * If it hasn't changed or a normal commit has flushed past our pointer
+ * we will exit quickly from XLogFlush(), so no extra code here
+ * (NOTE(review): relies on XLogFlush() returning early for an
+ * already-flushed position -- confirm against XLogFlush itself).
+ */
+ void
+ XLogBackgroundFlush(void)
+ {
+ XLogRecPtr RecPtr;
+
+ /*
+ * Get the current deferred commit request pointer,
+ * don't worry about keeping local state information.
+ * Only the spinlock-protected copy is used below, so the lock is
+ * released before the flush is performed.
+ */
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->commit_lck);
+ RecPtr = xlogctl->CommitLogwrtRqst.Write;
+ SpinLockRelease(&xlogctl->commit_lck);
+ }
+
+ XLogFlush(RecPtr);
+ }
+
+ /*
* Create a new XLOG file segment, or open a pre-existing one.
*
* log, seg: identify segment to be created/opened.
***************
*** 3985,3990 ****
--- 4044,4050 ----
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
SpinLockInit(&XLogCtl->info_lck);
+ SpinLockInit(&XLogCtl->commit_lck);
/*
* If we are not in bootstrap mode, pg_control should already exist. Read
***************
*** 4998,5003 ****
--- 5058,5065 ----
XLogCtl->LogwrtRqst.Write = EndOfLog;
XLogCtl->LogwrtRqst.Flush = EndOfLog;
+ XLogCtl->CommitLogwrtRqst.Write = EndOfLog;
+ XLogCtl->CommitLogwrtRqst.Flush = EndOfLog;
freespace = INSERT_FREESPACE(Insert);
if (freespace > 0)
***************
*** 5389,5394 ****
--- 5451,5463 ----
*/
LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE);
+ /*
+ * Now confirm that all unguaranteed transactions are written to WAL
+ * before we proceed further. This may require WALWriteLock and possibly
+ * WALInsertLock if we need to flush.
+ */
+ XLogBackgroundFlush();
+
/* And we need WALInsertLock too */
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
Index: src/backend/postmaster/Makefile
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/postmaster/Makefile,v
retrieving revision 1.22
diff -c -r1.22 Makefile
*** src/backend/postmaster/Makefile 20 Jan 2007 17:16:12 -0000 1.22
--- src/backend/postmaster/Makefile 11 Mar 2007 22:37:17 -0000
***************
*** 12,18 ****
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
! OBJS = bgwriter.o autovacuum.o pgarch.o pgstat.o postmaster.o syslogger.o \
fork_process.o
all: SUBSYS.o
--- 12,18 ----
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
! OBJS = bgwriter.o walwriter.o autovacuum.o pgarch.o pgstat.o postmaster.o syslogger.o \
fork_process.o
all: SUBSYS.o
Index: src/backend/postmaster/postmaster.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/postmaster/postmaster.c,v
retrieving revision 1.526
diff -c -r1.526 postmaster.c
*** src/backend/postmaster/postmaster.c 7 Mar 2007 13:35:02 -0000 1.526
--- src/backend/postmaster/postmaster.c 11 Mar 2007 22:37:21 -0000
***************
*** 107,112 ****
--- 107,113 ----
#include "postmaster/pgarch.h"
#include "postmaster/postmaster.h"
#include "postmaster/syslogger.h"
+ #include "postmaster/walwriter.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"
***************
*** 201,206 ****
--- 202,208 ----
/* PIDs of special child processes; 0 when not running */
static pid_t StartupPID = 0,
BgWriterPID = 0,
+ WALWriterPID = 0,
AutoVacPID = 0,
PgArchPID = 0,
PgStatPID = 0;
***************
*** 907,913 ****
* CAUTION: when changing this list, check for side-effects on the signal
* handling setup of child processes. See tcop/postgres.c,
* bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/autovacuum.c,
! * postmaster/pgarch.c, postmaster/pgstat.c, and postmaster/syslogger.c.
*/
pqinitmask();
PG_SETMASK(&BlockSig);
--- 909,916 ----
* CAUTION: when changing this list, check for side-effects on the signal
* handling setup of child processes. See tcop/postgres.c,
* bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/autovacuum.c,
! * postmaster/pgarch.c, postmaster/pgstat.c, postmaster/syslogger.c
! * and postmaster/walwriter.c
*/
pqinitmask();
PG_SETMASK(&BlockSig);
***************
*** 1250,1255 ****
--- 1253,1263 ----
start_autovac_launcher = false; /* signal successfully processed */
}
+ /* If we have lost the WAL writer, try to start a new one */
+ if (WALWriterActive() && WALWriterPID == 0 &&
+ StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
+ WALWriterPID = StartWALWriter();
+
/* If we have lost the archiver, try to start a new one */
if (XLogArchivingActive() && PgArchPID == 0 &&
StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
***************
*** 1822,1827 ****
--- 1830,1837 ----
signal_child(BgWriterPID, SIGHUP);
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGHUP);
+ if (WALWriterPID != 0)
+ signal_child(WALWriterPID, SIGHUP);
if (PgArchPID != 0)
signal_child(PgArchPID, SIGHUP);
if (SysLoggerPID != 0)
***************
*** 1891,1896 ****
--- 1901,1909 ----
/* And tell it to shut down */
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGUSR2);
+ /* Tell WALWriter to shut down too; nothing left for it to do */
+ if (WALWriterPID != 0)
+ signal_child(WALWriterPID, SIGQUIT);
/* Tell pgarch to shut down too; nothing left for it to do */
if (PgArchPID != 0)
signal_child(PgArchPID, SIGQUIT);
***************
*** 1947,1952 ****
--- 1960,1968 ----
/* And tell it to shut down */
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGUSR2);
+ /* Tell WALWriter to shut down too; nothing left for it to do */
+ if (WALWriterPID != 0)
+ signal_child(WALWriterPID, SIGQUIT);
/* Tell pgarch to shut down too; nothing left for it to do */
if (PgArchPID != 0)
signal_child(PgArchPID, SIGQUIT);
***************
*** 1972,1977 ****
--- 1988,1995 ----
signal_child(StartupPID, SIGQUIT);
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGQUIT);
+ if (WALWriterPID != 0)
+ signal_child(WALWriterPID, SIGQUIT);
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGQUIT);
if (PgArchPID != 0)
***************
*** 2070,2077 ****
/*
* Go to shutdown mode if a shutdown request was pending.
! * Otherwise, try to start the archiver, stats collector and
! * autovacuum launcher.
*/
if (Shutdown > NoShutdown && BgWriterPID != 0)
signal_child(BgWriterPID, SIGUSR2);
--- 2088,2095 ----
/*
* Go to shutdown mode if a shutdown request was pending.
! * Otherwise, try to start the archiver, stats collector,
! * autovacuum launcher and WALWriter.
*/
if (Shutdown > NoShutdown && BgWriterPID != 0)
signal_child(BgWriterPID, SIGUSR2);
***************
*** 2081,2086 ****
--- 2099,2106 ----
PgArchPID = pgarch_start();
if (PgStatPID == 0)
PgStatPID = pgstat_start();
+ if (WALWriterPID == 0)
+ WALWriterPID = StartWALWriter();
if (AutoVacuumingActive() && AutoVacPID == 0)
AutoVacPID = StartAutoVacLauncher();
***************
*** 2141,2146 ****
--- 2161,2180 ----
}
/*
+ * Was it the WALWriter? Normal exit can be ignored; we'll
+ * start a new one at the next iteration of the postmaster's main loop,
+ * if necessary. Any other exit condition is treated as a crash.
+ */
+ if (WALWriterPID != 0 && pid == WALWriterPID)
+ {
+ WALWriterPID = 0;
+ if (!EXIT_STATUS_0(exitstatus))
+ HandleChildCrash(pid, exitstatus,
+ _("WALWriter process"));
+ continue;
+ }
+
+ /*
* Was it the autovacuum launcher? Normal exit can be ignored; we'll
* start a new one at the next iteration of the postmaster's main loop,
* if necessary. Any other exit condition is treated as a crash.
***************
*** 2236,2241 ****
--- 2270,2278 ----
/* And tell it to shut down */
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGUSR2);
+ /* Tell WALWriter to shut down too; nothing left for it to do */
+ if (WALWriterPID != 0)
+ signal_child(WALWriterPID, SIGQUIT);
/* Tell pgarch to shut down too; nothing left for it to do */
if (PgArchPID != 0)
signal_child(PgArchPID, SIGQUIT);
***************
*** 2384,2389 ****
--- 2421,2437 ----
signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
}
+ /* Force a power-cycle of the WALWriter process too */
+ /* (Shouldn't be necessary, but just for luck) */
+ if (WALWriterPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ "SIGQUIT",
+ (int) WALWriterPID)));
+ signal_child(WALWriterPID, SIGQUIT);
+ }
+
/* Force a power-cycle of the pgarch process too */
/* (Shouldn't be necessary, but just for luck) */
if (PgArchPID != 0 && !FatalError)
***************
*** 3475,3480 ****
--- 3523,3545 ----
AutoVacWorkerMain(argc - 2, argv + 2);
proc_exit(0);
}
+ if (strcmp(argv[1], "--forkwalwriter") == 0)
+ {
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Restore basic shared memory pointers */
+ InitShmemAccess(UsedShmemSegAddr);
+
+ /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */
+ InitProcess();
+
+ /* Attach process to shared data structures */
+ CreateSharedMemoryAndSemaphores(false, 0);
+
+ WALWriterMain(argc, argv);
+ proc_exit(0);
+ }
if (strcmp(argv[1], "--forkarch") == 0)
{
/* Close the postmaster's sockets */
Index: src/backend/tcop/postgres.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/tcop/postgres.c,v
retrieving revision 1.527
diff -c -r1.527 postgres.c
*** src/backend/tcop/postgres.c 3 Mar 2007 19:32:54 -0000 1.527
--- src/backend/tcop/postgres.c 11 Mar 2007 22:37:24 -0000
***************
*** 2224,2229 ****
--- 2224,2231 ----
ereport(DEBUG3,
(errmsg_internal("CommitTransactionCommand")));
+ SetXactCommitGuarantee(DefaultXactCommitGuarantee);
+
CommitTransactionCommand();
#ifdef MEMORY_CONTEXT_CHECKING
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.379
diff -c -r1.379 guc.c
*** src/backend/utils/misc/guc.c 6 Mar 2007 02:06:14 -0000 1.379
--- src/backend/utils/misc/guc.c 11 Mar 2007 22:37:31 -0000
***************
*** 52,57 ****
--- 52,58 ----
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
#include "postmaster/syslogger.h"
+ #include "postmaster/walwriter.h"
#include "storage/fd.h"
#include "storage/freespace.h"
#include "tcop/tcopprot.h"
***************
*** 100,105 ****
--- 101,107 ----
extern int CommitSiblings;
extern char *default_tablespace;
extern bool fullPageWrites;
+ extern bool log_transaction_guarantee;
#ifdef TRACE_SORT
extern bool trace_sort;
***************
*** 145,150 ****
--- 147,153 ----
static bool assign_stage_log_stats(bool newval, bool doit, GucSource source);
static bool assign_log_stats(bool newval, bool doit, GucSource source);
static bool assign_transaction_read_only(bool newval, bool doit, GucSource source);
+ static bool assign_transaction_guarantee(bool newval, bool doit, GucSource source);
static const char *assign_canonical_path(const char *newval, bool doit, GucSource source);
static const char *assign_backslash_quote(const char *newval, bool doit, GucSource source);
static const char *assign_timezone_abbreviations(const char *newval, bool doit, GucSource source);
***************
*** 312,317 ****
--- 315,322 ----
gettext_noop("Write-Ahead Log"),
/* WAL_SETTINGS */
gettext_noop("Write-Ahead Log / Settings"),
+ /* WAL_COMMITS */
+ gettext_noop("Write-Ahead Log / Commit Behavior"),
/* WAL_CHECKPOINTS */
gettext_noop("Write-Ahead Log / Checkpoints"),
/* QUERY_TUNING */
***************
*** 568,573 ****
--- 573,586 ----
false, NULL, NULL
},
{
+ {"log_transaction_guarantee", PGC_SIGHUP, WAL_COMMITS,
+ gettext_noop("Logs form of guarantee used at transaction commit."),
+ NULL
+ },
+ &log_transaction_guarantee,
+ true, NULL, NULL
+ },
+ {
{"log_connections", PGC_BACKEND, LOGGING_WHAT,
gettext_noop("Logs each successful connection."),
NULL
***************
*** 878,883 ****
--- 891,904 ----
true, assign_phony_autocommit, NULL
},
{
+ {"transaction_guarantee", PGC_USERSET, WAL_COMMITS,
+ gettext_noop("Sets the default of wait-for-commit."),
+ NULL
+ },
+ &DefaultXactCommitGuarantee,
+ true, assign_transaction_guarantee, NULL
+ },
+ {
{"default_transaction_read_only", PGC_USERSET, CLIENT_CONN_STATEMENT,
gettext_noop("Sets the default read-only status of new transactions."),
NULL
***************
*** 1452,1458 ****
},
{
! {"commit_delay", PGC_USERSET, WAL_CHECKPOINTS,
gettext_noop("Sets the delay in microseconds between transaction commit and "
"flushing WAL to disk."),
NULL
--- 1473,1479 ----
},
{
! {"commit_delay", PGC_USERSET, WAL_COMMITS,
gettext_noop("Sets the delay in microseconds between transaction commit and "
"flushing WAL to disk."),
NULL
***************
*** 1462,1468 ****
},
{
! {"commit_siblings", PGC_USERSET, WAL_CHECKPOINTS,
gettext_noop("Sets the minimum concurrent open transactions before performing "
"commit_delay."),
NULL
--- 1483,1489 ----
},
{
! {"commit_siblings", PGC_USERSET, WAL_COMMITS,
gettext_noop("Sets the minimum concurrent open transactions before performing "
"commit_delay."),
NULL
***************
*** 1472,1477 ****
--- 1493,1507 ----
},
{
+ {"wal_writer_delay", PGC_SIGHUP, WAL_COMMITS,
+ gettext_noop("Sets the delay in microseconds between regular flushing of WAL "
+ "to disk by the WALWriter."),
+ NULL
+ },
+ &WALWriterDelay,
+ 0, 0, 10000000, NULL, NULL
+ },
+ {
{"extra_float_digits", PGC_USERSET, CLIENT_CONN_LOCALE,
gettext_noop("Sets the number of digits displayed for floating-point values."),
gettext_noop("This affects real, double precision, and geometric data types. "
***************
*** 6430,6435 ****
--- 6460,6484 ----
return true;
}
+ /*
+ * GUC assign hook for transaction_guarantee.
+ *
+ * newval is the proposed setting and source identifies where it came
+ * from; doit is not consulted here.  An attempt from an interactive
+ * source (source >= PGC_S_INTERACTIVE) to turn the guarantee off while
+ * WALWriterActive() reports false is rejected with ERROR.  In every
+ * other case the function returns true, i.e. the value is accepted.
+ */
+ static bool
+ assign_transaction_guarantee(bool newval, bool doit, GucSource source)
+ {
+ /*
+ * Transaction guarantee can only be disabled if the
+ * WALWriter has been activated, allowing us to place
+ * a sensible time limit on the extent of the data loss window
+ * for UnGuaranteed Transactions
+ *
+ * NOTE(review): presumably WALWriterActive() corresponds to
+ * wal_writer_delay > 0 -- confirm against walwriter.c.
+ */
+ if (newval == false && !WALWriterActive())
+ {
+ /*
+ * NOTE(review): non-interactive sources fall through and the value
+ * is accepted even though the WALWriter is inactive -- confirm that
+ * this is intended rather than a missing "return false".
+ */
+ if (source >= PGC_S_INTERACTIVE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cannot disable transaction_guarantee when the WALWriter is not active")));
+ }
+ return true;
+ }
+
static const char *
assign_canonical_path(const char *newval, bool doit, GucSource source)
{
Index: src/backend/utils/misc/postgresql.conf.sample
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.212
diff -c -r1.212 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample 6 Mar 2007 02:06:14 -0000 1.212
--- src/backend/utils/misc/postgresql.conf.sample 11 Mar 2007 22:37:31 -0000
***************
*** 150,156 ****
# - Settings -
! #fsync = on # turns forced synchronization on or off
#wal_sync_method = fsync # the default is the first option
# supported by the operating system:
# open_datasync
--- 150,156 ----
# - Settings -
! #wal_writer_delay = 0 # range 0-10000000, in microseconds
#wal_sync_method = fsync # the default is the first option
# supported by the operating system:
# open_datasync
***************
*** 161,169 ****
--- 161,172 ----
#full_page_writes = on # recover from partial page writes
#wal_buffers = 64kB # min 32kB
# (change requires restart)
+
#commit_delay = 0 # range 0-100000, in microseconds
#commit_siblings = 5 # range 1-1000
+ #transaction_guarantee = on # default: immediate fsync at commit
+
# - Checkpoints -
#checkpoint_segments = 3 # in logfile segments, min 1, 16MB each
Index: src/include/access/xact.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/xact.h,v
retrieving revision 1.84
diff -c -r1.84 xact.h
*** src/include/access/xact.h 5 Jan 2007 22:19:51 -0000 1.84
--- src/include/access/xact.h 11 Mar 2007 22:37:33 -0000
***************
*** 16,21 ****
--- 16,22 ----
#include "access/xlog.h"
#include "nodes/pg_list.h"
+ #include "postmaster/walwriter.h"
#include "storage/relfilenode.h"
#include "utils/timestamp.h"
***************
*** 41,46 ****
--- 42,50 ----
extern bool DefaultXactReadOnly;
extern bool XactReadOnly;
+ /*
+ * Deferred Fsync: DefaultXactCommitGuarantee is the variable behind the
+ * transaction_guarantee GUC (see guc.c).  tcop/postgres.c passes it to
+ * SetXactCommitGuarantee() just before CommitTransactionCommand().
+ */
+ extern bool DefaultXactCommitGuarantee;
+ extern void SetXactCommitGuarantee(bool RequestedXactCommitGuarantee);
/*
* start- and end-of-transaction callbacks for dynamically loaded modules
*/
Index: src/include/access/xlog.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/xlog.h,v
retrieving revision 1.76
diff -c -r1.76 xlog.h
*** src/include/access/xlog.h 5 Jan 2007 22:19:51 -0000 1.76
--- src/include/access/xlog.h 11 Mar 2007 22:37:33 -0000
***************
*** 151,156 ****
--- 151,158 ----
extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
extern void XLogFlush(XLogRecPtr RecPtr);
+ extern void XLogDeferredFlush(XLogRecPtr RecPtr);
+ extern void XLogBackgroundFlush(void);
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
Index: src/include/utils/guc_tables.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/utils/guc_tables.h,v
retrieving revision 1.30
diff -c -r1.30 guc_tables.h
*** src/include/utils/guc_tables.h 5 Jan 2007 22:19:59 -0000 1.30
--- src/include/utils/guc_tables.h 11 Mar 2007 22:37:34 -0000
***************
*** 51,56 ****
--- 51,57 ----
RESOURCES_KERNEL,
WAL,
WAL_SETTINGS,
+ WAL_COMMITS,
WAL_CHECKPOINTS,
QUERY_TUNING,
QUERY_TUNING_METHOD,