Index: doc/src/sgml/config.sgml
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/doc/src/sgml/config.sgml,v
retrieving revision 1.126
diff -c -r1.126 config.sgml
*** doc/src/sgml/config.sgml	7 Jun 2007 19:19:56 -0000	1.126
--- doc/src/sgml/config.sgml	12 Jun 2007 08:16:55 -0000
***************
*** 1565,1570 ****
--- 1565,1619 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-checkpoint-write-percent" xreflabel="checkpoint_write_percent">
+       <term><varname>checkpoint_write_percent</varname> (<type>floating point</type>)</term>
+       <indexterm>
+        <primary><varname>checkpoint_write_percent</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         To spread out checkpoint work, each checkpoint spends the specified
+         share of its interval writing out all dirty buffers in the shared buffer
+         pool. The default value is 50.0 (50% of <varname>checkpoint_timeout</>).
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
+      <varlistentry id="guc-checkpoint-nap-percent" xreflabel="checkpoint_nap_percent">
+       <term><varname>checkpoint_nap_percent</varname> (<type>floating point</type>)</term>
+       <indexterm>
+        <primary><varname>checkpoint_nap_percent</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Specifies the delay between writing out all dirty buffers and flushing
+         all modified files. This gives the kernel's disk writer time to flush
+         dirty buffers, reducing the work left for the following sync phase.
+         The default value is 10.0 (10% of <varname>checkpoint_timeout</>).
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
+      <varlistentry id="guc-checkpoint-sync-percent" xreflabel="checkpoint_sync_percent">
+       <term><varname>checkpoint_sync_percent</varname> (<type>floating point</type>)</term>
+       <indexterm>
+        <primary><varname>checkpoint_sync_percent</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         To spread out checkpoint work, each checkpoint spends the specified
+         share of its interval flushing all modified files.
+         The default value is 20.0 (20% of <varname>checkpoint_timeout</>).
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
       <varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning">
        <term><varname>checkpoint_warning</varname> (<type>integer</type>)</term>
        <indexterm>
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.272
diff -c -r1.272 xlog.c
*** src/backend/access/transam/xlog.c	31 May 2007 15:13:01 -0000	1.272
--- src/backend/access/transam/xlog.c	12 Jun 2007 08:16:55 -0000
***************
*** 398,404 ****
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
! static void CheckPointGuts(XLogRecPtr checkPointRedo);
  
  static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  				XLogRecPtr *lsn, BkpBlock *bkpb);
--- 398,404 ----
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
! static void CheckPointGuts(XLogRecPtr checkPointRedo, bool immediate);
  
  static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  				XLogRecPtr *lsn, BkpBlock *bkpb);
***************
*** 5319,5324 ****
--- 5319,5341 ----
  }
  
  /*
+  * GetInsertRecPtr -- Returns the (approximate) current insert position.
+  * Reads LogwrtRqst.Write, which info_lck guards; Insert needs WALInsertLock.
+  */
+ XLogRecPtr
+ GetInsertRecPtr(void)
+ {
+ 	/* use volatile pointer to prevent code rearrangement */
+ 	volatile XLogCtlData *xlogctl = XLogCtl;
+ 	XLogRecPtr		recptr;
+ 
+ 	SpinLockAcquire(&xlogctl->info_lck);
+ 	recptr = xlogctl->LogwrtRqst.Write;
+ 	SpinLockRelease(&xlogctl->info_lck);
+ 	return recptr;
+ }
+ 
+ /*
   * Get the time of the last xlog segment switch
   */
  time_t
***************
*** 5591,5597 ****
  	 */
  	END_CRIT_SECTION();
  
! 	CheckPointGuts(checkPoint.redo);
  
  	START_CRIT_SECTION();
  
--- 5608,5614 ----
  	 */
  	END_CRIT_SECTION();
  
! 	CheckPointGuts(checkPoint.redo, force);
  
  	START_CRIT_SECTION();
  
***************
*** 5697,5708 ****
   * recovery restartpoints.
   */
  static void
! CheckPointGuts(XLogRecPtr checkPointRedo)
  {
  	CheckPointCLOG();
  	CheckPointSUBTRANS();
  	CheckPointMultiXact();
! 	FlushBufferPool();			/* performs all required fsyncs */
  	/* We deliberately delay 2PC checkpointing as long as possible */
  	CheckPointTwoPhase(checkPointRedo);
  }
--- 5714,5725 ----
   * recovery restartpoints.
   */
  static void
! CheckPointGuts(XLogRecPtr checkPointRedo, bool immediate)
  {
  	CheckPointCLOG();
  	CheckPointSUBTRANS();
  	CheckPointMultiXact();
! 	FlushBufferPool(immediate);		/* paced unless immediate; performs all required fsyncs */
  	/* We deliberately delay 2PC checkpointing as long as possible */
  	CheckPointTwoPhase(checkPointRedo);
  }
***************
*** 5751,5757 ****
  	/*
  	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo);
  
  	/*
  	 * Update pg_control so that any subsequent crash will restart from this
--- 5768,5774 ----
  	/*
  	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo, true);
  
  	/*
  	 * Update pg_control so that any subsequent crash will restart from this
Index: src/backend/commands/dbcommands.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/dbcommands.c,v
retrieving revision 1.195
diff -c -r1.195 dbcommands.c
*** src/backend/commands/dbcommands.c	1 Jun 2007 19:38:07 -0000	1.195
--- src/backend/commands/dbcommands.c	12 Jun 2007 08:16:55 -0000
***************
*** 404,410 ****
  	 * up-to-date for the copy.  (We really only need to flush buffers for the
  	 * source database, but bufmgr.c provides no API for that.)
  	 */
! 	BufferSync();
  
  	/*
  	 * Once we start copying subdirectories, we need to be able to clean 'em
--- 404,410 ----
  	 * up-to-date for the copy.  (We really only need to flush buffers for the
  	 * source database, but bufmgr.c provides no API for that.)
  	 */
! 	BufferSync(true);
  
  	/*
  	 * Once we start copying subdirectories, we need to be able to clean 'em
***************
*** 1427,1433 ****
  		 * up-to-date for the copy.  (We really only need to flush buffers for
  		 * the source database, but bufmgr.c provides no API for that.)
  		 */
! 		BufferSync();
  
  		/*
  		 * Copy this subdirectory to the new location
--- 1427,1433 ----
  		 * up-to-date for the copy.  (We really only need to flush buffers for
  		 * the source database, but bufmgr.c provides no API for that.)
  		 */
! 		BufferSync(true);
  
  		/*
  		 * Copy this subdirectory to the new location
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.38
diff -c -r1.38 bgwriter.c
*** src/backend/postmaster/bgwriter.c	27 May 2007 03:50:39 -0000	1.38
--- src/backend/postmaster/bgwriter.c	12 Jun 2007 11:23:57 -0000
***************
*** 44,49 ****
--- 44,50 ----
  #include "postgres.h"
  
  #include <signal.h>
+ #include <sys/time.h>
  #include <time.h>
  #include <unistd.h>
  
***************
*** 117,122 ****
--- 118,124 ----
  	sig_atomic_t ckpt_failed;	/* advances when checkpoint fails */
  
  	sig_atomic_t ckpt_time_warn;	/* warn if too soon since last ckpt? */
+ 	sig_atomic_t ckpt_force;		/* any waiter for the checkpoint? */
  
  	int			num_requests;	/* current # of requests */
  	int			max_requests;	/* allocated array size */
***************
*** 131,136 ****
--- 133,139 ----
  int			BgWriterDelay = 200;
  int			CheckPointTimeout = 300;
  int			CheckPointWarning = 30;
+ double		checkpoint_write_percent = 50.0;
  
  /*
   * Flags set by interrupt handlers for later service in the main loop.
***************
*** 146,155 ****
--- 149,166 ----
  
  static bool ckpt_active = false;
  
+ static time_t		ckpt_start_time;
+ static XLogRecPtr	ckpt_start_recptr;
+ static double		ckpt_progress_at_sync_start;
+ 
  static time_t last_checkpoint_time;
  static time_t last_xlog_switch_time;
  
  
+ static void CheckArchiveTimeout(void);
+ static void BgWriterNap(long msec);
+ static bool NextCheckpointRequested(void);
+ static double GetCheckpointElapsedProgress(void);
  static void bg_quickdie(SIGNAL_ARGS);
  static void BgSigHupHandler(SIGNAL_ARGS);
  static void ReqCheckpointHandler(SIGNAL_ARGS);
***************
*** 331,337 ****
  		bool		force_checkpoint = false;
  		time_t		now;
  		int			elapsed_secs;
- 		long		udelay;
  
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
--- 342,347 ----
***************
*** 350,362 ****
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
- 		if (checkpoint_requested)
- 		{
- 			checkpoint_requested = false;
- 			do_checkpoint = true;
- 			force_checkpoint = true;
- 			BgWriterStats.m_requested_checkpoints++;
- 		}
  		if (shutdown_requested)
  		{
  			/*
--- 360,365 ----
***************
*** 377,387 ****
  		 */
  		now = time(NULL);
  		elapsed_secs = now - last_checkpoint_time;
! 		if (elapsed_secs >= CheckPointTimeout)
  		{
  			do_checkpoint = true;
! 			if (!force_checkpoint)
! 				BgWriterStats.m_timed_checkpoints++;
  		}
  
  		/*
--- 380,396 ----
  		 */
  		now = time(NULL);
  		elapsed_secs = now - last_checkpoint_time;
! 		if (checkpoint_requested)
! 		{
! 			checkpoint_requested = false;
! 			force_checkpoint = BgWriterShmem->ckpt_force;
! 			do_checkpoint = true;
! 			BgWriterStats.m_requested_checkpoints++;
! 		}
! 		else if (elapsed_secs >= CheckPointTimeout)
  		{
  			do_checkpoint = true;
! 			BgWriterStats.m_timed_checkpoints++;
  		}
  
  		/*
***************
*** 404,416 ****
--- 413,430 ----
  								elapsed_secs),
  						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
  			BgWriterShmem->ckpt_time_warn = false;
+ 			BgWriterShmem->ckpt_force = false;
  
  			/*
  			 * Indicate checkpoint start to any waiting backends.
  			 */
  			ckpt_active = true;
+ 			elog(DEBUG1, "CHECKPOINT: start");
  			BgWriterShmem->ckpt_started++;
  
+ 			ckpt_start_time = now;
+ 			ckpt_start_recptr = GetInsertRecPtr();
+ 			ckpt_progress_at_sync_start = 0;
  			CreateCheckPoint(false, force_checkpoint);
  
  			/*
***************
*** 423,428 ****
--- 437,443 ----
  			 * Indicate checkpoint completion to any waiting backends.
  			 */
  			BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
+ 			elog(DEBUG1, "CHECKPOINT: end");
  			ckpt_active = false;
  
  			/*
***************
*** 439,446 ****
  		 * Check for archive_timeout, if so, switch xlog files.  First we do a
  		 * quick check using possibly-stale local state.
  		 */
! 		if (XLogArchiveTimeout > 0 &&
! 			(int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
  		{
  			/*
  			 * Update local state ... note that last_xlog_switch_time is the
--- 454,481 ----
  		 * Check for archive_timeout, if so, switch xlog files.  First we do a
  		 * quick check using possibly-stale local state.
  		 */
! 		CheckArchiveTimeout();
! 
! 		/* Nap for the configured time. */
! 		BgWriterNap(0);
! 	}
! }
! 
! /*
!  * CheckArchiveTimeout -- check for archive_timeout
!  */
! static void
! CheckArchiveTimeout(void)
! {
! 	time_t		now;
! 
! 	if (XLogArchiveTimeout <= 0)
! 		return;
! 
! 	now = time(NULL);
! 	if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
! 		return;
! 
  		{
  			/*
  			 * Update local state ... note that last_xlog_switch_time is the
***************
*** 450,459 ****
  
  			last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
  
- 			/* if we did a checkpoint, 'now' might be stale too */
- 			if (do_checkpoint)
- 				now = time(NULL);
- 
  			/* Now we can do the real check */
  			if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
  			{
--- 485,490 ----
***************
*** 478,483 ****
--- 509,526 ----
  				last_xlog_switch_time = now;
  			}
  		}
+ }
+ 
+ /*
+  * BgWriterNap -- short nap in bgwriter
+  *
+  * Nap for the configured time, or for mdelay msec if that is shorter;
+  * an mdelay of zero means no upper bound. Returns nothing.
+  */
+ static void
+ BgWriterNap(long mdelay)
+ {
+ 	long		udelay;
  
  		/*
  		 * Send off activity statistics to the stats collector
***************
*** 503,508 ****
--- 546,555 ----
  		else
  			udelay = 10000000L; /* Ten seconds */
  
+ 		/* Clamp the delay to the upper bound. */
+ 		if (mdelay > 0)
+ 			udelay = Min(udelay, mdelay * 1000L);
+ 
  		while (udelay > 999999L)
  		{
  			if (got_SIGHUP || checkpoint_requested || shutdown_requested)
***************
*** 514,522 ****
--- 561,664 ----
  
  		if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
  			pg_usleep(udelay);
+ }
+ 
+ /*
+  * CheckpointWriteDelay -- nap when checkpoint writes are ahead of schedule
+  */
+ void
+ CheckpointWriteDelay(double progress)
+ {
+ 	bool		hurry;
+ 	double		allowed;
+ 
+ 	if (!ckpt_active || checkpoint_write_percent <= 0)
+ 		return;
+ 
+ 	/* Call this first: it may reload the configuration used just below. */
+ 	hurry = NextCheckpointRequested();
+ 	allowed = GetCheckpointElapsedProgress() /
+ 		(checkpoint_write_percent / 100);
+ 	elog(DEBUG1, "CheckpointWriteDelay: progress=%.3f, target=%.3f, next=%d",
+ 		 progress, allowed, hurry);
+ 
+ 	if (!hurry && progress > allowed)
+ 	{
+ 		AbsorbFsyncRequests();
+ 		BgLruBufferSync();
+ 		BgWriterNap(0);
  	}
  }
  
+ /*
+  * NextCheckpointRequested -- should the running checkpoint stop napping?
+  *
+  *	Returns true when called outside an active bgwriter checkpoint, or when
+  *	a checkpoint request, a shutdown request or checkpoint_timeout is
+  *	pending.  Otherwise services SIGHUP and archive_timeout, returns false.
+  */
+ static bool
+ NextCheckpointRequested(void)
+ {
+ 	if (!(am_bg_writer && ckpt_active))
+ 		return true;
+ 
+ 	if (!checkpoint_requested && !shutdown_requested &&
+ 		(time(NULL) - ckpt_start_time < CheckPointTimeout))
+ 	{
+ 		/* Nothing is pending: do the same housekeeping as the main loop. */
+ 		if (got_SIGHUP)
+ 		{
+ 			got_SIGHUP = false;
+ 			ProcessConfigFile(PGC_SIGHUP);
+ 		}
+ 		CheckArchiveTimeout();
+ 
+ 		return false;
+ 	}
+ 
+ 	/* Remember the request so the main loop sees it after this checkpoint. */
+ 	elog(DEBUG1, "NextCheckpointRequested");
+ 	checkpoint_requested = true;
+ 	return true;
+ }
+ 
+ /*
+  * GetCheckpointElapsedProgress -- fraction of the checkpoint interval used
+  * up so far, by time or by xlog volume (whichever is larger), capped at 1.0
+  */
+ static double
+ GetCheckpointElapsedProgress(void)
+ {
+ 	struct timeval	tv;
+ 	XLogRecPtr		insert_pos;
+ 	double			by_time;
+ 	double			by_xlog;
+ 	double			result;
+ 
+ 	Assert(ckpt_active);
+ 
+ 	/* share of checkpoint_timeout elapsed since the checkpoint started */
+ 	gettimeofday(&tv, NULL);
+ 	by_time = ((double) (tv.tv_sec - ckpt_start_time) +
+ 		tv.tv_usec / 1000000.0) / CheckPointTimeout;
+ 
+ 	/* share of checkpoint_segments inserted since the checkpoint started */
+ 	insert_pos = GetInsertRecPtr();
+ 	by_xlog =
+ 		(((double) insert_pos.xlogid - (double) ckpt_start_recptr.xlogid) * XLogSegsPerFile +
+ 		 ((double) insert_pos.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+ 		CheckPointSegments;
+ 	elog(DEBUG2, "GetCheckpointElapsedProgress: time=%.3f, xlog=%.3f",
+ 		by_time, by_xlog);
+ 
+ 	result = (by_time > by_xlog) ? by_time : by_xlog;
+ 	if (result > 1.0)
+ 		result = 1.0;
+ 
+ 	return result;
+ }
+ 
  
  /* --------------------------------
   *		signal handler routines
***************
*** 656,661 ****
--- 798,805 ----
  	/* Set warning request flag if appropriate */
  	if (warnontime)
  		bgs->ckpt_time_warn = true;
+ 	if (waitforit)
+ 		bgs->ckpt_force = true;
  
  	/*
  	 * Send signal to request checkpoint.  When waitforit is false, we
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.220
diff -c -r1.220 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	30 May 2007 20:11:58 -0000	1.220
--- src/backend/storage/buffer/bufmgr.c	12 Jun 2007 12:43:37 -0000
***************
*** 74,79 ****
--- 74,80 ----
  double		bgwriter_all_percent = 0.333;
  int			bgwriter_lru_maxpages = 5;
  int			bgwriter_all_maxpages = 5;
+ int			checkpoint_write_rate = 1;
  
  
  long		NDirectFileRead;	/* some I/O's are direct file access. bypass
***************
*** 1002,1031 ****
   * This is called at checkpoint time to write out all dirty shared buffers.
   */
  void
! BufferSync(void)
  {
  	int			buf_id;
  	int			num_to_scan;
  	int			absorb_counter;
  
  	/*
  	 * Find out where to start the circular scan.
  	 */
! 	buf_id = StrategySyncStart();
  
  	/* Make sure we can handle the pin inside SyncOneBuffer */
  	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
  
  	/*
  	 * Loop over all buffers.
  	 */
  	num_to_scan = NBuffers;
  	absorb_counter = WRITES_PER_ABSORB;
! 	while (num_to_scan-- > 0)
  	{
! 		if (SyncOneBuffer(buf_id, false))
  		{
  			BgWriterStats.m_buf_written_checkpoints++;
  
  			/*
  			 * If in bgwriter, absorb pending fsync requests after each
--- 1003,1092 ----
   * This is called at checkpoint time to write out all dirty shared buffers.
   */
  void
! BufferSync(bool immediate)
  {
  	int			buf_id;
  	int			num_to_scan;
+ 	int			num_written;
  	int			absorb_counter;
+ 	int			writes_per_nap = checkpoint_write_rate;
+ 	int			num_to_write;
+ 	int			start_id;
+ 	int			num_written_since_nap;
  
  	/*
  	 * Find out where to start the circular scan.
  	 */
! 	start_id = StrategySyncStart();
  
  	/* Make sure we can handle the pin inside SyncOneBuffer */
  	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
  
  	/*
+ 	 * Loop over all buffers, and mark the ones that need to be written.
+ 	 */
+ 	num_to_scan = NBuffers;
+ 	num_to_write = 0;
+ 	buf_id = start_id;
+ 	while (num_to_scan-- > 0)
+ 	{
+ 		volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
+ 		LockBufHdr(bufHdr);
+ 
+ 		if (bufHdr->flags & BM_DIRTY)
+ 		{
+ 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+ 			num_to_write++;
+ 		}
+ 		else
+ 		{
+ 			/* There shouldn't be any buffers in the cache with the flag
+ 			 * set, but better safe than sorry in case the previous checkpoint
+ 			 * crashed. If we didn't clear the flag, we might end the 
+ 			 * write-loop below early, because num_to_write wouldn't include
+ 			 * any leftover pages. Alternatively, we could count them into
+ 			 * num_to_write, but we might as well clear the flag and avoid the work.
+ 			 */
+ 			bufHdr->flags &= ~BM_CHECKPOINT_NEEDED;
+ 		}
+ 
+ 		UnlockBufHdr(bufHdr);
+ 
+ 		if (++buf_id >= NBuffers)
+ 			buf_id = 0;
+ 	}
+ 
+ 	elog(DEBUG1, "CHECKPOINT: %d / %d buffers to write", num_to_write, NBuffers);
+ 
+ 	/*
  	 * Loop over all buffers.
  	 */
  	num_to_scan = NBuffers;
+ 	num_written = num_written_since_nap = 0;
  	absorb_counter = WRITES_PER_ABSORB;
! 	buf_id = start_id;
! 	while (num_to_scan-- > 0 && num_written < num_to_write)
  	{
! 		volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
! 		bool needs_flush;
! 
! 		LockBufHdr(bufHdr);
! 
! 		needs_flush = (bufHdr->flags & BM_CHECKPOINT_NEEDED) != 0;
! 
! 		UnlockBufHdr(bufHdr);
! 
! 		if (needs_flush && SyncOneBuffer(buf_id, false))
  		{
  			BgWriterStats.m_buf_written_checkpoints++;
+ 			num_written++;
+ 
+ 			if (!immediate && ++num_written_since_nap >= writes_per_nap)
+ 			{
+ 				num_written_since_nap = 0;
+ 				CheckpointWriteDelay(
+ 					(double) (num_written) / num_to_write);
+ 			}
  
  			/*
  			 * If in bgwriter, absorb pending fsync requests after each
***************
*** 1053,1059 ****
  BgBufferSync(void)
  {
  	static int	buf_id1 = 0;
- 	int			buf_id2;
  	int			num_to_scan;
  	int			num_written;
  
--- 1114,1119 ----
***************
*** 1099,1104 ****
--- 1159,1177 ----
  		BgWriterStats.m_buf_written_all += num_written;
  	}
  
+ 	BgLruBufferSync();
+ }
+ 
+ /*
+  * BgLruBufferSync -- Write out some lru dirty buffers in the pool.
+  */
+ void
+ BgLruBufferSync(void)
+ {
+ 	int			buf_id2;
+ 	int			num_to_scan;
+ 	int			num_written;
+ 
  	/*
  	 * This loop considers only unpinned buffers close to the clock sweep
  	 * point.
***************
*** 1341,1349 ****
   * flushed.
   */
  void
! FlushBufferPool(void)
  {
! 	BufferSync();
  	smgrsync();
  }
  
--- 1414,1425 ----
   * flushed.
   */
  void
! FlushBufferPool(bool immediate)
  {
! 	elog(DEBUG1, "CHECKPOINT: write phase");
! 	BufferSync(immediate || checkpoint_write_percent <= 0);	/* <= 0 turns write pacing off */
! 
! 	elog(DEBUG1, "CHECKPOINT: sync phase");
  	smgrsync();
  }
  
***************
*** 2132,2138 ****
  	Assert(buf->flags & BM_IO_IN_PROGRESS);
  	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
  	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~BM_DIRTY;
  	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
--- 2208,2214 ----
  	Assert(buf->flags & BM_IO_IN_PROGRESS);
  	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
  	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
  	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.396
diff -c -r1.396 guc.c
*** src/backend/utils/misc/guc.c	8 Jun 2007 18:23:52 -0000	1.396
--- src/backend/utils/misc/guc.c	12 Jun 2007 10:50:50 -0000
***************
*** 1579,1584 ****
--- 1579,1593 ----
  	},
  
  	{
+ 		{"checkpoint_write_rate", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Sets the number of buffers written between progress checks in the checkpoint write phase."),
+ 			NULL
+ 		},
+ 		&checkpoint_write_rate,
+ 		1, 0, 1000000, NULL, NULL
+ 	},
+ 
+ 	{
  		{"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE,
  			gettext_noop("Automatic log file rotation will occur after N minutes."),
  			NULL,
***************
*** 1866,1871 ****
--- 1875,1889 ----
  		0.1, 0.0, 100.0, NULL, NULL
  	},
  
+ 	{
+ 		{"checkpoint_write_percent", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Sets the duration percentage of write phase in checkpoints."),
+ 			NULL
+ 		},
+ 		&checkpoint_write_percent,
+ 		50.0, 0.0, 100.0, NULL, NULL
+ 	},
+ 
  	/* End-of-list marker */
  	{
  		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL
Index: src/backend/utils/misc/postgresql.conf.sample
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.216
diff -c -r1.216 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample	3 Jun 2007 17:08:15 -0000	1.216
--- src/backend/utils/misc/postgresql.conf.sample	12 Jun 2007 08:16:55 -0000
***************
*** 168,173 ****
--- 168,176 ----
  
  #checkpoint_segments = 3		# in logfile segments, min 1, 16MB each
  #checkpoint_timeout = 5min		# range 30s-1h
+ #checkpoint_write_percent = 50.0		# duration percentage in write phase
+ #checkpoint_nap_percent = 10.0		# duration percentage between write and sync phases
+ #checkpoint_sync_percent = 20.0		# duration percentage in sync phase
  #checkpoint_warning = 30s		# 0 is off
  
  # - Archiving -
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/xlog.h,v
retrieving revision 1.78
diff -c -r1.78 xlog.h
*** src/include/access/xlog.h	30 May 2007 20:12:02 -0000	1.78
--- src/include/access/xlog.h	12 Jun 2007 08:16:55 -0000
***************
*** 174,179 ****
--- 174,180 ----
  extern void CreateCheckPoint(bool shutdown, bool force);
  extern void XLogPutNextOid(Oid nextOid);
  extern XLogRecPtr GetRedoRecPtr(void);
+ extern XLogRecPtr GetInsertRecPtr(void);
  extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);
  
  #endif   /* XLOG_H */
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.9
diff -c -r1.9 bgwriter.h
*** src/include/postmaster/bgwriter.h	5 Jan 2007 22:19:57 -0000	1.9
--- src/include/postmaster/bgwriter.h	12 Jun 2007 08:16:55 -0000
***************
*** 20,29 ****
--- 20,35 ----
  extern int	BgWriterDelay;
  extern int	CheckPointTimeout;
  extern int	CheckPointWarning;
+ extern double	checkpoint_write_percent;
+ extern double	checkpoint_nap_percent;
+ extern double	checkpoint_sync_percent;
  
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(bool waitforit, bool warnontime);
+ extern void CheckpointWriteDelay(double progress);
+ extern void CheckpointNapDelay(double percent);
+ extern void CheckpointSyncDelay(double progress, double percent);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
  extern void AbsorbFsyncRequests(void);
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.90
diff -c -r1.90 buf_internals.h
*** src/include/storage/buf_internals.h	30 May 2007 20:12:03 -0000	1.90
--- src/include/storage/buf_internals.h	12 Jun 2007 11:42:23 -0000
***************
*** 35,40 ****
--- 35,41 ----
  #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
  #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
  #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
+ #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* this needs to be written in checkpoint */
  
  typedef bits16 BufFlags;
  
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.104
diff -c -r1.104 bufmgr.h
*** src/include/storage/bufmgr.h	30 May 2007 20:12:03 -0000	1.104
--- src/include/storage/bufmgr.h	12 Jun 2007 08:52:28 -0000
***************
*** 36,41 ****
--- 36,42 ----
  extern double bgwriter_all_percent;
  extern int	bgwriter_lru_maxpages;
  extern int	bgwriter_all_maxpages;
+ extern int	checkpoint_write_rate;
  
  /* in buf_init.c */
  extern DLLIMPORT char *BufferBlocks;
***************
*** 136,142 ****
  extern void ResetBufferUsage(void);
  extern void AtEOXact_Buffers(bool isCommit);
  extern void PrintBufferLeakWarning(Buffer buffer);
! extern void FlushBufferPool(void);
  extern BlockNumber BufferGetBlockNumber(Buffer buffer);
  extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
--- 137,143 ----
  extern void ResetBufferUsage(void);
  extern void AtEOXact_Buffers(bool isCommit);
  extern void PrintBufferLeakWarning(Buffer buffer);
! extern void FlushBufferPool(bool immediate);
  extern BlockNumber BufferGetBlockNumber(Buffer buffer);
  extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
***************
*** 161,168 ****
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BufferSync(void);
  extern void BgBufferSync(void);
  
  extern void AtProcExit_LocalBuffers(void);
  
--- 162,170 ----
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BufferSync(bool immediate);
  extern void BgBufferSync(void);
+ extern void BgLruBufferSync(void);
  
  extern void AtProcExit_LocalBuffers(void);
  
