Index: doc/src/sgml/config.sgml
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/doc/src/sgml/config.sgml,v
retrieving revision 1.126
diff -c -r1.126 config.sgml
*** doc/src/sgml/config.sgml	7 Jun 2007 19:19:56 -0000	1.126
--- doc/src/sgml/config.sgml	15 Jun 2007 09:35:32 -0000
***************
*** 1565,1570 ****
--- 1565,1586 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-checkpoint-write-percent" xreflabel="checkpoint_write_percent">
+       <term><varname>checkpoint_write_percent</varname> (<type>floating point</type>)</term>
+       <indexterm>
+        <primary><varname>checkpoint_write_percent</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         To spread out checkpoint I/O, writes of dirty shared buffers are paced
+         so that the write phase of each checkpoint takes about this percentage of
+         <varname>checkpoint_timeout</>. Zero disables pacing. The default is 50.0.
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
       <varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning">
        <term><varname>checkpoint_warning</varname> (<type>integer</type>)</term>
        <indexterm>
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.272
diff -c -r1.272 xlog.c
*** src/backend/access/transam/xlog.c	31 May 2007 15:13:01 -0000	1.272
--- src/backend/access/transam/xlog.c	15 Jun 2007 08:14:18 -0000
***************
*** 398,404 ****
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
! static void CheckPointGuts(XLogRecPtr checkPointRedo);
  
  static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  				XLogRecPtr *lsn, BkpBlock *bkpb);
--- 398,404 ----
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
! static void CheckPointGuts(XLogRecPtr checkPointRedo, bool immediate);
  
  static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  				XLogRecPtr *lsn, BkpBlock *bkpb);
***************
*** 5319,5324 ****
--- 5319,5340 ----
  }
  
  /*
+  * GetInsertRecPtr -- Returns the current WAL insert position.
+  */
+ XLogRecPtr
+ GetInsertRecPtr(void)
+ {
+ 	XLogCtlInsert  *Insert = &XLogCtl->Insert;
+ 	XLogRecPtr		recptr;
+ 
+ 	LWLockAcquire(WALInsertLock, LW_SHARED);	/* shared mode; presumably INSERT_RECPTR only reads *Insert */
+ 	INSERT_RECPTR(recptr, Insert, Insert->curridx);
+ 	LWLockRelease(WALInsertLock);
+ 
+ 	return recptr;				/* a snapshot: may be stale once the lock is released */
+ }
+ 
+ /*
   * Get the time of the last xlog segment switch
   */
  time_t
***************
*** 5395,5402 ****
  /*
   * Perform a checkpoint --- either during shutdown, or on-the-fly
   *
!  * If force is true, we force a checkpoint regardless of whether any XLOG
!  * activity has occurred since the last one.
   */
  void
  CreateCheckPoint(bool shutdown, bool force)
--- 5411,5421 ----
  /*
   * Perform a checkpoint --- either during shutdown, or on-the-fly
   *
!  * If force is true, we try to finish the checkpoint as soon as we can,
!  * ignoring checkpoint_write_percent and checkpoint_write_rate GUC
!  * variables. If force is false, we perform the checkpoint at a slower
!  * pace, or skip it altogether if no XLOG activity has occurred since the
!  * last one.
   */
  void
  CreateCheckPoint(bool shutdown, bool force)
***************
*** 5591,5597 ****
  	 */
  	END_CRIT_SECTION();
  
! 	CheckPointGuts(checkPoint.redo);
  
  	START_CRIT_SECTION();
  
--- 5610,5616 ----
  	 */
  	END_CRIT_SECTION();
  
! 	CheckPointGuts(checkPoint.redo, force);
  
  	START_CRIT_SECTION();
  
***************
*** 5693,5708 ****
  /*
   * Flush all data in shared memory to disk, and fsync
   *
   * This is the common code shared between regular checkpoints and
   * recovery restartpoints.
   */
  static void
! CheckPointGuts(XLogRecPtr checkPointRedo)
  {
  	CheckPointCLOG();
  	CheckPointSUBTRANS();
  	CheckPointMultiXact();
! 	FlushBufferPool();			/* performs all required fsyncs */
  	/* We deliberately delay 2PC checkpointing as long as possible */
  	CheckPointTwoPhase(checkPointRedo);
  }
--- 5712,5730 ----
  /*
   * Flush all data in shared memory to disk, and fsync
   *
+  * If immediate is true, the buffer pool is flushed as quickly as possible,
+  * ignoring the GUC settings that throttle checkpoint I/O.
+  *
   * This is the common code shared between regular checkpoints and
   * recovery restartpoints.
   */
  static void
! CheckPointGuts(XLogRecPtr checkPointRedo, bool immediate)
  {
  	CheckPointCLOG();
  	CheckPointSUBTRANS();
  	CheckPointMultiXact();
! 	FlushBufferPool(immediate);		/* may be throttled unless immediate; performs all required fsyncs */
  	/* We deliberately delay 2PC checkpointing as long as possible */
  	CheckPointTwoPhase(checkPointRedo);
  }
***************
*** 5710,5716 ****
  /*
   * Set a recovery restart point if appropriate
   *
!  * This is similar to CreateCheckpoint, but is used during WAL recovery
   * to establish a point from which recovery can roll forward without
   * replaying the entire recovery log.  This function is called each time
   * a checkpoint record is read from XLOG; it must determine whether a
--- 5732,5738 ----
  /*
   * Set a recovery restart point if appropriate
   *
!  * This is similar to CreateCheckPoint, but is used during WAL recovery
   * to establish a point from which recovery can roll forward without
   * replaying the entire recovery log.  This function is called each time
   * a checkpoint record is read from XLOG; it must determine whether a
***************
*** 5751,5757 ****
  	/*
  	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo);
  
  	/*
  	 * Update pg_control so that any subsequent crash will restart from this
--- 5773,5779 ----
  	/*
  	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo, true);
  
  	/*
  	 * Update pg_control so that any subsequent crash will restart from this
***************
*** 6748,6750 ****
--- 6770,6773 ----
  
  	pfree(buf.data);
  }
+ 
Index: src/backend/commands/dbcommands.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/dbcommands.c,v
retrieving revision 1.195
diff -c -r1.195 dbcommands.c
*** src/backend/commands/dbcommands.c	1 Jun 2007 19:38:07 -0000	1.195
--- src/backend/commands/dbcommands.c	12 Jun 2007 08:16:55 -0000
***************
*** 404,410 ****
  	 * up-to-date for the copy.  (We really only need to flush buffers for the
  	 * source database, but bufmgr.c provides no API for that.)
  	 */
! 	BufferSync();
  
  	/*
  	 * Once we start copying subdirectories, we need to be able to clean 'em
--- 404,410 ----
  	 * up-to-date for the copy.  (We really only need to flush buffers for the
  	 * source database, but bufmgr.c provides no API for that.)
  	 */
! 	BufferSync(true);
  
  	/*
  	 * Once we start copying subdirectories, we need to be able to clean 'em
***************
*** 1427,1433 ****
  		 * up-to-date for the copy.  (We really only need to flush buffers for
  		 * the source database, but bufmgr.c provides no API for that.)
  		 */
! 		BufferSync();
  
  		/*
  		 * Copy this subdirectory to the new location
--- 1427,1433 ----
  		 * up-to-date for the copy.  (We really only need to flush buffers for
  		 * the source database, but bufmgr.c provides no API for that.)
  		 */
! 		BufferSync(true);
  
  		/*
  		 * Copy this subdirectory to the new location
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.38
diff -c -r1.38 bgwriter.c
*** src/backend/postmaster/bgwriter.c	27 May 2007 03:50:39 -0000	1.38
--- src/backend/postmaster/bgwriter.c	15 Jun 2007 09:33:03 -0000
***************
*** 44,49 ****
--- 44,50 ----
  #include "postgres.h"
  
  #include <signal.h>
+ #include <sys/time.h>
  #include <time.h>
  #include <unistd.h>
  
***************
*** 117,122 ****
--- 118,124 ----
  	sig_atomic_t ckpt_failed;	/* advances when checkpoint fails */
  
  	sig_atomic_t ckpt_time_warn;	/* warn if too soon since last ckpt? */
+ 	sig_atomic_t ckpt_force;		/* any waiter for the checkpoint? */
  
  	int			num_requests;	/* current # of requests */
  	int			max_requests;	/* allocated array size */
***************
*** 131,136 ****
--- 133,139 ----
  int			BgWriterDelay = 200;
  int			CheckPointTimeout = 300;
  int			CheckPointWarning = 30;
+ double		checkpoint_write_percent = 50.0;
  
  /*
   * Flags set by interrupt handlers for later service in the main loop.
***************
*** 146,155 ****
--- 149,167 ----
  
  static bool ckpt_active = false;
  
+ /* Current time and WAL insert location when checkpoint was started */
+ static time_t		ckpt_start_time;
+ static XLogRecPtr	ckpt_start_recptr;
+ 
+ static double		ckpt_cached_elapsed;
+ 
  static time_t last_checkpoint_time;
  static time_t last_xlog_switch_time;
  
  
+ static void CheckArchiveTimeout(void);
+ static void BgWriterNap(void);
+ static bool IsCheckpointOnSchedule(double progress);
  static void bg_quickdie(SIGNAL_ARGS);
  static void BgSigHupHandler(SIGNAL_ARGS);
  static void ReqCheckpointHandler(SIGNAL_ARGS);
***************
*** 331,337 ****
  		bool		force_checkpoint = false;
  		time_t		now;
  		int			elapsed_secs;
- 		long		udelay;
  
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
--- 343,348 ----
***************
*** 354,360 ****
  		{
  			checkpoint_requested = false;
  			do_checkpoint = true;
! 			force_checkpoint = true;
  			BgWriterStats.m_requested_checkpoints++;
  		}
  		if (shutdown_requested)
--- 365,371 ----
  		{
  			checkpoint_requested = false;
  			do_checkpoint = true;
! 			force_checkpoint = BgWriterShmem->ckpt_force;
  			BgWriterStats.m_requested_checkpoints++;
  		}
  		if (shutdown_requested)
***************
*** 377,387 ****
  		 */
  		now = time(NULL);
  		elapsed_secs = now - last_checkpoint_time;
! 		if (elapsed_secs >= CheckPointTimeout)
  		{
  			do_checkpoint = true;
! 			if (!force_checkpoint)
! 				BgWriterStats.m_timed_checkpoints++;
  		}
  
  		/*
--- 388,397 ----
  		 */
  		now = time(NULL);
  		elapsed_secs = now - last_checkpoint_time;
! 		if (!do_checkpoint && elapsed_secs >= CheckPointTimeout)
  		{
  			do_checkpoint = true;
! 			BgWriterStats.m_timed_checkpoints++;
  		}
  
  		/*
***************
*** 404,409 ****
--- 414,420 ----
  								elapsed_secs),
  						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
  			BgWriterShmem->ckpt_time_warn = false;
+ 			BgWriterShmem->ckpt_force = false;
  
  			/*
  			 * Indicate checkpoint start to any waiting backends.
***************
*** 411,416 ****
--- 422,433 ----
  			ckpt_active = true;
  			BgWriterShmem->ckpt_started++;
  
+ 			elog(DEBUG1, "CHECKPOINT: start");
+ 
+ 			ckpt_start_recptr = GetInsertRecPtr();
+ 			ckpt_start_time = now;
+ 			ckpt_cached_elapsed = 0;
+ 
  			CreateCheckPoint(false, force_checkpoint);
  
  			/*
***************
*** 423,428 ****
--- 440,446 ----
  			 * Indicate checkpoint completion to any waiting backends.
  			 */
  			BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
+ 			elog(DEBUG1, "CHECKPOINT: end");
  			ckpt_active = false;
  
  			/*
***************
*** 436,446 ****
  			BgBufferSync();
  
  		/*
! 		 * Check for archive_timeout, if so, switch xlog files.  First we do a
! 		 * quick check using possibly-stale local state.
  		 */
! 		if (XLogArchiveTimeout > 0 &&
! 			(int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
  		{
  			/*
  			 * Update local state ... note that last_xlog_switch_time is the
--- 454,486 ----
  			BgBufferSync();
  
  		/*
! 		 * Check for archive_timeout, if so, switch xlog files.
  		 */
! 		CheckArchiveTimeout();
! 
! 		/* Nap for the configured time. */
! 		BgWriterNap();
! 	}
! }
! 
! /*
!  * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
!  *		if needed
!  */
! static void
! CheckArchiveTimeout(void)
! {
! 	time_t		now;
! 
! 	if (XLogArchiveTimeout <= 0)
! 		return;
! 
! 	now = time(NULL);
! 
! 	/* First we do a quick check using possibly-stale local state. */
! 	if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
! 		return;
! 
  		{
  			/*
  			 * Update local state ... note that last_xlog_switch_time is the
***************
*** 450,459 ****
  
  			last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
  
- 			/* if we did a checkpoint, 'now' might be stale too */
- 			if (do_checkpoint)
- 				now = time(NULL);
- 
  			/* Now we can do the real check */
  			if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
  			{
--- 490,495 ----
***************
*** 478,483 ****
--- 514,531 ----
  				last_xlog_switch_time = now;
  			}
  		}
+ }
+ 
+ /*
+  * BgWriterNap -- short nap in bgwriter
+  *
+  * Sleep for BgWriterDelay msec while buffer cleaning or a checkpoint is in
+  * progress, else for a fixed idle interval. Skipped if a request is pending.
+  */
+ static void
+ BgWriterNap(void)
+ {
+ 	long		udelay;
  
  		/*
  		 * Send off activity statistics to the stats collector
***************
*** 496,502 ****
  		 * We absorb pending requests after each short sleep.
  		 */
  		if ((bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0) ||
! 			(bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0))
  			udelay = BgWriterDelay * 1000L;
  		else if (XLogArchiveTimeout > 0)
  			udelay = 1000000L;	/* One second */
--- 544,551 ----
  		 * We absorb pending requests after each short sleep.
  		 */
  		if ((bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0) ||
! 			(bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0) ||
! 			ckpt_active)
  			udelay = BgWriterDelay * 1000L;
  		else if (XLogArchiveTimeout > 0)
  			udelay = 1000000L;	/* One second */
***************
*** 514,522 ****
--- 563,701 ----
  
  		if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
  			pg_usleep(udelay);
+ }
+ 
+ /*
+  * CheckpointWriteDelay -- throttle the write phase of a checkpoint
+  *
+  * Called periodically by the buffer manager while it writes out dirty
+  * buffers for a checkpoint. If we estimate that the checkpoint is on
+  * schedule (see IsCheckpointOnSchedule), we perform one round of normal
+  * bgwriter activity -- absorbing fsync requests, switching xlog segment if
+  * archive_timeout has passed, LRU cleaning of the buffer cache -- and then
+  * nap. If we are behind schedule we return at once so that the caller can
+  * catch up.
+  *
+  * 'progress' is an estimate of how much of the writing has been done, as a
+  * fraction between 0.0 (none) and 1.0 (all done).
+  */
+ void
+ CheckpointWriteDelay(double progress)
+ {
+ 	/*
+ 	 * Return immediately if we should finish the checkpoint ASAP.
+ 	 *
+ 	 * We could reload config here, but it gets complicated to handle changes
+ 	 * to settings that affect the checkpoint I/O throttling correctly.
+ 	 * Therefore we just finish ASAP if got_SIGHUP is set, and reload the
+ 	 * config after the checkpoint is finished.
+ 	 */
+ 	if (!am_bg_writer || checkpoint_write_percent <= 0 || got_SIGHUP ||
+ 		checkpoint_requested || shutdown_requested)
+ 		return;
+ 
+ 	elog(DEBUG1, "CheckpointWriteDelay: progress=%.3f", progress);
+ 
+ 	/* Take a nap and perform the usual bgwriter duties, unless we're behind
+ 	 * schedule, in which case we just try to catch up as quickly as possible.
+ 	 */
+ 	if (IsCheckpointOnSchedule(progress))
+ 	{
+ 		AbsorbFsyncRequests();
+ 		CheckArchiveTimeout();
+ 		BgLruBufferSync();
+ 		BgWriterNap();
  	}
  }
  
+ /*
+  * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
+  *		 in time?
+  *
+  * Scales 'progress' by checkpoint_write_percent and compares it against the
+  * fractions of checkpoint_timeout and checkpoint_segments used up since this
+  * checkpoint started. Returns true if progress is not behind either of them.
+  *
+  * Pending checkpoint requests are not examined here; that is up to the caller.
+  */
+ static bool
+ IsCheckpointOnSchedule(double progress)
+ {
+ 	struct timeval	now;
+ 	XLogRecPtr		recptr;
+ 	double			progress_in_time,
+ 					progress_in_xlog;
+ 
+ 	Assert(ckpt_active);
+ 
+ 	/* scale progress according to checkpoint_write_percent */
+ 	progress *= (checkpoint_write_percent / 100);
+ 
+ 	/*
+ 	 * Check against the cached value first. Only do the more expensive
+ 	 * calculations once we reach the target previously calculated. Since
+ 	 * neither time nor the WAL insert pointer moves backwards, a freshly
+ 	 * calculated value can only be greater than or equal to the cached value.
+ 	 */
+ 	if (progress < ckpt_cached_elapsed)
+ 	{
+ 		elog(DEBUG2, "IsCheckpointOnSchedule: Still behind cached=%.3f, progress=%.3f",
+ 			 ckpt_cached_elapsed, progress);
+ 		return false;
+ 	}
+ 
+ 	gettimeofday(&now, NULL);
+ 	
+ 	/*
+ 	 * Check progress against time elapsed and checkpoint_timeout.
+ 	 */
+ 	progress_in_time = ((double) (now.tv_sec - ckpt_start_time) +
+ 		now.tv_usec / 1000000.0) / CheckPointTimeout;
+ 
+ 	if (progress < progress_in_time)
+ 	{
+ 		elog(DEBUG2, "IsCheckpointOnSchedule: Behind checkpoint_timeout, time=%.3f, progress=%.3f",
+ 			 progress_in_time, progress);
+ 
+ 		ckpt_cached_elapsed = progress_in_time;
+ 
+ 		return false;
+ 	}
+ 
+ 	/*
+ 	 * Check progress against WAL segments written and checkpoint_segments.
+ 	 *
+ 	 * We compare the current WAL insert location against the location
+ 	 * computed before calling CreateCheckPoint. The code in XLogInsert that
+ 	 * actually triggers a checkpoint when checkpoint_segments is exceeded
+ 	 * compares against RedoRecptr, so this is not completely accurate.
+ 	 * However, it's good enough for our purposes, we're only calculating
+ 	 * an estimate anyway.
+ 	 */
+ 	recptr = GetInsertRecPtr();
+ 	progress_in_xlog =
+ 		(((double) recptr.xlogid - (double) ckpt_start_recptr.xlogid) * XLogSegsPerFile +
+ 		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+ 		CheckPointSegments;
+ 
+ 	if (progress < progress_in_xlog)
+ 	{
+ 		elog(DEBUG2, "IsCheckpointOnSchedule: Behind checkpoint_segments, xlog=%.3f, progress=%.3f",
+ 			 progress_in_xlog, progress);
+ 
+ 		ckpt_cached_elapsed = progress_in_xlog;
+ 
+ 		return false;
+ 	}
+ 
+ 
+ 	/* It looks like we're on schedule. */
+ 
+ 	elog(DEBUG2, "IsCheckpointOnSchedule: on schedule, time=%.3f, xlog=%.3f progress=%.3f",
+ 		 progress_in_time, progress_in_xlog, progress);
+ 
+ 	return true;
+ }
  
  /* --------------------------------
   *		signal handler routines
***************
*** 656,661 ****
--- 835,842 ----
  	/* Set warning request flag if appropriate */
  	if (warnontime)
  		bgs->ckpt_time_warn = true;
+ 	if (waitforit)
+ 		bgs->ckpt_force = true;
  
  	/*
  	 * Send signal to request checkpoint.  When waitforit is false, we
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.220
diff -c -r1.220 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	30 May 2007 20:11:58 -0000	1.220
--- src/backend/storage/buffer/bufmgr.c	15 Jun 2007 09:20:17 -0000
***************
*** 74,79 ****
--- 74,80 ----
  double		bgwriter_all_percent = 0.333;
  int			bgwriter_lru_maxpages = 5;
  int			bgwriter_all_maxpages = 5;
+ double		checkpoint_write_rate = 100; /* in pages/s */
  
  
  long		NDirectFileRead;	/* some I/O's are direct file access. bypass
***************
*** 645,651 ****
  	 * at 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
  	buf->flags |= BM_TAG_VALID;
  	buf->usage_count = 1;
  
--- 646,652 ----
  	 * at 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR);
  	buf->flags |= BM_TAG_VALID;
  	buf->usage_count = 1;
  
***************
*** 1000,1037 ****
   * BufferSync -- Write out all dirty buffers in the pool.
   *
   * This is called at checkpoint time to write out all dirty shared buffers.
   */
  void
! BufferSync(void)
  {
! 	int			buf_id;
  	int			num_to_scan;
  	int			absorb_counter;
  
  	/*
  	 * Find out where to start the circular scan.
  	 */
! 	buf_id = StrategySyncStart();
  
  	/* Make sure we can handle the pin inside SyncOneBuffer */
  	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
  
  	/*
! 	 * Loop over all buffers.
  	 */
  	num_to_scan = NBuffers;
  	absorb_counter = WRITES_PER_ABSORB;
  	while (num_to_scan-- > 0)
  	{
! 		if (SyncOneBuffer(buf_id, false))
  		{
  			BgWriterStats.m_buf_written_checkpoints++;
  
  			/*
  			 * If in bgwriter, absorb pending fsync requests after each
  			 * WRITES_PER_ABSORB write operations, to prevent overflow of the
  			 * fsync request queue.  If not in bgwriter process, this is a
  			 * no-op.
  			 */
  			if (--absorb_counter <= 0)
  			{
--- 1001,1122 ----
   * BufferSync -- Write out all dirty buffers in the pool.
   *
   * This is called at checkpoint time to write out all dirty shared buffers.
+  * If 'immediate' is true, write them all ASAP, otherwise throttle the
+  * I/O rate according to checkpoint_write_rate GUC variable, and perform
+  * normal bgwriter duties periodically.
   */
  void
! BufferSync(bool immediate)
  {
! 	int			buf_id, start_id;
  	int			num_to_scan;
+ 	int			num_to_write;
+ 	int			num_written;
  	int			absorb_counter;
+ 	int			num_written_since_nap;
+ 	int			writes_per_nap;
+ 
+ 	/*
+ 	 * Convert checkpoint_write_rate (pages/s) to the number of writes to
+ 	 * perform per nap of BgWriterDelay msec, with a floor of one write
+ 	 * between naps. The result is an integer, so we lose some precision;
+ 	 * the real rate also depends on the granularity of the OS timer,
+ 	 * whether any of the writes block, and time spent in CheckpointWriteDelay
+ 	 * performing normal bgwriter duties.
+ 	 */
+ 	writes_per_nap = Max(1, (int) (checkpoint_write_rate * BgWriterDelay / 1000));
  
  	/*
  	 * Find out where to start the circular scan.
  	 */
! 	start_id = StrategySyncStart();
  
  	/* Make sure we can handle the pin inside SyncOneBuffer */
  	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
  
  	/*
! 	 * Loop over all buffers, and mark the ones that need to be written with
! 	 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_write), so that we
! 	 * can estimate how much work needs to be done.
! 	 *
! 	 * This allows us to only write those pages that were dirty when the
! 	 * checkpoint began, and haven't been flushed to disk since. Whenever a
! 	 * page with BM_CHECKPOINT_NEEDED is written out by normal backends or
! 	 * the bgwriter LRU-scan, the flag is cleared, and any pages dirtied after
! 	 * this scan don't have the flag set.
  	 */
  	num_to_scan = NBuffers;
+ 	num_to_write = 0;
+ 	buf_id = start_id;
+ 	while (num_to_scan-- > 0)
+ 	{
+ 		volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
+ 
+ 		/*
+ 		 * Header spinlock is enough to examine BM_DIRTY, see comment in
+ 		 * SyncOneBuffer.
+ 		 */
+ 		LockBufHdr(bufHdr);
+ 
+ 		if (bufHdr->flags & BM_DIRTY)
+ 		{
+ 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+ 			num_to_write++;
+ 		}
+ 
+ 		UnlockBufHdr(bufHdr);
+ 
+ 		if (++buf_id >= NBuffers)
+ 			buf_id = 0;
+ 	}
+ 
+ 	elog(DEBUG1, "CHECKPOINT: %d / %d buffers to write", num_to_write, NBuffers);
+ 
+ 	/*
+ 	 * Loop over all buffers again, and write the ones (still) marked with
+ 	 * BM_CHECKPOINT_NEEDED.
+ 	 */
+ 	num_to_scan = NBuffers;
+ 	num_written = num_written_since_nap = 0;
  	absorb_counter = WRITES_PER_ABSORB;
+ 	buf_id = start_id;
  	while (num_to_scan-- > 0)
  	{
! 		volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
! 		bool needs_flush;
! 
! 		LockBufHdr(bufHdr);
! 
! 		needs_flush = (bufHdr->flags & BM_CHECKPOINT_NEEDED) != 0;
! 
! 		UnlockBufHdr(bufHdr);
! 
! 		if (needs_flush && SyncOneBuffer(buf_id, false))
  		{
  			BgWriterStats.m_buf_written_checkpoints++;
+ 			num_written++;
+ 
+ 			/*
+ 			 * Perform normal bgwriter duties and sleep to throttle
+ 			 * our I/O rate.
+ 			 */
+ 			if (!immediate && ++num_written_since_nap >= writes_per_nap)
+ 			{
+ 				num_written_since_nap = 0;
+ 				CheckpointWriteDelay((double) (num_written) / num_to_write);
+ 			}
  
  			/*
  			 * If in bgwriter, absorb pending fsync requests after each
  			 * WRITES_PER_ABSORB write operations, to prevent overflow of the
  			 * fsync request queue.  If not in bgwriter process, this is a
  			 * no-op.
+ 			 *
+ 			 * AbsorbFsyncRequests is also called inside CheckpointWriteDelay,
+ 			 * so this is partially redundant. However, we can't totally trust
+ 			 * on the call in CheckpointWriteDelay, because it's only made
+ 			 * before sleeping. In case CheckpointWriteDelay doesn't sleep,
+ 			 * we need to absorb pending requests ourselves.
  			 */
  			if (--absorb_counter <= 0)
  			{
***************
*** 1053,1059 ****
  BgBufferSync(void)
  {
  	static int	buf_id1 = 0;
- 	int			buf_id2;
  	int			num_to_scan;
  	int			num_written;
  
--- 1138,1143 ----
***************
*** 1099,1104 ****
--- 1183,1201 ----
  		BgWriterStats.m_buf_written_all += num_written;
  	}
  
+ 	BgLruBufferSync();
+ }
+ 
+ /*
+  * BgLruBufferSync -- Write out some lru dirty buffers in the pool.
+  */
+ void
+ BgLruBufferSync(void)
+ {
+ 	int			buf_id2;
+ 	int			num_to_scan;
+ 	int			num_written;
+ 
  	/*
  	 * This loop considers only unpinned buffers close to the clock sweep
  	 * point.
***************
*** 1341,1349 ****
   * flushed.
   */
  void
! FlushBufferPool(void)
  {
! 	BufferSync();
  	smgrsync();
  }
  
--- 1438,1449 ----
   * flushed.
   */
  void
! FlushBufferPool(bool immediate)
  {
! 	elog(DEBUG1, "CHECKPOINT: write phase");
! 	BufferSync(immediate || checkpoint_write_percent <= 0);	/* percent <= 0 means no throttling */
! 
! 	elog(DEBUG1, "CHECKPOINT: sync phase");
  	smgrsync();
  }
  
***************
*** 2132,2138 ****
  	Assert(buf->flags & BM_IO_IN_PROGRESS);
  	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
  	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~BM_DIRTY;
  	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
--- 2232,2238 ----
  	Assert(buf->flags & BM_IO_IN_PROGRESS);
  	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
  	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
  	buf->flags |= set_flag_bits;
  
  	UnlockBufHdr(buf);
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.396
diff -c -r1.396 guc.c
*** src/backend/utils/misc/guc.c	8 Jun 2007 18:23:52 -0000	1.396
--- src/backend/utils/misc/guc.c	14 Jun 2007 21:17:15 -0000
***************
*** 1866,1871 ****
--- 1866,1890 ----
  		0.1, 0.0, 100.0, NULL, NULL
  	},
  
+ 	{
+ 		{"checkpoint_write_percent", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("XXX Sets the duration percentage of write phase in checkpoints."),
+ 			NULL
+ 		},
+ 		&checkpoint_write_percent,
+ 		50.0, 0.0, 100.0, NULL, NULL
+ 	},
+ 
+ 	{
+ 		{"checkpoint_write_rate", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Minimum rate to write dirty buffers during checkpoints, in pages/s."),
+ 			NULL,
+ 			GUC_UNIT_BLOCKS
+ 		},
+ 		&checkpoint_write_rate,
+ 		100, 0.0, 10000.0, NULL, NULL
+ 	},
+ 
  	/* End-of-list marker */
  	{
  		{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL
Index: src/backend/utils/misc/postgresql.conf.sample
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.216
diff -c -r1.216 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample	3 Jun 2007 17:08:15 -0000	1.216
--- src/backend/utils/misc/postgresql.conf.sample	14 Jun 2007 21:17:42 -0000
***************
*** 168,173 ****
--- 168,175 ----
  
  #checkpoint_segments = 3		# in logfile segments, min 1, 16MB each
  #checkpoint_timeout = 5min		# range 30s-1h
+ #checkpoint_write_percent = 50.0	# write phase duration, % of checkpoint_timeout; range 0-100
+ #checkpoint_write_rate = 100.0		# minimum checkpoint write rate, in pages/s
  #checkpoint_warning = 30s		# 0 is off
  
  # - Archiving -
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/xlog.h,v
retrieving revision 1.78
diff -c -r1.78 xlog.h
*** src/include/access/xlog.h	30 May 2007 20:12:02 -0000	1.78
--- src/include/access/xlog.h	12 Jun 2007 08:16:55 -0000
***************
*** 174,179 ****
--- 174,180 ----
  extern void CreateCheckPoint(bool shutdown, bool force);
  extern void XLogPutNextOid(Oid nextOid);
  extern XLogRecPtr GetRedoRecPtr(void);
+ extern XLogRecPtr GetInsertRecPtr(void);
  extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);
  
  #endif   /* XLOG_H */
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.9
diff -c -r1.9 bgwriter.h
*** src/include/postmaster/bgwriter.h	5 Jan 2007 22:19:57 -0000	1.9
--- src/include/postmaster/bgwriter.h	15 Jun 2007 08:11:17 -0000
***************
*** 20,29 ****
--- 20,31 ----
  extern int	BgWriterDelay;
  extern int	CheckPointTimeout;
  extern int	CheckPointWarning;
+ extern double	checkpoint_write_percent;
  
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(bool waitforit, bool warnontime);
+ extern void CheckpointWriteDelay(double progress);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
  extern void AbsorbFsyncRequests(void);
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.90
diff -c -r1.90 buf_internals.h
*** src/include/storage/buf_internals.h	30 May 2007 20:12:03 -0000	1.90
--- src/include/storage/buf_internals.h	12 Jun 2007 11:42:23 -0000
***************
*** 35,40 ****
--- 35,41 ----
  #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
  #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
  #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
+ #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* this needs to be written in checkpoint */
  
  typedef bits16 BufFlags;
  
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.104
diff -c -r1.104 bufmgr.h
*** src/include/storage/bufmgr.h	30 May 2007 20:12:03 -0000	1.104
--- src/include/storage/bufmgr.h	14 Jun 2007 12:50:13 -0000
***************
*** 36,41 ****
--- 36,42 ----
  extern double bgwriter_all_percent;
  extern int	bgwriter_lru_maxpages;
  extern int	bgwriter_all_maxpages;
+ extern double	checkpoint_write_rate;
  
  /* in buf_init.c */
  extern DLLIMPORT char *BufferBlocks;
***************
*** 136,142 ****
  extern void ResetBufferUsage(void);
  extern void AtEOXact_Buffers(bool isCommit);
  extern void PrintBufferLeakWarning(Buffer buffer);
! extern void FlushBufferPool(void);
  extern BlockNumber BufferGetBlockNumber(Buffer buffer);
  extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
--- 137,143 ----
  extern void ResetBufferUsage(void);
  extern void AtEOXact_Buffers(bool isCommit);
  extern void PrintBufferLeakWarning(Buffer buffer);
! extern void FlushBufferPool(bool immediate);
  extern BlockNumber BufferGetBlockNumber(Buffer buffer);
  extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
***************
*** 161,168 ****
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BufferSync(void);
  extern void BgBufferSync(void);
  
  extern void AtProcExit_LocalBuffers(void);
  
--- 162,170 ----
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BufferSync(bool immediate);
  extern void BgBufferSync(void);
+ extern void BgLruBufferSync(void);
  
  extern void AtProcExit_LocalBuffers(void);
  
