Streaming replication and a disk full in primary

Started by Fujii Masao (18 messages)
#1Fujii Masao
masao.fujii@gmail.com
1 attachment(s)

Hi,

If the primary has a connected standby, the WAL files required for
the standby cannot be deleted. So if it has fallen too far behind
for some reason, a disk full failure might occur on the primary.
This is one of the problems that should be fixed for v9.0.

We can cope with that case by carefully monitoring the standby lag.
In addition to this, I think that we should put an upper limit on
the number of WAL files held in pg_xlog for the standby (i.e.,
the maximum delay of the standby) as a safeguard against a disk
full error.

The attached patch introduces new GUC 'replication_lag_segments'
which specifies the maximum number of WAL files held in pg_xlog
to send to the standby. Replication to a standby which falls
more than that upper limit behind is automatically terminated,
which would avoid a disk full error on the primary.
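
For a rough sense of the lag computation involved, the distance in
segments between the current write position and a walsender's send
position can be computed along these lines (a simplified 64-bit sketch,
not the patch's code; XLogRecPtr, XLogSegSize and XLogSegsPerFile are
the existing backend definitions, the function names are illustrative):

    /* Convert an XLogRecPtr into a linear segment number (sketch only). */
    static uint64
    PtrToSegNo(XLogRecPtr ptr)
    {
        return (uint64) ptr.xlogid * XLogSegsPerFile +
               ptr.xrecoff / XLogSegSize;
    }

    /*
     * True if the standby whose send position is 'sentptr' has fallen
     * more than 'max_lag_segs' segments behind the write position.
     */
    static bool
    StandbyTooFarBehind(XLogRecPtr writeptr, XLogRecPtr sentptr,
                        int max_lag_segs)
    {
        return PtrToSegNo(writeptr) >
               PtrToSegNo(sentptr) + (uint64) max_lag_segs;
    }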

This GUC is also useful for holding some WAL files for an incoming
standby. To some extent, it avoids the problem that a WAL file
required by the standby no longer exists on the primary at the
start of replication.

The code is also available in the 'replication' branch in my
git repository.

git://git.postgresql.org/git/users/fujii/postgres.git
branch: replication

Comment? Objection? Review?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Attachments:

replication_lag_segments_0121.patch (text/x-patch; charset=US-ASCII)
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1789,1794 **** archive_command = 'copy "%p" "C:\\server\\archivedir\\%f"'  # Windows
--- 1789,1812 ----
         </para>
         </listitem>
        </varlistentry>
+       <varlistentry id="guc-replication-lag-segments" xreflabel="replication_lag_segments">
+        <term><varname>replication_lag_segments</varname> (<type>integer</type>)</term>
+        <indexterm>
+         <primary><varname>replication_lag_segments</> configuration parameter</primary>
+        </indexterm>
+        <listitem>
+        <para>
+         Specifies the maximum number of log file segments held in <filename>pg_xlog</>
+         directory to send to the standby server (each segment is normally 16 megabytes).
+         The replication to the standby server which falls more than <varname>
+         replication_lag_segments</> behind is terminated. This is useful for
+         avoiding a disk full error on the primary and holding the segments required for
+         the incoming standby server. The default value is zero, which disables that
+         upper limit. This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+        </listitem>
+       </varlistentry>
       </variablelist>
      </sect2>
      <sect2 id="runtime-config-standby">
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 1725,1730 **** XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
--- 1725,1736 ----
  				if (XLogArchivingActive())
  					XLogArchiveNotifySeg(openLogId, openLogSeg);
  
+ 				/*
+ 				 * Check for the standbys' delay and terminate replication
+ 				 * if needed.
+ 				 */
+ 				CheckStandbysDelay(LogwrtResult.Write);
+ 
  				Write->lastSegSwitchTime = (pg_time_t) time(NULL);
  
  				/*
***************
*** 7213,7240 **** CreateCheckPoint(int flags)
  	smgrpostckpt();
  
  	/*
! 	 * If there's connected standby servers doing XLOG streaming, don't
! 	 * delete XLOG files that have not been streamed to all of them yet.
! 	 * This does nothing to prevent them from being deleted when the
! 	 * standby is disconnected (e.g because of network problems), but at
! 	 * least it avoids an open replication connection from failing because
  	 * of that.
  	 */
  	if ((_logId || _logSeg) && MaxWalSenders > 0)
  	{
- 		XLogRecPtr oldest;
  		uint32	log;
  		uint32	seg;
  
! 		oldest = GetOldestWALSendPointer();
! 		if (oldest.xlogid != 0 || oldest.xrecoff != 0)
  		{
! 			XLByteToSeg(oldest, log, seg);
! 			if (log < _logId || (log == _logId && seg < _logSeg))
! 			{
! 				_logId	= log;
! 				_logSeg	= seg;
! 			}
  		}
  	}
  
--- 7219,7260 ----
  	smgrpostckpt();
  
  	/*
! 	 * Don't delete XLOG files which could be still required for
! 	 * connected or incoming standbys, under the given upper limit.
! 	 * This avoids a replication connection from failing because
  	 * of that.
  	 */
  	if ((_logId || _logSeg) && MaxWalSenders > 0)
  	{
  		uint32	log;
  		uint32	seg;
+ 		bool	need_comp = true;
  
! 		if (RepLagSegs > 0)
  		{
! 			/*
! 			 * Ensure that there is no too lagged standbys before
! 			 * deleting XLOG files.
! 			 */
! 			CheckStandbysDelay(recptr);
! 			XLByteToSeg(recptr, log, seg);
! 			PrevLogSegs(log, seg, RepLagSegs - 1);
! 		}
! 		else
! 		{
! 			XLogRecPtr oldest;
! 
! 			oldest = GetOldestWALSendPointer();
! 			if (oldest.xlogid == 0 || oldest.xrecoff == 0)
! 				need_comp = false;
! 			else
! 				XLByteToSeg(oldest, log, seg);
! 		}
! 
! 		if (need_comp && (log < _logId || (log == _logId && seg < _logSeg)))
! 		{
! 			_logId	= log;
! 			_logSeg	= seg;
  		}
  	}
  
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 66,71 **** bool	am_walsender	= false;	/* Am I a walsender process ? */
--- 66,72 ----
  /* User-settable parameters for walsender */
  int	MaxWalSenders = 0;		/* the maximum number of concurrent walsenders */
  int	WalSndDelay	= 200;		/* max sleep time between some actions */
+ int	RepLagSegs	= 0;		/* the maximum number of WAL files held for standby */
  
  #define NAPTIME_PER_CYCLE 100	/* max sleep time between cycles (100ms) */
  
***************
*** 86,94 **** static XLogRecPtr sentPtr = {0, 0};
  
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
! static volatile sig_atomic_t shutdown_requested = false;
  static volatile sig_atomic_t ready_to_stop = false;
  
  /* Signal handlers */
  static void WalSndSigHupHandler(SIGNAL_ARGS);
  static void WalSndShutdownHandler(SIGNAL_ARGS);
--- 87,150 ----
  
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
! static volatile sig_atomic_t got_SIGTERM = false;
  static volatile sig_atomic_t ready_to_stop = false;
  
+ static void ProcessWalSndInterrupts(void);
+ static void EnableWalSndImmediateExit(void);
+ static void DisableWalSndImmediateExit(void);
+ 
+ /*
+  * About SIGTERM handling:
+  *
+  * We can't just exit(1) within SIGTERM signal handler, because the signal
+  * might arrive in the middle of some critical operation, like while we're
+  * holding a spinlock. We also can't just set a flag in signal handler and
+  * check it in the main loop, because we perform some blocking libpq
+  * operations like pq_flush(), which can take a long time to finish.
+  *
+  * We use a combined approach: When WalSndImmediateInterruptOK is true, it's
+  * safe for the signal handler to elog(FATAL) immediately. Otherwise it just
+  * sets got_SIGTERM flag, which is checked in the main loop when convenient.
+  *
+  * This is very much like what regular backends do with ImmediateInterruptOK,
+  * ProcessInterrupts() etc.
+  */
+ static volatile bool WalSndImmediateInterruptOK = false;
+ 
+ static void
+ ProcessWalSndInterrupts(void)
+ {
+ 	/*
+ 	 * Although walsender interrupt handling doesn't use the same scheme
+ 	 * as regular backends, call CHECK_FOR_INTERRUPTS() to make sure we
+ 	 * receive any incoming signals on Win32.
+ 	 */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	if (got_SIGTERM)
+ 	{
+ 		WalSndImmediateInterruptOK = false;
+ 		ereport(FATAL,
+ 				(errcode(ERRCODE_ADMIN_SHUTDOWN),
+ 				 errmsg("terminating replication because the standby falls too far behind")));
+ 	}
+ }
+ 
+ static void
+ EnableWalSndImmediateExit()
+ {
+ 	WalSndImmediateInterruptOK = true;
+ 	ProcessWalSndInterrupts();
+ }
+ 
+ static void
+ DisableWalSndImmediateExit()
+ {
+ 	WalSndImmediateInterruptOK = false;
+ 	ProcessWalSndInterrupts();
+ }
+ 
  /* Signal handlers */
  static void WalSndSigHupHandler(SIGNAL_ARGS);
  static void WalSndShutdownHandler(SIGNAL_ARGS);
***************
*** 386,396 **** WalSndLoop(void)
  		if (ready_to_stop)
  		{
  			XLogSend(&output_message);
! 			shutdown_requested = true;
  		}
  
  		/* Normal exit from the walsender is here */
! 		if (shutdown_requested)
  		{
  			/* Inform the standby that XLOG streaming was done */
  			pq_puttextmessage('C', "COPY 0");
--- 442,452 ----
  		if (ready_to_stop)
  		{
  			XLogSend(&output_message);
! 			got_SIGTERM = true;
  		}
  
  		/* Normal exit from the walsender is here */
! 		if (got_SIGTERM)
  		{
  			/* Inform the standby that XLOG streaming was done */
  			pq_puttextmessage('C', "COPY 0");
***************
*** 410,416 **** WalSndLoop(void)
  		remain = WalSndDelay;
  		while (remain > 0)
  		{
! 			if (got_SIGHUP || shutdown_requested || ready_to_stop)
  				break;
  
  			/*
--- 466,472 ----
  		remain = WalSndDelay;
  		while (remain > 0)
  		{
! 			if (got_SIGHUP || got_SIGTERM || ready_to_stop)
  				break;
  
  			/*
***************
*** 586,591 **** XLogSend(StringInfo outMsg)
--- 642,648 ----
  {
  	XLogRecPtr	SendRqstPtr;
  	char	activitymsg[50];
+ 	int	res;
  	/* use volatile pointer to prevent code rearrangement */
  	volatile WalSnd *walsnd = MyWalSnd;
  
***************
*** 676,682 **** XLogSend(StringInfo outMsg)
--- 733,741 ----
  		outMsg->len += nbytes;
  		outMsg->data[outMsg->len] = '\0';
  
+ 		EnableWalSndImmediateExit();
  		pq_putmessage('d', outMsg->data, outMsg->len);
+ 		DisableWalSndImmediateExit();
  		resetStringInfo(outMsg);
  	}
  
***************
*** 686,692 **** XLogSend(StringInfo outMsg)
  	SpinLockRelease(&walsnd->mutex);
  
  	/* Flush pending output */
! 	if (pq_flush())
  		return false;
  
  	/* Report progress of XLOG streaming in PS display */
--- 745,754 ----
  	SpinLockRelease(&walsnd->mutex);
  
  	/* Flush pending output */
! 	EnableWalSndImmediateExit();
! 	res = pq_flush();
! 	DisableWalSndImmediateExit();
! 	if (res)
  		return false;
  
  	/* Report progress of XLOG streaming in PS display */
***************
*** 704,714 **** WalSndSigHupHandler(SIGNAL_ARGS)
  	got_SIGHUP = true;
  }
  
! /* SIGTERM: set flag to shut down */
  static void
  WalSndShutdownHandler(SIGNAL_ARGS)
  {
! 	shutdown_requested = true;
  }
  
  /*
--- 766,780 ----
  	got_SIGHUP = true;
  }
  
! /* SIGTERM: set flag for main loop, or shutdown immediately if safe */
  static void
  WalSndShutdownHandler(SIGNAL_ARGS)
  {
! 	got_SIGTERM = true;
! 
! 	/* Don't joggle the elbow of proc_exit */
! 	if (!proc_exit_inprogress && WalSndImmediateInterruptOK)
! 		ProcessWalSndInterrupts();
  }
  
  /*
***************
*** 844,846 **** GetOldestWALSendPointer(void)
--- 910,976 ----
  	}
  	return oldest;
  }
+ 
+ /*
+  * Check if the standbys have fallen too far behind.
+  */
+ void
+ CheckStandbysDelay(XLogRecPtr newptr)
+ {
+ 	uint32	old_segno;
+ 	uint32	new_segno;
+ 	uint32	old_highbits;
+ 	uint32	new_highbits;
+ 	int	i;
+ 	bool	found = false;
+ 
+ 	if (RepLagSegs <= 0)
+ 		return;
+ 
+ 	for (i = 0; i < MaxWalSenders; i++)
+ 	{
+ 		/* use volatile pointer to prevent code rearrangement */
+ 		volatile WalSnd	*walsnd = &WalSndCtl->walsnds[i];
+ 		XLogRecPtr	oldptr;
+ 		pid_t	walsndpid;
+ 
+ 		SpinLockAcquire(&walsnd->mutex);
+ 		oldptr = walsnd->sentPtr;
+ 		walsndpid = walsnd->pid;
+ 		SpinLockRelease(&walsnd->mutex);
+ 
+ 		if (walsndpid == 0)
+ 			continue;
+ 
+ 		/* 
+ 		 * Check to see whether this standby has fallen behind more than
+ 		 * the upper limit.
+ 		 *
+ 		 * This code is based on XLogCheckpointNeeded().
+ 		 */
+ 		old_segno = (oldptr.xlogid % XLogSegSize) * XLogSegsPerFile +
+ 			(oldptr.xrecoff / XLogSegSize);
+ 		old_highbits = oldptr.xlogid / XLogSegSize;
+ 		if (!found)
+ 		{
+ 			new_segno = (newptr.xlogid % XLogSegSize) * XLogSegsPerFile +
+ 				(newptr.xrecoff / XLogSegSize);
+ 			new_highbits = newptr.xlogid / XLogSegSize;
+ 			found = true;
+ 		}
+ 
+ 		if ((new_highbits != old_highbits ||
+ 			 new_segno >= old_segno + (uint32) (RepLagSegs - 1)) &&
+ 			walsndpid == walsnd->pid)
+ 			kill(walsndpid, SIGTERM);
+ 		/*
+ 		 * XXX: Should we recycle (or remove) old log files here? Otherwise
+ 		 * the number of log files would continue to increase until the next
+ 		 * checkpoint has recycled them. Which increases the chance of a disk
+ 		 * full failure. But it's not good to recycle log files during
+ 		 * acquiring WALWriteLock.
+ 		 */
+ 
+ 		/* standby is keeping up well */
+ 	}
+ }
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1721,1726 **** static struct config_int ConfigureNamesInt[] =
--- 1721,1735 ----
  	},
  
  	{
+ 		{"replication_lag_segments", PGC_SIGHUP, WAL_REPLICATION,
+ 			gettext_noop("Sets the maximum number of WAL files held for the standby server."),
+ 			NULL
+ 		},
+ 		&RepLagSegs,
+ 		0, 0, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
  			gettext_noop("Sets the delay in microseconds between transaction commit and "
  						 "flushing WAL to disk."),
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 189,194 ****
--- 189,195 ----
  
  #max_wal_senders = 0		# max number of walsender processes
  #wal_sender_delay = 200ms	# 1-10000 milliseconds
+ #replication_lag_segments = 0	# in logfile segments, 16MB each; 0 disables
  
  
  #------------------------------------------------------------------------------
*** a/src/include/access/xlog_internal.h
--- b/src/include/access/xlog_internal.h
***************
*** 151,156 **** typedef XLogLongPageHeaderData *XLogLongPageHeader;
--- 151,170 ----
  		} \
  	} while (0)
  
+ /* Decrement an xlogid/segment pair by segs */
+ #define PrevLogSegs(logId, logSeg, segs)	\
+ 	do {	\
+ 		logId	-= segs / XLogSegsPerFile;	\
+ 		logSeg	-= segs % XLogSegsPerFile;	\
+ 		if (logSeg < 0)	\
+ 		{	\
+ 			logId--;	\
+ 			logSeg	+= XLogSegsPerFile;	\
+ 		}	\
+ 		if (logId < 0)	\
+ 			logId = logSeg = 0;	\
+ 	} while (0)
+ 
  /* Align a record pointer to next page */
  #define NextLogPage(recptr)	\
  	do {	\
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 39,49 **** extern bool	am_walsender;
--- 39,51 ----
  
  /* user-settable parameters */
  extern int	WalSndDelay;
+ extern int	RepLagSegs;
  
  extern int WalSenderMain(void);
  extern void WalSndSignals(void);
  extern Size WalSndShmemSize(void);
  extern void WalSndShmemInit(void);
  extern XLogRecPtr GetOldestWALSendPointer(void);
+ extern void CheckStandbysDelay(XLogRecPtr newptr);
  
  #endif	/* _WALSENDER_H */
#2Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#1)
Re: Streaming replication and a disk full in primary

Fujii Masao wrote:

If the primary has a connected standby, the WAL files required for
the standby cannot be deleted. So if it has fallen too far behind
for some reasons, a disk full failure might occur on the primary.
This is one of the problems that should be fixed for v9.0.

We can cope with that case by carefully monitoring the standby lag.
In addition to this, I think that we should put an upper limit on
the number of WAL files held in pg_xlog for the standby (i.e.,
the maximum delay of the standby) as a safeguard against a disk
full error.

The attached patch introduces new GUC 'replication_lag_segments'
which specifies the maximum number of WAL files held in pg_xlog
to send to the standby. The replication to the standby which
falls more than the upper limit behind is automatically terminated,
which would avoid a disk full erro on the primary.

Thanks!

I don't think we should do the check in XLogWrite(). There's really no
reason to kill the standby connections before the next checkpoint, when
the old WAL files are recycled. XLogWrite() is in the critical path of
normal operations, too.

There's another important reason for that: If archiving is not working
for some reason, the standby can't obtain the old segments from the
archive either. If we refuse to stream such old segments, and they're
not getting archived, the standby has no way to catch up until archiving
is fixed. Allowing streaming of such old segments is free wrt. disk
space, because we're keeping the files around anyway.

Walreceiver will get an error if it tries to open a segment that's been
deleted or recycled already. The dangerous situation we need to avoid is
when walreceiver holds a file open while bgwriter recycles it.
Walreceiver will merrily continue streaming data from it, even though
it's been overwritten by new data already.

A straightforward fix is to keep a "newest recycled XLogRecPtr" in
shared memory that RemoveOldXlogFiles() updates. Walreceiver checks it
right after read()ing from a file, before sending it to the client, and
throws an error if the data it read() was already recycled.
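
The update side of that could look roughly like this inside
RemoveOldXlogFiles() (a sketch only; the lastRemovedLog/lastRemovedSeg
fields under XLogCtl are illustrative names, protected here by the
existing info_lck spinlock):

    /* Remember the newest log/seg we are about to recycle or remove. */
    SpinLockAcquire(&XLogCtl->info_lck);
    if (log > XLogCtl->lastRemovedLog ||
        (log == XLogCtl->lastRemovedLog && seg > XLogCtl->lastRemovedSeg))
    {
        XLogCtl->lastRemovedLog = log;
        XLogCtl->lastRemovedSeg = seg;
    }
    SpinLockRelease(&XLogCtl->info_lck);

The walsender would then compare the segment it just read against that
value before sending, and error out if the segment is at or below it.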

Or you could do it entirely in walreceiver, by calling fstat() on the
open file instead of checking the variable in shared memory. If the
filename isn't what you expect, indicating that it's been recycled,
throw an error. But that needs an extra fstat() call for every read().
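
For illustration, that variant could look roughly like this (sendFile and
path being the walsender's open segment and its file name; <sys/stat.h>
assumed, and the inode comparison is just one way of detecting that the
file was renamed away by recycling):

    struct stat fdstat,
                pathstat;

    /*
     * After read()ing from sendFile, make sure the segment file is still
     * the one we opened, i.e. it hasn't been recycled (renamed) or
     * removed under us.
     */
    if (fstat(sendFile, &fdstat) < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not stat WAL segment \"%s\": %m", path)));
    if (stat(path, &pathstat) < 0 ||
        fdstat.st_ino != pathstat.st_ino ||
        fdstat.st_dev != pathstat.st_dev)
        ereport(ERROR,
                (errmsg("WAL segment \"%s\" was recycled while it was being sent",
                        path)));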

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#3Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#2)
Re: Streaming replication and a disk full in primary

Thanks for the review! And, sorry for the delay.

On Thu, Jan 21, 2010 at 11:10 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

I don't think we should do the check in XLogWrite(). There's really no
reason to kill the standby connections before the next checkpoint, when
the old WAL files are recycled. XLogWrite() is in the critical path of
normal operations, too.

OK. I'll remove that check from XLogWrite().

There's another important reason for that: If archiving is not working
for some reason, the standby can't obtain the old segments from the
archive either. If we refuse to stream such old segments, and they're
not getting archived, the standby has no way to catch up until archiving
is fixed. Allowing streaming of such old segments is free wrt. disk
space, because we're keeping the files around anyway.

OK. We should terminate the walsender whose currently-opened WAL file
has already been archived, isn't required for crash recovery, AND is
'max-lag' older than the currently-written one. I'll change it so.

Walreceiver will get an error if it tries to open a segment that's been
deleted or recycled already. The dangerous situation we need to avoid is
when walreceiver holds a file open while bgwriter recycles it.
Walreceiver will merrily continue streaming data from it, even though
it's been overwritten by new data already.

s/walreceiver/walsender ?

Yes, that's the problem that I'll have to fix.

A straightforward fix is to keep a "newest recycled XLogRecPtr" in
shared memory that RemoveOldXlogFiles() updates. Walreceiver checks it
right after read()ing from a file, before sending it to the client, and
throws an error if the data it read() was already recycled.

I prefer this. But I don't think such an aggressive check of a "newest
recycled XLogRecPtr" is required if the bgwriter always doesn't delete
the WAL file which is newer than or equal to the walsenders' oldest WAL
file. In other words, the WAL files which the walsender is reading (or
will read) are not removed at the moment.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#4Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#3)
1 attachment(s)
Re: Streaming replication and a disk full in primary

This task has been languishing for a long time, so I took a shot at it.
I took the approach I suggested before, keeping a variable in shared
memory to track the latest removed WAL segment. After walsender has read
a bunch of WAL records from a WAL file, it checks that what it read is
after the latest removed WAL segment; otherwise the data it read might
have come from a file that was already recycled and overwritten with new
data, and an error is thrown.

This changes the behavior so that if a standby server doing streaming
replication falls behind too much, the primary will remove/recycle a WAL
segment needed by the standby server. The previous behavior was that WAL
segments still needed by any connected standby server were never
removed, at the risk of filling the disk in the primary if a standby
server behaves badly.

In your version of this patch, the default was still the current
behavior where the primary retains WAL files that are still needed by
connected standby servers indefinitely. I think that's a dangerous
default, so I changed it so that if you don't set standby_keep_segments,
the primary doesn't retain any extra segments; the number of WAL
segments available for standby servers is determined only by the
location of the previous checkpoint, and the status of WAL archiving.
That makes the code a bit simpler too, as we never care how far the
walsenders are. In fact, the GetOldestWALSenderPointer() function is now
dead code.

Fujii Masao wrote:

Thanks for the review! And, sorry for the delay.

On Thu, Jan 21, 2010 at 11:10 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

I don't think we should do the check in XLogWrite(). There's really no
reason to kill the standby connections before the next checkpoint, when
the old WAL files are recycled. XLogWrite() is in the critical path of
normal operations, too.

OK. I'll remove that check from XLogWrite().

There's another important reason for that: If archiving is not working
for some reason, the standby can't obtain the old segments from the
archive either. If we refuse to stream such old segments, and they're
not getting archived, the standby has no way to catch up until archiving
is fixed. Allowing streaming of such old segments is free wrt. disk
space, because we're keeping the files around anyway.

OK. We should terminate the walsender whose currently-opened WAL file
has already been archived, isn't required for crash recovery, AND is
'max-lag' older than the currently-written one. I'll change it so.

Walreceiver will get an error if it tries to open a segment that's been
deleted or recycled already. The dangerous situation we need to avoid is
when walreceiver holds a file open while bgwriter recycles it.
Walreceiver will merrily continue streaming data from it, even though
it's been overwritten by new data already.

s/walreceiver/walsender ?

Yes, that's the problem that I'll have to fix.

A straightforward fix is to keep a "newest recycled XLogRecPtr" in
shared memory that RemoveOldXlogFiles() updates. Walreceiver checks it
right after read()ing from a file, before sending it to the client, and
throws an error if the data it read() was already recycled.

I prefer this. But I don't think such an aggressive check of a "newest
recycled XLogRecPtr" is required if the bgwriter always doesn't delete
the WAL file which is newer than or equal to the walsenders' oldest WAL
file. In other words, the WAL files which the walsender is reading (or
will read) are not removed at the moment.

Regards,

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

Attachments:

standby_keep_segments-1.patch (text/x-diff)
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1823,1828 **** archive_command = 'copy "%p" "C:\\server\\archivedir\\%f"'  # Windows
--- 1823,1856 ----
         </para>
         </listitem>
        </varlistentry>
+ 
+       <varlistentry id="guc-replication-lag-segments" xreflabel="replication_lag_segments">
+        <term><varname>standby_keep_segments</varname> (<type>integer</type>)</term>
+        <indexterm>
+         <primary><varname>standby_keep_segments</> configuration parameter</primary>
+        </indexterm>
+        <listitem>
+        <para>
+         Specifies the number of log file segments kept in <filename>pg_xlog</>
+         directory, in case a standby server needs to fetch them via streaming
+         replication. Each segment is normally 16 megabytes. If a standby
+         server connected to the primary falls behind more than
+         <varname>standby_keep_segments</> segments, the primary might remove
+         a WAL segment still needed by the standby and the replication
+         connection will be terminated.
+ 
+         This sets only the minimum number of segments retained for standby
+         purposes, the system might need to retain more segments for WAL
+         archival or to recover from a checkpoint. If standby_keep_segments
+         is zero (the default), the system doesn't keep any extra segments
+         for standby purposes, and the number of old WAL segments available
+         for standbys is determined based only on the location of the previous
+         checkpoint and status of WAL archival.
+         This parameter can only be set in the <filename>postgresql.conf</>
+         file or on the server command line.
+        </para>
+        </listitem>
+       </varlistentry>
       </variablelist>
      </sect2>
      <sect2 id="runtime-config-standby">
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 66,71 ****
--- 66,72 ----
  
  /* User-settable parameters */
  int			CheckPointSegments = 3;
+ int			StandbySegments = 0;
  int			XLOGbuffers = 8;
  int			XLogArchiveTimeout = 0;
  bool		XLogArchiveMode = false;
***************
*** 356,361 **** typedef struct XLogCtlData
--- 357,364 ----
  	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
  	TransactionId ckptXid;
  	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
+ 	uint32		lastRemovedLog;	/* latest removed/recycled XLOG segment */
+ 	uint32		lastRemovedSeg;
  
  	/* Protected by WALWriteLock: */
  	XLogCtlWrite Write;
***************
*** 3150,3155 **** PreallocXlogFiles(XLogRecPtr endptr)
--- 3153,3174 ----
  }
  
  /*
+  * Get the log/seg of the latest removed or recycled WAL segment.
+  * Returns 0 if no WAL segments have been removed since startup.
+  */
+ void
+ XLogGetLastRemoved(uint32 *log, uint32 *seg)
+ {
+ 	/* use volatile pointer to prevent code rearrangement */
+ 	volatile XLogCtlData *xlogctl = XLogCtl;
+ 
+ 	SpinLockAcquire(&xlogctl->info_lck);
+ 	*log = xlogctl->lastRemovedLog;
+ 	*seg = xlogctl->lastRemovedSeg;
+ 	SpinLockRelease(&xlogctl->info_lck);
+ }
+ 
+ /*
   * Recycle or remove all log files older or equal to passed log/seg#
   *
   * endptr is current (or recent) end of xlog; this is used to determine
***************
*** 3170,3175 **** RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
--- 3189,3208 ----
  	char		newpath[MAXPGPATH];
  #endif
  	struct stat statbuf;
+ 	/* use volatile pointer to prevent code rearrangement */
+ 	volatile XLogCtlData *xlogctl = XLogCtl;
+ 
+ 	/* Update the last removed location in shared memory first */
+ 	SpinLockAcquire(&xlogctl->info_lck);
+ 	if (log > xlogctl->lastRemovedLog ||
+ 		(log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
+ 	{
+ 		xlogctl->lastRemovedLog = log;
+ 		xlogctl->lastRemovedSeg = seg;
+ 	}
+ 	SpinLockRelease(&xlogctl->info_lck);
+ 
+ 	elog(DEBUG1, "removing WAL segments older than %X/%X", log, seg);
  
  	/*
  	 * Initialize info about where to try to recycle to.  We allow recycling
***************
*** 7101,7136 **** CreateCheckPoint(int flags)
  	smgrpostckpt();
  
  	/*
! 	 * If there's connected standby servers doing XLOG streaming, don't delete
! 	 * XLOG files that have not been streamed to all of them yet. This does
! 	 * nothing to prevent them from being deleted when the standby is
! 	 * disconnected (e.g because of network problems), but at least it avoids
! 	 * an open replication connection from failing because of that.
  	 */
! 	if ((_logId || _logSeg) && max_wal_senders > 0)
  	{
! 		XLogRecPtr	oldest;
! 		uint32		log;
! 		uint32		seg;
! 
! 		oldest = GetOldestWALSendPointer();
! 		if (oldest.xlogid != 0 || oldest.xrecoff != 0)
  		{
! 			XLByteToSeg(oldest, log, seg);
  			if (log < _logId || (log == _logId && seg < _logSeg))
  			{
  				_logId = log;
  				_logSeg = seg;
  			}
  		}
- 	}
  
- 	/*
- 	 * Delete old log files (those no longer needed even for previous
- 	 * checkpoint or the standbys in XLOG streaming).
- 	 */
- 	if (_logId || _logSeg)
- 	{
  		PrevLogSeg(_logId, _logSeg);
  		RemoveOldXlogFiles(_logId, _logSeg, recptr);
  	}
--- 7134,7184 ----
  	smgrpostckpt();
  
  	/*
! 	 * Delete old log files (those no longer needed even for previous
! 	 * checkpoint or the standbys in XLOG streaming).
  	 */
! 	if (_logId || _logSeg)
  	{
! 		/*
! 		 * Calculate the last segment that we need to retain because of
! 		 * standby_keep_segments, by subtracting StandbySegments from the
! 		 * new checkpoint location.
! 		 */
! 		if (StandbySegments > 0)
  		{
! 			uint32		log;
! 			uint32		seg;
! 			int			d_log;
! 			int			d_seg;
! 
! 			XLByteToSeg(recptr, log, seg);
! 
! 			d_seg = StandbySegments % XLogSegsPerFile;
! 			d_log = StandbySegments / XLogSegsPerFile;
! 			if (seg < d_seg)
! 			{
! 				d_log += 1;
! 				seg = seg - d_seg + XLogSegsPerFile;
! 			}
! 			else
! 				seg = seg - d_seg;
! 			/* avoid underflow, don't go below (0,1) */
! 			if (log < d_log || (log == d_log && seg == 0))
! 			{
! 				log = 0;
! 				seg = 1;
! 			}
! 			else
! 				log = log - d_log;
! 
! 			/* don't delete WAL segments newer than the calculated segment */
  			if (log < _logId || (log == _logId && seg < _logSeg))
  			{
  				_logId = log;
  				_logSeg = seg;
  			}
  		}
  
  		PrevLogSeg(_logId, _logSeg);
  		RemoveOldXlogFiles(_logId, _logSeg, recptr);
  	}
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 508,513 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
--- 508,517 ----
  {
  	char		path[MAXPGPATH];
  	uint32		startoff;
+ 	uint32		lastRemovedLog;
+ 	uint32		lastRemovedSeg;
+ 	uint32		log;
+ 	uint32		seg;
  
  	while (nbytes > 0)
  	{
***************
*** 527,536 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
  
  			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
  			if (sendFile < 0)
! 				ereport(FATAL,	/* XXX: Why FATAL? */
! 						(errcode_for_file_access(),
! 						 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
! 								path, sendId, sendSeg)));
  			sendOff = 0;
  		}
  
--- 531,557 ----
  
  			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
  			if (sendFile < 0)
! 			{
! 				/*
! 				 * If the file is not found, assume it's because the
! 				 * standby asked for a too old WAL segment that has already
! 				 * been removed or recycled.
! 				 */
! 				if (errno == ENOENT)
! 				{
! 					char filename[MAXFNAMELEN];
! 					XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
! 					ereport(ERROR,
! 							(errcode_for_file_access(),
! 							 errmsg("requested WAL segment %s has already been removed",
! 									filename)));
! 				}
! 				else
! 					ereport(ERROR,
! 							(errcode_for_file_access(),
! 							 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
! 									path, sendId, sendSeg)));
! 			}
  			sendOff = 0;
  		}
  
***************
*** 538,544 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
  		if (sendOff != startoff)
  		{
  			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
! 				ereport(FATAL,
  						(errcode_for_file_access(),
  						 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
  								sendId, sendSeg, startoff)));
--- 559,565 ----
  		if (sendOff != startoff)
  		{
  			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
! 				ereport(ERROR,
  						(errcode_for_file_access(),
  						 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
  								sendId, sendSeg, startoff)));
***************
*** 553,559 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
  
  		readbytes = read(sendFile, buf, segbytes);
  		if (readbytes <= 0)
! 			ereport(FATAL,
  					(errcode_for_file_access(),
  			errmsg("could not read from log file %u, segment %u, offset %u, "
  				   "length %lu: %m",
--- 574,580 ----
  
  		readbytes = read(sendFile, buf, segbytes);
  		if (readbytes <= 0)
! 			ereport(ERROR,
  					(errcode_for_file_access(),
  			errmsg("could not read from log file %u, segment %u, offset %u, "
  				   "length %lu: %m",
***************
*** 566,571 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
--- 587,612 ----
  		nbytes -= readbytes;
  		buf += readbytes;
  	}
+ 
+ 	/*
+ 	 * After reading into the buffer, check that what we read was valid.
+ 	 * We do this after reading, because even though the segment was present
+ 	 * when we opened it, it might get recycled or removed while we read it.
+ 	 * The read() succeeds in that case, but the data we tried to read might
+ 	 * already have been overwritten with new WAL records.
+ 	 */
+ 	XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
+ 	XLByteToPrevSeg(recptr, log, seg);
+ 	if (log < lastRemovedLog ||
+ 		(log == lastRemovedLog && seg <= lastRemovedSeg))
+ 	{
+ 		char filename[MAXFNAMELEN];
+ 		XLogFileName(filename, ThisTimeLineID, log, seg);
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("requested WAL segment %s has already been removed",
+ 						filename)));
+ 	}
  }
  
  /*
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1648,1653 **** static struct config_int ConfigureNamesInt[] =
--- 1648,1662 ----
  	},
  
  	{
+ 		{"standby_keep_segments", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Sets the number of WAL files held for standby servers"),
+ 			NULL
+ 		},
+ 		&StandbySegments,
+ 		0, 0, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"checkpoint_segments", PGC_SIGHUP, WAL_CHECKPOINTS,
  			gettext_noop("Sets the maximum distance in log segments between automatic WAL checkpoints."),
  			NULL
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 193,198 ****
--- 193,199 ----
  
  #max_wal_senders = 0		# max number of walsender processes
  #wal_sender_delay = 200ms	# 1-10000 milliseconds
+ #standby_keep_segments = 0	# in logfile segments, 16MB each; 0 disables
  
  
  #------------------------------------------------------------------------------
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
***************
*** 187,192 **** extern XLogRecPtr XactLastRecEnd;
--- 187,193 ----
  
  /* these variables are GUC parameters related to XLOG */
  extern int	CheckPointSegments;
+ extern int	StandbySegments;
  extern int	XLOGbuffers;
  extern bool XLogArchiveMode;
  extern char *XLogArchiveCommand;
***************
*** 267,272 **** extern int XLogFileInit(uint32 log, uint32 seg,
--- 268,274 ----
  extern int	XLogFileOpen(uint32 log, uint32 seg);
  
  
+ extern void XLogGetLastRemoved(uint32 *log, uint32 *seg);
  extern void XLogSetAsyncCommitLSN(XLogRecPtr record);
  
  extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
#5Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#4)
Re: Streaming replication and a disk full in primary

On Wed, Apr 7, 2010 at 6:02 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

This task has been languishing for a long time, so I took a shot at it.
I took the approach I suggested before, keeping a variable in shared
memory to track the latest removed WAL segment. After walsender has read
a bunch of WAL records from a WAL file, it checks that what it read is
after the latest removed WAL segment; otherwise the data it read might
have come from a file that was already recycled and overwritten with new
data, and an error is thrown.

This changes the behavior so that if a standby server doing streaming
replication falls behind too much, the primary will remove/recycle a WAL
segment needed by the standby server. The previous behavior was that WAL
segments still needed by any connected standby server were never
removed, at the risk of filling the disk in the primary if a standby
server behaves badly.

In your version of this patch, the default was still the current
behavior where the primary retains WAL files that are still needed by
connected standby servers indefinitely. I think that's a dangerous
default, so I changed it so that if you don't set standby_keep_segments,
the primary doesn't retain any extra segments; the number of WAL
segments available for standby servers is determined only by the
location of the previous checkpoint, and the status of WAL archiving.
That makes the code a bit simpler too, as we never care how far the
walsenders are. In fact, the GetOldestWALSenderPointer() function is now
dead code.

This seems like a very useful feature, but I can't speak to the code
quality without a good deal more study.

...Robert

#6Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#4)
Re: Streaming replication and a disk full in primary

Thanks for the great patch! I apologize for leaving the issue
half-finished for a long time :(

On Wed, Apr 7, 2010 at 7:02 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

In your version of this patch, the default was still the current
behavior where the primary retains WAL files that are still needed by
connected standby servers indefinitely. I think that's a dangerous
default, so I changed it so that if you don't set standby_keep_segments,
the primary doesn't retain any extra segments; the number of WAL
segments available for standby servers is determined only by the
location of the previous checkpoint, and the status of WAL archiving.
That makes the code a bit simpler too, as we never care how far the
walsenders are. In fact, the GetOldestWALSenderPointer() function is now
dead code.

It's OK for me to change the default behavior. We can remove
the GetOldestWALSenderPointer() function.

doc/src/sgml/config.sgml
-        archival or to recover from a checkpoint. If standby_keep_segments
+        archival or to recover from a checkpoint. If
<varname>standby_keep_segments</>

The word "standby_keep_segments" always needs the <varname> tag, I think.

We should remove the document "25.2.5.2. Monitoring"?

Why is standby_keep_segments used even if max_wal_senders is zero?
In that case, ISTM we don't need to keep any WAL files in pg_xlog
for the standby.

When XLogRead() reads two WAL files and only the older of them is recycled
while being read, the check might fail to notice that the read data is invalid.
This is because the variable "recptr" can advance to the newer WAL file
before the check.
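
Put differently, the validity check has to be made against the segment the
read started in, not against "recptr" after it has advanced into the next
file. Something like this (CheckWALReadValid() is an illustrative helper,
not code from the patch; 'startptr' is the position captured before the
read loop):

    static void
    CheckWALReadValid(XLogRecPtr startptr)
    {
        uint32      lastRemovedLog;
        uint32      lastRemovedSeg;
        uint32      log;
        uint32      seg;

        XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
        XLByteToSeg(startptr, log, seg);

        /*
         * The segment the read began in is the oldest one touched, so it
         * is the one that could have been recycled while we were reading.
         */
        if (log < lastRemovedLog ||
            (log == lastRemovedLog && seg <= lastRemovedSeg))
        {
            char        filename[MAXFNAMELEN];

            XLogFileName(filename, ThisTimeLineID, log, seg);
            ereport(ERROR,
                    (errmsg("requested WAL segment %s has already been removed",
                            filename)));
        }
    }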

When walreceiver has gotten stuck for some reason, walsender would be
unable to pass through the send() system call, and also get stuck.
In the patch, such a walsender can never exit, because it cannot
call XLogRead(). So I think that the bgwriter needs to send the
exit-signal to such a too lagged walsender. Thought?

The shared-memory record of the latest recycled WAL file is updated
before checking whether the file has already been archived. If archiving
is not working for some reason, the WAL file that the record points to
might not actually have been recycled yet. In this case, the standby
cannot obtain the WAL file from
the primary because it's been marked as "latest recycled", and from the
archive because it's not been archived yet. This seems to be a big problem.
How about moving the update of the shmem to after calling XLogArchiveCheckDone()
in RemoveOldXlogFiles()?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#7Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#6)
Re: Streaming replication and a disk full in primary

Fujii Masao wrote:

doc/src/sgml/config.sgml
-        archival or to recover from a checkpoint. If standby_keep_segments
+        archival or to recover from a checkpoint. If
<varname>standby_keep_segments</>

The word "standby_keep_segments" always needs the <varname> tag, I think.

Thanks, fixed.

We should remove the document "25.2.5.2. Monitoring"?

I updated it to no longer claim that the primary can run out of disk
space because of a hung WAL sender. The information about calculating
the lag between primary and standby still seems valuable, so I didn't
remove the whole section.

Why is standby_keep_segments used even if max_wal_senders is zero?
In that case, ISTM we don't need to keep any WAL files in pg_xlog
for the standby.

True. I don't think we should second guess the admin on that, though.
Perhaps he only set max_wal_senders=0 temporarily, and will be
disappointed if the logs are no longer there when he sets it back to
non-zero and restarts the server.

When XLogRead() reads two WAL files and only the older of them is recycled
while being read, the check might fail to notice that the read data is invalid.
This is because the variable "recptr" can advance to the newer WAL file
before the check.

Thanks, fixed.

When walreceiver has gotten stuck for some reason, walsender would be
unable to pass through the send() system call, and also get stuck.
In the patch, such a walsender can never exit, because it cannot
call XLogRead(). So I think that the bgwriter needs to send the
exit-signal to such a too lagged walsender. Thought?

Any backend can get stuck like that.

The shared-memory record of the latest recycled WAL file is updated
before checking whether the file has already been archived. If archiving
is not working for some reason, the WAL file that the record points to
might not actually have been recycled yet. In this case, the standby
cannot obtain the WAL file from
the primary because it's been marked as "latest recycled", and from the
archive because it's not been archived yet. This seems to be a big problem.
How about moving the update of the shmem to after calling XLogArchiveCheckDone()
in RemoveOldXlogFiles()?

Good point. It's particularly important considering that if a segment
hasn't been archived yet, it's not available to the standby from the
archive either. I changed that.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#8Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#7)
Re: Streaming replication and a disk full in primary

On Mon, Apr 12, 2010 at 7:41 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

We should remove the document "25.2.5.2. Monitoring"?

I updated it to no longer claim that the primary can run out of disk
space because of a hung WAL sender. The information about calculating
the lag between primary and standby still seems valuable, so I didn't
remove the whole section.

Yes.

! An important health indicator of streaming replication is the amount
! of WAL records generated in the primary, but not yet applied in the
! standby.

Since pg_last_xlog_receive_location doesn't tell us how far WAL has
actually been applied, we should use pg_last_xlog_replay_location
instead. How about:

----------------
      An important health indicator of streaming replication is the amount
      of WAL records generated in the primary, but not yet applied in the
      standby. You can calculate this lag by comparing the current WAL write
-     location on the primary with the last WAL location received by the
+     location on the primary with the last WAL location replayed by the
      standby. They can be retrieved using
      <function>pg_current_xlog_location</> on the primary and the
-     <function>pg_last_xlog_receive_location</> on the standby,
+     <function>pg_last_xlog_replay_location</> on the standby,
      respectively (see <xref linkend="functions-admin-backup-table"> and
      <xref linkend="functions-recovery-info-table"> for details).
-     The last WAL receive location in the standby is also displayed in the
-     process status of the WAL receiver process, displayed using the
-     <command>ps</> command (see <xref linkend="monitoring-ps"> for details).
     </para>
    </sect3>
----------------
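
For reference, that comparison can be scripted; a minimal libpq sketch
(host names and connection options are placeholders, and the two
reported locations still have to be compared by hand):

    /*
     * lag.c -- print the primary's write location and the standby's
     * replay location so they can be compared.
     * Build with something like:
     *   cc lag.c -o lag -I$(pg_config --includedir) -L$(pg_config --libdir) -lpq
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <libpq-fe.h>

    /* Run a single-value query and print the result, or exit on error. */
    static void
    print_one(PGconn *conn, const char *query, const char *label)
    {
        PGresult   *res = PQexec(conn, query);

        if (PQresultStatus(res) != PGRES_TUPLES_OK)
        {
            fprintf(stderr, "%s failed: %s", query, PQerrorMessage(conn));
            PQclear(res);
            exit(1);
        }
        printf("%s: %s\n", label, PQgetvalue(res, 0, 0));
        PQclear(res);
    }

    int
    main(void)
    {
        PGconn     *primary = PQconnectdb("host=primary dbname=postgres");
        PGconn     *standby = PQconnectdb("host=standby dbname=postgres");

        if (PQstatus(primary) != CONNECTION_OK ||
            PQstatus(standby) != CONNECTION_OK)
        {
            fprintf(stderr, "connection failed\n");
            exit(1);
        }

        print_one(primary, "SELECT pg_current_xlog_location()",
                  "primary write location");
        print_one(standby, "SELECT pg_last_xlog_replay_location()",
                  "standby replay location");

        PQfinish(primary);
        PQfinish(standby);
        return 0;
    }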

Why is standby_keep_segments used even if max_wal_senders is zero?
In that case, ISTM we don't need to keep any WAL files in pg_xlog
for the standby.

True. I don't think we should second guess the admin on that, though.
Perhaps he only set max_wal_senders=0 temporarily, and will be
disappointed if the logs are no longer there when he sets it back to
non-zero and restarts the server.

OK. Since the behavior is not intuitive to me, I'd like to add a note
at the end of the description of "standby_keep_segments". How about:

----------------
This setting has effect even if max_wal_senders is zero.
----------------

When walreceiver has gotten stuck for some reason, walsender would be
unable to pass through the send() system call, and also get stuck.
In the patch, such a walsender can never exit, because it cannot
call XLogRead(). So I think that the bgwriter needs to send the
exit-signal to such a too lagged walsender. Thought?

Any backend can get stuck like that.

OK.

+ 	},
+
+ 	{
+ 		{"standby_keep_segments", PGC_SIGHUP, WAL_CHECKPOINTS,
+ 			gettext_noop("Sets the number of WAL files held for standby servers"),
+ 			NULL
+ 		},
+ 		&StandbySegments,
+ 		0, 0, INT_MAX, NULL, NULL

Should we s/WAL_CHECKPOINTS/WAL_REPLICATION/ ?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#9Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#7)
Re: Streaming replication and a disk full in primary

On Mon, Apr 12, 2010 at 6:41 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

Why is standby_keep_segments used even if max_wal_senders is zero?
In that case, ISTM we don't need to keep any WAL files in pg_xlog
for the standby.

True. I don't think we should second guess the admin on that, though.
Perhaps he only set max_wal_senders=0 temporarily, and will be
disappointed if the logs are no longer there when he sets it back to
non-zero and restarts the server.

If archive_mode is off and max_wal_senders = 0, then the WAL that's
being generated won't be usable for streaming anyway, right?

I think this is another manifestation of the problem I was complaining
about over the weekend: there's no longer a single GUC that controls
what type of information we emit as WAL. In previous releases,
archive_mode served that function, but now it's much more complicated
and, IMHO, not very comprehensible.

http://archives.postgresql.org/pgsql-hackers/2010-04/msg00509.php

...Robert

#10Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Robert Haas (#9)
Re: Streaming replication and a disk full in primary

Robert Haas wrote:

On Mon, Apr 12, 2010 at 6:41 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

Why is standby_keep_segments used even if max_wal_senders is zero?
In that case, ISTM we don't need to keep any WAL files in pg_xlog
for the standby.

True. I don't think we should second guess the admin on that, though.
Perhaps he only set max_wal_senders=0 temporarily, and will be
disappointed if the logs are no longer there when he sets it back to
non-zero and restarts the server.

If archive_mode is off and max_wal_senders = 0, then the WAL that's
being generated won't be usable for streaming anyway, right?

I think this is another manifestation of the problem I was complaining
about over the weekend: there's no longer a single GUC that controls
what type of information we emit as WAL. In previous releases,
archive_mode served that function, but now it's much more complicated
and, IMHO, not very comprehensible.

http://archives.postgresql.org/pgsql-hackers/2010-04/msg00509.php

Agreed. We've been trying to deduce from other settings what information
needs to be WAL-logged, but it hasn't been a great success so it would
be better to make it explicit than try to hide it.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#11Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#10)
Re: Streaming replication and a disk full in primary

On Tue, Apr 13, 2010 at 11:56 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

Robert Haas wrote:

On Mon, Apr 12, 2010 at 6:41 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

Why is standby_keep_segments used even if max_wal_senders is zero?
In that case, ISTM we don't need to keep any WAL files in pg_xlog
for the standby.

True. I don't think we should second guess the admin on that, though.
Perhaps he only set max_wal_senders=0 temporarily, and will be
disappointed if the logs are no longer there when he sets it back to
non-zero and restarts the server.

If archive_mode is off and max_wal_senders = 0, then the WAL that's
being generated won't be usable for streaming anyway, right?

I think this is another manifestation of the problem I was complaining
about over the weekend: there's no longer a single GUC that controls
what type of information we emit as WAL.  In previous releases,
archive_mode served that function, but now it's much more complicated
and, IMHO, not very comprehensible.

http://archives.postgresql.org/pgsql-hackers/2010-04/msg00509.php

Agreed. We've been trying to deduce from other settings what information
needs to be WAL-logged, but it hasn't been a great success so it would
be better to make it explicit than try to hide it.

I've realized another problem with this patch. standby_keep_segments
only controls the number of segments that we keep around for purposes
of streaming: it doesn't affect archiving at all. And of course, a
standby server based on archiving is every bit as much of a standby
server as one that uses streaming replication. So at a minimum, the
name of this GUC is very confusing. We should also probably think a
little bit about why we feel like it's OK to throw away data that is
needed for SR to work, but we don't feel like we ever want to throw
away WAL segments that we can't manage to archive.

In the department of minor nits, I also don't like the fact that the
GUC is called standby_keep_segments and the variable is called
StandbySegments. If we really have to capitalize them differently, we
should at least make it StandbyKeepSegments, but personally I think we
should use standby_keep_segments in both places so that it doesn't
take quite so many greps to find all the references.

...Robert

#12Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Robert Haas (#11)
Re: Streaming replication and a disk full in primary

Robert Haas wrote:

I've realized another problem with this patch. standby_keep_segments
only controls the number of segments that we keep around for purposes
of streaming: it doesn't affect archiving at all. And of course, a
standby server based on archiving is every bit as much of a standby
server as one that uses streaming replication. So at a minimum, the
name of this GUC is very confusing.

Hmm, I guess streaming_keep_segments would be more accurate. Somehow
doesn't feel as good otherwise, though. Any other suggestions?

We should also probably think a
little bit about why we feel like it's OK to throw away data that is
needed for SR to work, but we don't feel like we ever want to throw
away WAL segments that we can't manage to archive.

Failure to archive is considered more serious, because your continuous
archiving backup becomes invalid if we delete a segment before it's
archived. And a streaming standby server can catch up using the archive
if it falls behind too much. Plus the primary doesn't know how many
standby servers there are, so it doesn't know which segments are still
needed for SR.

In the department of minor nits, I also don't like the fact that the
GUC is called standby_keep_segments and the variable is called
StandbySegments. If we really have to capitalize them differently, we
should at least make it StandbyKeepSegments, but personally I think we
should use standby_keep_segments in both places so that it doesn't
take quite so many greps to find all the references.

Well, it's consistent with checkpoint_segments/CheckPointSegments. There
is no consistent style on naming the global variables behind GUCs. If
you feel like changing it though, I won't object.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#13Alvaro Herrera
alvherre@commandprompt.com
In reply to: Robert Haas (#11)
Re: Streaming replication and a disk full in primary

Robert Haas wrote:

In the department of minor nits, I also don't like the fact that the
GUC is called standby_keep_segments and the variable is called
StandbySegments. If we really have to capitalize them differently, we
should at least make it StandbyKeepSegments, but personally I think we
should use standby_keep_segments in both places so that it doesn't
take quite so many greps to find all the references.

+1, using both names capitalized identically makes the code easier to navigate.

--
Alvaro Herrera http://www.CommandPrompt.com/
The PostgreSQL Company - Command Prompt, Inc.

#14Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#12)
Re: Streaming replication and a disk full in primary

On Thu, Apr 15, 2010 at 2:54 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

Robert Haas wrote:

I've realized another problem with this patch.  standby_keep_segments
only controls the number of segments that we keep around for purposes
of streaming: it doesn't affect archiving at all.  And of course, a
standby server based on archiving is every bit as much of a standby
server as one that uses streaming replication.  So at a minimum, the
name of this GUC is very confusing.

Hmm, I guess streaming_keep_segments would be more accurate. Somehow
doesn't feel as good otherwise, though. Any other suggestions?

I sort of feel like the correct description is something like
num_extra_retained_wal_segments, but that's sort of long. The actual
behavior is not tied to streaming, although the use case is.

...Robert

#15Robert Haas
robertmhaas@gmail.com
In reply to: Robert Haas (#14)
Re: Streaming replication and a disk full in primary

On Thu, Apr 15, 2010 at 6:13 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Thu, Apr 15, 2010 at 2:54 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

Robert Haas wrote:

I've realized another problem with this patch.  standby_keep_segments
only controls the number of segments that we keep around for purposes
of streaming: it doesn't affect archiving at all.  And of course, a
standby server based on archiving is every bit as much of a standby
server as one that uses streaming replication.  So at a minimum, the
name of this GUC is very confusing.

Hmm, I guess streaming_keep_segments would be more accurate. Somehow
doesn't feel as good otherwise, though. Any other suggestions?

I sort of feel like the correct description is something like
num_extra_retained_wal_segments, but that's sort of long.  The actual
behavior is not tied to streaming, although the use case is.

<thinks more>

How about wal_keep_segments?

...Robert

#16Robert Haas
robertmhaas@gmail.com
In reply to: Robert Haas (#15)
1 attachment(s)
Re: Streaming replication and a disk full in primary

On Fri, Apr 16, 2010 at 9:47 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Thu, Apr 15, 2010 at 6:13 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Thu, Apr 15, 2010 at 2:54 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

Robert Haas wrote:

I've realized another problem with this patch.  standby_keep_segments
only controls the number of segments that we keep around for purposes
of streaming: it doesn't affect archiving at all.  And of course, a
standby server based on archiving is every bit as much of a standby
server as one that uses streaming replication.  So at a minimum, the
name of this GUC is very confusing.

Hmm, I guess streaming_keep_segments would be more accurate. Somehow
doesn't feel as good otherwise, though. Any other suggestions?

I sort of feel like the correct description is something like
num_extra_retained_wal_segments, but that's sort of long.  The actual
behavior is not tied to streaming, although the use case is.

<thinks more>

How about wal_keep_segments?

Here's the patch.

...Robert

Attachments:

wal_keep_segments.patch (application/octet-stream)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0c3e076..1bea0d9 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1842,10 +1842,10 @@ SET ENABLE_SEQSCAN TO OFF;
        </listitem>
       </varlistentry>
 
-      <varlistentry id="guc-standby-keep-segments" xreflabel="standby_keep_segments">
-       <term><varname>standby_keep_segments</varname> (<type>integer</type>)</term>
+      <varlistentry id="guc-wal-keep-segments" xreflabel="wal_keep_segments">
+       <term><varname>wal_keep_segments</varname> (<type>integer</type>)</term>
        <indexterm>
-        <primary><varname>standby_keep_segments</> configuration parameter</primary>
+        <primary><varname>wal_keep_segments</> configuration parameter</primary>
        </indexterm>
        <listitem>
        <para>
@@ -1853,17 +1853,17 @@ SET ENABLE_SEQSCAN TO OFF;
         directory, in case a standby server needs to fetch them via streaming
         replication. Each segment is normally 16 megabytes. If a standby
         server connected to the primary falls behind more than
-        <varname>standby_keep_segments</> segments, the primary might remove
+        <varname>wal_keep_segments</> segments, the primary might remove
         a WAL segment still needed by the standby and the replication
         connection will be terminated.
 
         This sets only the minimum number of segments retained for standby
-        purposes, the system might need to retain more segments for WAL
-        archival or to recover from a checkpoint. If <varname>standby_keep_segments</>
+        purposes; the system might need to retain more segments for WAL
+        archival or to recover from a checkpoint. If <varname>wal_keep_segments</>
         is zero (the default), the system doesn't keep any extra segments
         for standby purposes, and the number of old WAL segments available
         for standbys is determined based only on the location of the previous
-        checkpoint and status of WAL archival.
+        checkpoint and status of WAL archiving.
         This parameter can only be set in the <filename>postgresql.conf</>
         file or on the server command line.
        </para>
diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml
index 9c45d7b..ce62a6b 100644
--- a/doc/src/sgml/high-availability.sgml
+++ b/doc/src/sgml/high-availability.sgml
@@ -737,7 +737,7 @@ trigger_file = '/path/to/trigger_file'
     falls behind too much, the primary will delete old WAL files still
     needed by the standby, and the standby will have to be manually restored
     from a base backup. You can control how long the primary retains old WAL
-    segments using the <varname>standby_keep_segments</> setting.
+    segments using the <varname>wal_keep_segments</> setting.
    </para>
 
    <para>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 9b81cfa..7647f4e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -66,7 +66,7 @@
 
 /* User-settable parameters */
 int			CheckPointSegments = 3;
-int			StandbySegments = 0;
+int			wal_keep_segments = 0;
 int			XLOGbuffers = 8;
 int			XLogArchiveTimeout = 0;
 bool		XLogArchiveMode = false;
@@ -7285,10 +7285,10 @@ CreateCheckPoint(int flags)
 	{
 		/*
 		 * Calculate the last segment that we need to retain because of
-		 * standby_keep_segments, by subtracting StandbySegments from the
+		 * wal_keep_segments, by subtracting wal_keep_segments from the
 		 * new checkpoint location.
 		 */
-		if (StandbySegments > 0)
+		if (wal_keep_segments > 0)
 		{
 			uint32		log;
 			uint32		seg;
@@ -7297,8 +7297,8 @@ CreateCheckPoint(int flags)
 
 			XLByteToSeg(recptr, log, seg);
 
-			d_seg = StandbySegments % XLogSegsPerFile;
-			d_log = StandbySegments / XLogSegsPerFile;
+			d_seg = wal_keep_segments % XLogSegsPerFile;
+			d_log = wal_keep_segments / XLogSegsPerFile;
 			if (seg < d_seg)
 			{
 				d_log += 1;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c9d934f..c8e768e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1656,11 +1656,11 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
-		{"standby_keep_segments", PGC_SIGHUP, WAL_CHECKPOINTS,
+		{"wal_keep_segments", PGC_SIGHUP, WAL_CHECKPOINTS,
 			gettext_noop("Sets the number of WAL files held for standby servers"),
 			NULL
 		},
-		&StandbySegments,
+		&wal_keep_segments,
 		0, 0, INT_MAX, NULL, NULL
 	},
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index c3f985a..92763eb 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -193,7 +193,7 @@
 
 #max_wal_senders = 0		# max number of walsender processes
 #wal_sender_delay = 200ms	# 1-10000 milliseconds
-#standby_keep_segments = 0	# in logfile segments, 16MB each; 0 disables
+#wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
 
 
 #------------------------------------------------------------------------------
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 0295a61..6bfc7d5 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -187,7 +187,7 @@ extern XLogRecPtr XactLastRecEnd;
 
 /* these variables are GUC parameters related to XLOG */
 extern int	CheckPointSegments;
-extern int	StandbySegments;
+extern int	wal_keep_segments;
 extern int	XLOGbuffers;
 extern bool XLogArchiveMode;
 extern char *XLogArchiveCommand;
#17Fujii Masao
masao.fujii@gmail.com
In reply to: Robert Haas (#16)
Re: Streaming replication and a disk full in primary

On Tue, Apr 20, 2010 at 9:55 AM, Robert Haas <robertmhaas@gmail.com> wrote:

How about wal_keep_segments?

+1

Here's the patch.

Seems OK.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#18Robert Haas
robertmhaas@gmail.com
In reply to: Fujii Masao (#17)
Re: Streaming replication and a disk full in primary

On Tue, Apr 20, 2010 at 5:53 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Tue, Apr 20, 2010 at 9:55 AM, Robert Haas <robertmhaas@gmail.com> wrote:

How about wal_keep_segments?

+1

Here's the patch.

Seems OK.

Thanks, committed.

...Robert