Replication server timeout patch

Started by Daniel Farinaalmost 15 years ago47 messages
#1Daniel Farina
daniel@heroku.com
1 attachment(s)

Hello list,

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet or
anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

--
fdr

Attachments:

0001-Split-and-rename-out-server-timeouts.patchtext/x-patch; charset=US-ASCII; name=0001-Split-and-rename-out-server-timeouts.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2121,2126 **** SET ENABLE_SEQSCAN TO OFF;
--- 2121,2140 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-replication-timeout-server" xreflabel="replication_timeout_server">
+       <term><varname>replication_timeout_server</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout_server</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         If the primary server does not receive a reply from a standby server
+         within <varname>replication_timeout_server</> seconds then the
+         primary will terminate the replication connection.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
       </variablelist>
      </sect2>
     </sect1>
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 73,78 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 73,80 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 200;	/* max sleep time between some actions */
+ int			replication_timeout_server; /* If the receiver takes too long, time
+ 										 * out and die after this duration */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 89,94 **** static uint32 sendOff = 0;
--- 91,99 ----
   */
  static XLogRecPtr sentPtr = {0, 0};
  
+ /* Remembers the last time the standby has notified the primary of progress */
+ static TimestampTz last_reply_timestamp;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 250,255 **** WalSndHandshake(void)
--- 255,265 ----
  						 errmsg("invalid standby handshake message type %d", firstchar)));
  		}
  	}
+ 
+ 	/*
+      * Initialize our timeout checking mechanism.
+      */
+ 	last_reply_timestamp = GetCurrentTimestamp();
  }
  
  /*
***************
*** 616,632 **** WalSndLoop(void)
  
  			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
! 				/*
! 				 * XXX: We don't really need the periodic wakeups anymore,
! 				 * WaitLatchOrSocket should reliably wake up as soon as
! 				 * something interesting happens.
! 				 */
  
  				/* Sleep */
! 				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
--- 626,650 ----
  
  			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop &&
! 				!walsender_shutdown_requested)
  			{
! 				long timeout;
! 
! 				if (replication_timeout_server == -1)
! 					timeout = -1L;
! 				else
! 					timeout = 1000000L * replication_timeout_server;
  
  				/* Sleep */
! 				if (WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 									  timeout) == 0)
! 				{
! 					ereport(LOG,
! 							(errmsg("streaming replication timeout after %d s",
! 									replication_timeout_server)));
! 					break;
! 				}
  			}
  		}
  		else
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1484,1489 **** static struct config_int ConfigureNamesInt[] =
--- 1484,1499 ----
  	},
  
  	{
+ 		{"replication_timeout_server", PGC_SIGHUP, WAL_SETTINGS,
+ 		 gettext_noop("Replication connection will timeout after this duration."),
+ 		 NULL,
+ 		 GUC_UNIT_S
+ 		},
+ 		&replication_timeout_server,
+ 		30, -1, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
  			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
  			NULL,
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 203,208 ****
--- 203,209 ----
  					# when reading streaming WAL;
  					# -1 allows indefinite delay
  #wal_receiver_status_interval = 10s	# replies at least this often, 0 disables
+ #replication_timeout_server = 120	# -1 means wait forever
  
  
  #------------------------------------------------------------------------------
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 70,75 **** extern volatile sig_atomic_t walsender_ready_to_stop;
--- 70,76 ----
  /* user-settable parameters */
  extern int	WalSndDelay;
  extern int	max_wal_senders;
+ extern int	replication_timeout_server;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
#2Daniel Farina
drfarina@acm.org
In reply to: Daniel Farina (#1)
1 attachment(s)

Hello list,

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet or
anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

--
fdr

Attachments:

0001-Split-and-rename-out-server-timeouts.patchtext/x-patch; charset=US-ASCII; name=0001-Split-and-rename-out-server-timeouts.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2121,2126 **** SET ENABLE_SEQSCAN TO OFF;
--- 2121,2140 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-replication-timeout-server" xreflabel="replication_timeout_server">
+       <term><varname>replication_timeout_server</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout_server</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         If the primary server does not receive a reply from a standby server
+         within <varname>replication_timeout_server</> seconds then the
+         primary will terminate the replication connection.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
       </variablelist>
      </sect2>
     </sect1>
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 73,78 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 73,80 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 200;	/* max sleep time between some actions */
+ int			replication_timeout_server; /* If the receiver takes too long, time
+ 										 * out and die after this duration */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 89,94 **** static uint32 sendOff = 0;
--- 91,99 ----
   */
  static XLogRecPtr sentPtr = {0, 0};
  
+ /* Remembers the last time the standby has notified the primary of progress */
+ static TimestampTz last_reply_timestamp;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 250,255 **** WalSndHandshake(void)
--- 255,265 ----
  						 errmsg("invalid standby handshake message type %d", firstchar)));
  		}
  	}
+ 
+ 	/*
+      * Initialize our timeout checking mechanism.
+      */
+ 	last_reply_timestamp = GetCurrentTimestamp();
  }
  
  /*
***************
*** 616,632 **** WalSndLoop(void)
  
  			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
! 				/*
! 				 * XXX: We don't really need the periodic wakeups anymore,
! 				 * WaitLatchOrSocket should reliably wake up as soon as
! 				 * something interesting happens.
! 				 */
  
  				/* Sleep */
! 				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
--- 626,650 ----
  
  			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop &&
! 				!walsender_shutdown_requested)
  			{
! 				long timeout;
! 
! 				if (replication_timeout_server == -1)
! 					timeout = -1L;
! 				else
! 					timeout = 1000000L * replication_timeout_server;
  
  				/* Sleep */
! 				if (WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 									  timeout) == 0)
! 				{
! 					ereport(LOG,
! 							(errmsg("streaming replication timeout after %d s",
! 									replication_timeout_server)));
! 					break;
! 				}
  			}
  		}
  		else
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1484,1489 **** static struct config_int ConfigureNamesInt[] =
--- 1484,1499 ----
  	},
  
  	{
+ 		{"replication_timeout_server", PGC_SIGHUP, WAL_SETTINGS,
+ 		 gettext_noop("Replication connection will timeout after this duration."),
+ 		 NULL,
+ 		 GUC_UNIT_S
+ 		},
+ 		&replication_timeout_server,
+ 		30, -1, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
  			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
  			NULL,
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 203,208 ****
--- 203,209 ----
  					# when reading streaming WAL;
  					# -1 allows indefinite delay
  #wal_receiver_status_interval = 10s	# replies at least this often, 0 disables
+ #replication_timeout_server = 120	# -1 means wait forever
  
  
  #------------------------------------------------------------------------------
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 70,75 **** extern volatile sig_atomic_t walsender_ready_to_stop;
--- 70,76 ----
  /* user-settable parameters */
  extern int	WalSndDelay;
  extern int	max_wal_senders;
+ extern int	replication_timeout_server;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
#3Robert Haas
robertmhaas@gmail.com
In reply to: Daniel Farina (#2)
Re: Replication server timeout patch

On Fri, Feb 11, 2011 at 2:02 PM, Daniel Farina <drfarina@acm.org> wrote:

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet or
anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

This looks like a useful and separately committable change.

However, it looks to me like this renders wal_sender_delay aka
WalSndDelay completely unused. If we don't need that GUC any more, we
should rip it out completely.

The comment in WalSndHandshake should have a tab at the beginning of
every line. Right now the first line has a tab and the rest have
spaces.

The first hunk in WalSndLoop is a meaningless whitespace change.

I wonder if we ought to just call this replication_timeout, rather
than replication_timeout_server. Simon's patch (from which this
extracted) also has replication_timeout_client, but the two aren't
symmetrical. The replication_timeout_client in this patch is the
amount of time after which the master acknowledges the commit even
though the synchronous standby hasn't acked yet. So it only applies
to the synchronous replication case, whereas this is useful for both
synchronous and asynchronous replication. I'm inclined to think that
knob is utterly useless anyway; surely waiting more than zero will
reduce the throughput of the system to some minute fraction of its
normal value, while waiting less than infinity throws out the data
guarantee that made you pick synchronous replication in the first
place. Even if we do decide to keep that knob, I don't think we'll
want the value to be symmetric with this one.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#4Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Robert Haas (#3)
Re: Replication server timeout patch

On 11.02.2011 22:11, Robert Haas wrote:

On Fri, Feb 11, 2011 at 2:02 PM, Daniel Farina<drfarina@acm.org> wrote:

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet or
anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

This looks like a useful and separately committable change.

Hmm, so this patch implements a watchdog, where the master disconnects
the standby if the heartbeat from the standby stops for more than
'replication_[server]_timeout' seconds. The standby sends the heartbeat
every wal_receiver_status_interval seconds.

It would be nice if the master and standby could negotiate those
settings. As the patch stands, it's easy to have a pathological
configuration where replication_server_timeout <
wal_receiver_status_interval, so that the master repeatedly disconnects
the standby because it doesn't reply in time. Maybe the standby should
report how often it's going to send a heartbeat, and master should wait
for that long + some safety margin. Or maybe the master should tell the
standby how often it should send the heartbeat?

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#5Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#4)
Re: Replication server timeout patch

On Fri, Feb 11, 2011 at 4:30 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 11.02.2011 22:11, Robert Haas wrote:

On Fri, Feb 11, 2011 at 2:02 PM, Daniel Farina<drfarina@acm.org>  wrote:

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet or
anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

This looks like a useful and separately committable change.

Hmm, so this patch implements a watchdog, where the master disconnects the
standby if the heartbeat from the standby stops for more than
'replication_[server]_timeout' seconds. The standby sends the heartbeat
every wal_receiver_status_interval seconds.

It would be nice if the master and standby could negotiate those settings.
As the patch stands, it's easy to have a pathological configuration where
replication_server_timeout < wal_receiver_status_interval, so that the
master repeatedly disconnects the standby because it doesn't reply in time.
Maybe the standby should report how often it's going to send a heartbeat,
and master should wait for that long + some safety margin. Or maybe the
master should tell the standby how often it should send the heartbeat?

I guess the biggest use case for that behavior would be in a case
where you have two standbys, one of which doesn't send a heartbeat and
the other of which does. Then you really can't rely on a single
timeout.

Maybe we could change the server parameter to indicate what multiple
of wal_receiver_status_interval causes a hangup, and then change the
client to notify the server what value it's using. But that gets
complicated, because the value could be changed while the standby is
running.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#6Daniel Farina
daniel@heroku.com
In reply to: Robert Haas (#3)
1 attachment(s)
Re: Replication server timeout patch

On Fri, Feb 11, 2011 at 12:11 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Fri, Feb 11, 2011 at 2:02 PM, Daniel Farina <drfarina@acm.org> wrote:

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet or
anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

This looks like a useful and separately committable change.

However, it looks to me like this renders wal_sender_delay aka
WalSndDelay completely unused.  If we don't need that GUC any more, we
should rip it out completely.

Indeed; I have cleaned this up.

The comment in WalSndHandshake should have a tab at the beginning of
every line.  Right now the first line has a tab and the rest have
spaces.

Also correct. Done.

The first hunk in WalSndLoop is a meaningless whitespace change.

I was trying to get it under 80 columns wide, but yes, it is unnecessary.

I think this closes out the small fry.

I have rebased my splitorific branch to reflect these changes:

https://github.com/fdr/postgres/commits/splitorific

Context diff equivalent attached.

--
fdr

Attachments:

0001-Split-and-rename-out-server-timeout-of-clients-2.patchapplication/octet-stream; name=0001-Split-and-rename-out-server-timeout-of-clients-2.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2121,2126 **** SET ENABLE_SEQSCAN TO OFF;
--- 2121,2140 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-replication-timeout-server" xreflabel="replication_timeout_server">
+       <term><varname>replication_timeout_server</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout_server</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         If the primary server does not receive a reply from a standby server
+         within <varname>replication_timeout_server</> seconds then the
+         primary will terminate the replication connection.
+        </para>
+       </listitem>
+      </varlistentry>
+ 
       </variablelist>
      </sect2>
     </sect1>
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 73,78 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 73,80 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 200;	/* max sleep time between some actions */
+ int			replication_timeout_server; /* If the receiver takes too long, time
+ 										 * out and die after this duration */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 89,94 **** static uint32 sendOff = 0;
--- 91,99 ----
   */
  static XLogRecPtr sentPtr = {0, 0};
  
+ /* Remembers the last time the standby has notified the primary of progress */
+ static TimestampTz last_reply_timestamp;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 250,255 **** WalSndHandshake(void)
--- 255,265 ----
  						 errmsg("invalid standby handshake message type %d", firstchar)));
  		}
  	}
+ 
+ 	/*
+ 	 * Initialize our timeout checking mechanism.
+ 	 */
+ 	last_reply_timestamp = GetCurrentTimestamp();
  }
  
  /*
***************
*** 618,632 **** WalSndLoop(void)
  				break;
  			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
! 				/*
! 				 * XXX: We don't really need the periodic wakeups anymore,
! 				 * WaitLatchOrSocket should reliably wake up as soon as
! 				 * something interesting happens.
! 				 */
  
  				/* Sleep */
! 				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
--- 628,649 ----
  				break;
  			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
! 				long timeout;
! 
! 				if (replication_timeout_server == -1)
! 					timeout = -1L;
! 				else
! 					timeout = 1000000L * replication_timeout_server;
  
  				/* Sleep */
! 				if (WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 									  timeout) == 0)
! 				{
! 					ereport(LOG,
! 							(errmsg("streaming replication timeout after %d s",
! 									replication_timeout_server)));
! 					break;
! 				}
  			}
  		}
  		else
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1484,1489 **** static struct config_int ConfigureNamesInt[] =
--- 1484,1499 ----
  	},
  
  	{
+ 		{"replication_timeout_server", PGC_SIGHUP, WAL_SETTINGS,
+ 		 gettext_noop("Replication connection will timeout after this duration."),
+ 		 NULL,
+ 		 GUC_UNIT_S
+ 		},
+ 		&replication_timeout_server,
+ 		30, -1, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
  			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
  			NULL,
***************
*** 1828,1843 **** static struct config_int ConfigureNamesInt[] =
  	},
  
  	{
- 		{"wal_sender_delay", PGC_SIGHUP, WAL_REPLICATION,
- 			gettext_noop("WAL sender sleep time between WAL replications."),
- 			NULL,
- 			GUC_UNIT_MS
- 		},
- 		&WalSndDelay,
- 		200, 1, 10000, NULL, NULL
- 	},
- 
- 	{
  		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
  			gettext_noop("Sets the delay in microseconds between transaction commit and "
  						 "flushing WAL to disk."),
--- 1838,1843 ----
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 203,208 ****
--- 203,209 ----
  					# when reading streaming WAL;
  					# -1 allows indefinite delay
  #wal_receiver_status_interval = 10s	# replies at least this often, 0 disables
+ #replication_timeout_server = 120	# -1 means wait forever
  
  
  #------------------------------------------------------------------------------
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 68,75 **** extern volatile sig_atomic_t walsender_shutdown_requested;
  extern volatile sig_atomic_t walsender_ready_to_stop;
  
  /* user-settable parameters */
- extern int	WalSndDelay;
  extern int	max_wal_senders;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
--- 68,75 ----
  extern volatile sig_atomic_t walsender_ready_to_stop;
  
  /* user-settable parameters */
  extern int	max_wal_senders;
+ extern int	replication_timeout_server;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
#7Robert Haas
robertmhaas@gmail.com
In reply to: Robert Haas (#5)
Re: Replication server timeout patch

On Fri, Feb 11, 2011 at 4:38 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Fri, Feb 11, 2011 at 4:30 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 11.02.2011 22:11, Robert Haas wrote:

On Fri, Feb 11, 2011 at 2:02 PM, Daniel Farina<drfarina@acm.org>  wrote:

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet or
anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

This looks like a useful and separately committable change.

Hmm, so this patch implements a watchdog, where the master disconnects the
standby if the heartbeat from the standby stops for more than
'replication_[server]_timeout' seconds. The standby sends the heartbeat
every wal_receiver_status_interval seconds.

It would be nice if the master and standby could negotiate those settings.
As the patch stands, it's easy to have a pathological configuration where
replication_server_timeout < wal_receiver_status_interval, so that the
master repeatedly disconnects the standby because it doesn't reply in time.
Maybe the standby should report how often it's going to send a heartbeat,
and master should wait for that long + some safety margin. Or maybe the
master should tell the standby how often it should send the heartbeat?

I guess the biggest use case for that behavior would be in a case
where you have two standbys, one of which doesn't send a heartbeat and
the other of which does.  Then you really can't rely on a single
timeout.

Maybe we could change the server parameter to indicate what multiple
of wal_receiver_status_interval causes a hangup, and then change the
client to notify the server what value it's using.  But that gets
complicated, because the value could be changed while the standby is
running.

On reflection I'm deeply uncertain this is a good idea. It's pretty
hopeless to suppose that we can keep the user from choosing parameter
settings which will cause them problems, and there are certainly far
stupider things they could do then set replication_timeout <
wal_receiver_status_interval. They could, for example, set fsync=off
or work_mem=4GB or checkpoint_segments=3 (never mind that we ship that
last one out of the box). Any of those settings have the potential to
thoroughly destroy their system in one way or another, and there's not
a darn thing we can do about it. Setting up some kind of handshake
system based on a multiple of the wal_receiver_status_interval is
going to be complex, and it's not necessarily going to deliver the
behavior someone wants anyway. If someone has
wal_receiver_status_interval=10 on one system and =30 on another
system, does it therefore follow that the timeouts should also be
different by 3X? Perhaps, but it's non-obvious.

There are two things that I think are pretty clear. If the receiver
has wal_receiver_status_interval=0, then we should ignore
replication_timeout for that connection. And also we need to make
sure that the replication_timeout can't kill off a connection that is
in the middle of streaming a large base backup. Maybe we should try
to get those two cases right and not worry about the rest. Dan, can
you check whether the base backup thing is a problem with this as
implemented?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#8Daniel Farina
daniel@heroku.com
In reply to: Robert Haas (#7)
Re: Replication server timeout patch

On Feb 11, 2011 8:20 PM, "Robert Haas" <robertmhaas@gmail.com> wrote:

On Fri, Feb 11, 2011 at 4:38 PM, Robert Haas <robertmhaas@gmail.com>

wrote:

On Fri, Feb 11, 2011 at 4:30 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 11.02.2011 22:11, Robert Haas wrote:

On Fri, Feb 11, 2011 at 2:02 PM, Daniel Farina<drfarina@acm.org>

wrote:

I split this out of the synchronous replication patch for independent
review. I'm dashing out the door, so I haven't put it on the CF yet

or

anything, but I just wanted to get it out there...I'll be around in
Not Too Long to finish any other details.

This looks like a useful and separately committable change.

Hmm, so this patch implements a watchdog, where the master disconnects

the

standby if the heartbeat from the standby stops for more than
'replication_[server]_timeout' seconds. The standby sends the heartbeat
every wal_receiver_status_interval seconds.

It would be nice if the master and standby could negotiate those

settings.

As the patch stands, it's easy to have a pathological configuration

where

replication_server_timeout < wal_receiver_status_interval, so that the
master repeatedly disconnects the standby because it doesn't reply in

time.

Maybe the standby should report how often it's going to send a

heartbeat,

and master should wait for that long + some safety margin. Or maybe the
master should tell the standby how often it should send the heartbeat?

I guess the biggest use case for that behavior would be in a case
where you have two standbys, one of which doesn't send a heartbeat and
the other of which does. Then you really can't rely on a single
timeout.

Maybe we could change the server parameter to indicate what multiple
of wal_receiver_status_interval causes a hangup, and then change the
client to notify the server what value it's using. But that gets
complicated, because the value could be changed while the standby is
running.

On reflection I'm deeply uncertain this is a good idea. It's pretty
hopeless to suppose that we can keep the user from choosing parameter
settings which will cause them problems, and there are certainly far
stupider things they could do then set replication_timeout <
wal_receiver_status_interval. They could, for example, set fsync=off
or work_mem=4GB or checkpoint_segments=3 (never mind that we ship that
last one out of the box). Any of those settings have the potential to
thoroughly destroy their system in one way or another, and there's not
a darn thing we can do about it. Setting up some kind of handshake
system based on a multiple of the wal_receiver_status_interval is
going to be complex, and it's not necessarily going to deliver the
behavior someone wants anyway. If someone has
wal_receiver_status_interval=10 on one system and =30 on another
system, does it therefore follow that the timeouts should also be
different by 3X? Perhaps, but it's non-obvious.

There are two things that I think are pretty clear. If the receiver
has wal_receiver_status_interval=0, then we should ignore
replication_timeout for that connection. And also we need to make
sure that the replication_timeout can't kill off a connection that is
in the middle of streaming a large base backup. Maybe we should try
to get those two cases right and not worry about the rest. Dan, can
you check whether the base backup thing is a problem with this as
implemented?

Yes, I will have something to say come Saturday.

--
fdr

#9Fujii Masao
masao.fujii@gmail.com
In reply to: Daniel Farina (#6)
Re: Replication server timeout patch

On Sat, Feb 12, 2011 at 8:58 AM, Daniel Farina <daniel@heroku.com> wrote:

Context diff equivalent attached.

Thanks for the patch!

As I said before, the timeout which this patch provides doesn't work well
when the walsender gets blocked in sending WAL. At first, we would
need to implement a non-blocking write function as an infrastructure
of the replication timeout, I think.
http://archives.postgresql.org/message-id/AANLkTi%3DPu2ne%3DVO-%2BCLMXLQh9y85qumLCbBP15CjnyUS%40mail.gmail.com

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#10Daniel Farina
daniel@heroku.com
In reply to: Fujii Masao (#9)
Re: Replication server timeout patch

On Mon, Feb 14, 2011 at 12:48 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sat, Feb 12, 2011 at 8:58 AM, Daniel Farina <daniel@heroku.com> wrote:

Context diff equivalent attached.

Thanks for the patch!

As I said before, the timeout which this patch provides doesn't work well
when the walsender gets blocked in sending WAL. At first, we would
need to implement a non-blocking write function as an infrastructure
of the replication timeout, I think.
http://archives.postgresql.org/message-id/AANLkTi%3DPu2ne%3DVO-%2BCLMXLQh9y85qumLCbBP15CjnyUS%40mail.gmail.com

Interesting point...if that's accepted as required-for-commit, what
are the perceptions of the odds that, presuming I can write the code
quickly enough, that there's enough infrastructure/ports already in
postgres to allow for a non-blocking write on all our supported
platforms?

--
fdr

#11Simon Riggs
simon@2ndQuadrant.com
In reply to: Daniel Farina (#10)
Re: Replication server timeout patch

On Mon, 2011-02-14 at 14:13 -0800, Daniel Farina wrote:

On Mon, Feb 14, 2011 at 12:48 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sat, Feb 12, 2011 at 8:58 AM, Daniel Farina <daniel@heroku.com> wrote:

Context diff equivalent attached.

Thanks for the patch!

As I said before, the timeout which this patch provides doesn't work well
when the walsender gets blocked in sending WAL. At first, we would
need to implement a non-blocking write function as an infrastructure
of the replication timeout, I think.
http://archives.postgresql.org/message-id/AANLkTi%3DPu2ne%3DVO-%2BCLMXLQh9y85qumLCbBP15CjnyUS%40mail.gmail.com

I wasn't aware that had been raised before. Thanks for noting it again.

I guess that's why you thought "wait forever" was a good idea ;-)

Interesting point...if that's accepted as required-for-commit, what
are the perceptions of the odds that, presuming I can write the code
quickly enough, that there's enough infrastructure/ports already in
postgres to allow for a non-blocking write on all our supported
platforms?

I'd like to see what you come up with. I would rate that as important,
though not essential for sync replication.

--
Simon Riggs http://www.2ndQuadrant.com/books/
PostgreSQL Development, 24x7 Support, Training and Services

#12Robert Haas
robertmhaas@gmail.com
In reply to: Daniel Farina (#10)
Re: Replication server timeout patch

On Mon, Feb 14, 2011 at 5:13 PM, Daniel Farina <daniel@heroku.com> wrote:

On Mon, Feb 14, 2011 at 12:48 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sat, Feb 12, 2011 at 8:58 AM, Daniel Farina <daniel@heroku.com> wrote:

Context diff equivalent attached.

Thanks for the patch!

As I said before, the timeout which this patch provides doesn't work well
when the walsender gets blocked in sending WAL. At first, we would
need to implement a non-blocking write function as an infrastructure
of the replication timeout, I think.
http://archives.postgresql.org/message-id/AANLkTi%3DPu2ne%3DVO-%2BCLMXLQh9y85qumLCbBP15CjnyUS%40mail.gmail.com

Interesting point...if that's accepted as required-for-commit, what
are the perceptions of the odds that, presuming I can write the code
quickly enough, that there's enough infrastructure/ports already in
postgres to allow for a non-blocking write on all our supported
platforms?

Are you working on this?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#13Fujii Masao
masao.fujii@gmail.com
In reply to: Daniel Farina (#10)
Re: Replication server timeout patch

On Tue, Feb 15, 2011 at 7:13 AM, Daniel Farina <daniel@heroku.com> wrote:

On Mon, Feb 14, 2011 at 12:48 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sat, Feb 12, 2011 at 8:58 AM, Daniel Farina <daniel@heroku.com> wrote:

Context diff equivalent attached.

Thanks for the patch!

As I said before, the timeout which this patch provides doesn't work well
when the walsender gets blocked in sending WAL. At first, we would
need to implement a non-blocking write function as an infrastructure
of the replication timeout, I think.
http://archives.postgresql.org/message-id/AANLkTi%3DPu2ne%3DVO-%2BCLMXLQh9y85qumLCbBP15CjnyUS%40mail.gmail.com

Interesting point...if that's accepted as required-for-commit, what
are the perceptions of the odds that, presuming I can write the code
quickly enough, that there's enough infrastructure/ports already in
postgres to allow for a non-blocking write on all our supported
platforms?

I'm not sure if there's already enough infrastructure for a non-blocking
write. But the patch which I submitted before might help to implement that.
http://archives.postgresql.org/message-id/AANLkTinSvcdAYryNfZqd0wepyh1Pf7YX6Q0KxhZjas6a%40mail.gmail.com

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#14Simon Riggs
simon@2ndQuadrant.com
In reply to: Fujii Masao (#13)
Re: Replication server timeout patch

On Wed, 2011-02-16 at 11:34 +0900, Fujii Masao wrote:

On Tue, Feb 15, 2011 at 7:13 AM, Daniel Farina <daniel@heroku.com> wrote:

On Mon, Feb 14, 2011 at 12:48 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sat, Feb 12, 2011 at 8:58 AM, Daniel Farina <daniel@heroku.com> wrote:

Context diff equivalent attached.

Thanks for the patch!

As I said before, the timeout which this patch provides doesn't work well
when the walsender gets blocked in sending WAL. At first, we would
need to implement a non-blocking write function as an infrastructure
of the replication timeout, I think.
http://archives.postgresql.org/message-id/AANLkTi%3DPu2ne%3DVO-%2BCLMXLQh9y85qumLCbBP15CjnyUS%40mail.gmail.com

Interesting point...if that's accepted as required-for-commit, what
are the perceptions of the odds that, presuming I can write the code
quickly enough, that there's enough infrastructure/ports already in
postgres to allow for a non-blocking write on all our supported
platforms?

I'm not sure if there's already enough infrastructure for a non-blocking
write. But the patch which I submitted before might help to implement that.
http://archives.postgresql.org/message-id/AANLkTinSvcdAYryNfZqd0wepyh1Pf7YX6Q0KxhZjas6a%40mail.gmail.com

So, in summary, the position is that we have a timeout, but that timeout
doesn't work in all cases. But it does work in some, so that seems
enough for me to say "let's commit". Not committing gives us nothing at
all, which is as much use as a chocolate teapot.

I will be looking to commit this tomorrow morning, unless I hear some
clear No comments, with reasons.

--
Simon Riggs http://www.2ndQuadrant.com/books/
PostgreSQL Development, 24x7 Support, Training and Services

#15Robert Haas
robertmhaas@gmail.com
In reply to: Simon Riggs (#14)
Re: Replication server timeout patch

On Thu, Feb 17, 2011 at 4:21 PM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Wed, 2011-02-16 at 11:34 +0900, Fujii Masao wrote:

On Tue, Feb 15, 2011 at 7:13 AM, Daniel Farina <daniel@heroku.com> wrote:

On Mon, Feb 14, 2011 at 12:48 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sat, Feb 12, 2011 at 8:58 AM, Daniel Farina <daniel@heroku.com> wrote:

Context diff equivalent attached.

Thanks for the patch!

As I said before, the timeout which this patch provides doesn't work well
when the walsender gets blocked in sending WAL. At first, we would
need to implement a non-blocking write function as an infrastructure
of the replication timeout, I think.
http://archives.postgresql.org/message-id/AANLkTi%3DPu2ne%3DVO-%2BCLMXLQh9y85qumLCbBP15CjnyUS%40mail.gmail.com

Interesting point...if that's accepted as required-for-commit, what
are the perceptions of the odds that, presuming I can write the code
quickly enough, that there's enough infrastructure/ports already in
postgres to allow for a non-blocking write on all our supported
platforms?

I'm not sure if there's already enough infrastructure for a non-blocking
write. But the patch which I submitted before might help to implement that.
http://archives.postgresql.org/message-id/AANLkTinSvcdAYryNfZqd0wepyh1Pf7YX6Q0KxhZjas6a%40mail.gmail.com

So, in summary, the position is that we have a timeout, but that timeout
doesn't work in all cases. But it does work in some, so that seems
enough for me to say "let's commit". Not committing gives us nothing at
all, which is as much use as a chocolate teapot.

I will be looking to commit this tomorrow morning, unless I hear some
clear No comments, with reasons.

I guess the question is whether it works in 10% of cases or 95% of
cases. In the first case there's probably no point in pretending we
have a feature if it doesn't really work. In the second case, it
might make sense. But I don't have a good feeling for which it is.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#16Josh Berkus
josh@agliodbs.com
In reply to: Simon Riggs (#14)
Re: Replication server timeout patch

So, in summary, the position is that we have a timeout, but that timeout
doesn't work in all cases. But it does work in some, so that seems
enough for me to say "let's commit". Not committing gives us nothing at
all, which is as much use as a chocolate teapot.

Can someone summarize the cases where it does and doesn't work?
There's been a longish gap in this thread.

--
-- Josh Berkus
PostgreSQL Experts Inc.
http://www.pgexperts.com

#17Simon Riggs
simon@2ndQuadrant.com
In reply to: Robert Haas (#15)
Re: Replication server timeout patch

On Thu, 2011-02-17 at 16:42 -0500, Robert Haas wrote:

So, in summary, the position is that we have a timeout, but that timeout
doesn't work in all cases. But it does work in some, so that seems
enough for me to say "let's commit". Not committing gives us nothing at
all, which is as much use as a chocolate teapot.

I will be looking to commit this tomorrow morning, unless I hear some
clear No comments, with reasons.

I guess the question is whether it works in 10% of cases or 95% of
cases. In the first case there's probably no point in pretending we
have a feature if it doesn't really work. In the second case, it
might make sense. But I don't have a good feeling for which it is.

Well, I guess the people that wanted to wait forever may get their wish.

For sync rep, I intend to put in place a client timeout, which we do
have code for. The server side timeout still makes sense, but it's not a
requirement for sync rep.

--
Simon Riggs http://www.2ndQuadrant.com/books/
PostgreSQL Development, 24x7 Support, Training and Services

#18Fujii Masao
masao.fujii@gmail.com
In reply to: Josh Berkus (#16)
Re: Replication server timeout patch

On Fri, Feb 18, 2011 at 7:55 AM, Josh Berkus <josh@agliodbs.com> wrote:

So, in summary, the position is that we have a timeout, but that timeout
doesn't work in all cases. But it does work in some, so that seems
enough for me to say "let's commit". Not committing gives us nothing at
all, which is as much use as a chocolate teapot.

Can someone summarize the cases where it does and doesn't work?
There's been a longish gap in this thread.

The timeout doesn't work when walsender gets blocked during sending the
WAL because the send buffer has been filled up, I'm afraid. IOW, it doesn't
work when the standby becomes unresponsive while WAL is generated on
the master one after another. Since walsender tries to continue sending the
WAL while the standby is unresponsive, the send buffer gets filled up and
the blocking send function (e.g., pq_flush) blocks the walsender.

OTOH, if the standby becomes unresponsive when there is no workload
which causes WAL, the timeout would work.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#19Robert Haas
robertmhaas@gmail.com
In reply to: Fujii Masao (#18)
Re: Replication server timeout patch

On Thu, Feb 17, 2011 at 9:10 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Fri, Feb 18, 2011 at 7:55 AM, Josh Berkus <josh@agliodbs.com> wrote:

So, in summary, the position is that we have a timeout, but that timeout
doesn't work in all cases. But it does work in some, so that seems
enough for me to say "let's commit". Not committing gives us nothing at
all, which is as much use as a chocolate teapot.

Can someone summarize the cases where it does and doesn't work?
There's been a longish gap in this thread.

The timeout doesn't work when walsender gets blocked during sending the
WAL because the send buffer has been filled up, I'm afraid. IOW, it doesn't
work when the standby becomes unresponsive while WAL is generated on
the master one after another. Since walsender tries to continue sending the
WAL while the standby is unresponsive, the send buffer gets filled up and
the blocking send function (e.g., pq_flush) blocks the walsender.

OTOH, if the standby becomes unresponsive when there is no workload
which causes WAL, the timeout would work.

IMHO, that's so broken as to be useless.

I would really like to have a solution to this problem, though.
Relying on TCP keepalives is weak.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#20Fujii Masao
masao.fujii@gmail.com
In reply to: Robert Haas (#19)
1 attachment(s)
Re: Replication server timeout patch

On Fri, Feb 18, 2011 at 12:10 PM, Robert Haas <robertmhaas@gmail.com> wrote:

IMHO, that's so broken as to be useless.

I would really like to have a solution to this problem, though.
Relying on TCP keepalives is weak.

Agreed.

I updated the replication timeout patch which I submitted before.
http://archives.postgresql.org/message-id/AANLkTinSvcdAYryNfZqd0wepyh1Pf7YX6Q0KxhZjas6a%40mail.gmail.com

Since the patch implements also non-blocking send functions,
the timeout can work properly even when the send buffer has
been filled up.

There are two things that I think are pretty clear. If the receiver
has wal_receiver_status_interval=0, then we should ignore
replication_timeout for that connection.

The patch still doesn't check that wal_receiver_status_interval
is set up properly. I'll implement that later.

Regards,

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Attachments:

replication_timeout_v2.patchtext/x-patch; charset=US-ASCII; name=replication_timeout_v2.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2015,2020 **** SET ENABLE_SEQSCAN TO OFF;
--- 2015,2042 ----
         </para>
        </listitem>
       </varlistentry>
+ 
+      <varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
+       <term><varname>replication_timeout</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Specifies the maximum time, in milliseconds, to wait for the reply
+         from the standby before terminating replication.  This is useful for
+         the primary server to detect the standby crash or network outage.
+         A value of zero (the default) turns this off.  This parameter can
+         only be set in the <filename>postgresql.conf</> file or on the server
+         command line.
+        </para>
+        <para>
+         To make the timeout work properly, <xref linkend="guc-wal-receiver-status-interval">
+         must be enabled on the standby, and its value must be less than the
+         value of <varname>replication_timeout</>.
+        </para>
+       </listitem>
+      </varlistentry>
       </variablelist>
      </sect2>
  
***************
*** 2125,2130 **** SET ENABLE_SEQSCAN TO OFF;
--- 2147,2157 ----
         the <filename>postgresql.conf</> file or on the server command line.
         The default value is 10 seconds.
        </para>
+       <para>
+        When <xref linkend="guc-replication-timeout"> is enabled on the primary,
+        <varname>wal_receiver_status_interval</> must be enabled, and its value
+        must be less than the value of <varname>replication_timeout</>.
+       </para>
        </listitem>
       </varlistentry>
  
*** a/src/backend/libpq/pqcomm.c
--- b/src/backend/libpq/pqcomm.c
***************
*** 56,61 ****
--- 56,63 ----
   *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
   *		pq_flush		- flush pending output
   *		pq_getbyte_if_available - get a byte if available without blocking
+  *		pq_putbytes_if_writable	- send bytes to connection if writable without blocking
+  *		pq_flush_if_writable	- flush pending output if writable without blocking
   *
   * message-level I/O (and old-style-COPY-OUT cruft):
   *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
***************
*** 112,117 **** static char sock_path[MAXPGPATH];
--- 114,120 ----
  
  static char PqSendBuffer[PQ_BUFFER_SIZE];
  static int	PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+ static int	PqSendStart;		/* Next index to send a byte in PqSendBuffer */
  
  static char PqRecvBuffer[PQ_BUFFER_SIZE];
  static int	PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
***************
*** 128,133 **** static bool DoingCopyOut;
--- 131,137 ----
  static void pq_close(int code, Datum arg);
  static int	internal_putbytes(const char *s, size_t len);
  static int	internal_flush(void);
+ static int	internal_flush_if_writable(void);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
***************
*** 1153,1158 **** internal_putbytes(const char *s, size_t len)
--- 1157,1212 ----
  }
  
  /* --------------------------------
+  *		pq_putbytes_if_writable - send bytes to connection (not flushed
+  *			until pq_flush), if writable
+  *
+  * Returns the number of bytes written without blocking, or EOF if trouble.
+  * --------------------------------
+  */
+ int
+ pq_putbytes_if_writable(const char *s, size_t len)
+ {
+ 	size_t		amount;
+ 	size_t		nwritten = 0;
+ 
+ 	/* Should not be called by old-style COPY OUT */
+ 	Assert(!DoingCopyOut);
+ 	/* No-op if reentrant call */
+ 	if (PqCommBusy)
+ 		return 0;
+ 	PqCommBusy = true;
+ 
+ 	while (len > 0)
+ 	{
+ 		/* If buffer is full, then flush it out */
+ 		if (PqSendPointer >= PQ_BUFFER_SIZE)
+ 		{
+ 			int		r;
+ 
+ 			r = internal_flush_if_writable();
+ 			if (r == 0)
+ 				break;
+ 			if (r == EOF)
+ 			{
+ 				PqCommBusy = false;
+ 				return r;
+ 			}
+ 		}
+ 		amount = PQ_BUFFER_SIZE - PqSendPointer;
+ 		if (amount > len)
+ 			amount = len;
+ 		memcpy(PqSendBuffer + PqSendPointer, s, amount);
+ 		PqSendPointer += amount;
+ 		s += amount;
+ 		len -= amount;
+ 		nwritten += amount;
+ 	}
+ 
+ 	PqCommBusy = false;
+ 	return (int) nwritten;
+ }
+ 
+ /* --------------------------------
   *		pq_flush		- flush pending output
   *
   *		returns 0 if OK, EOF if trouble
***************
*** 1224,1229 **** internal_flush(void)
--- 1278,1411 ----
  	return 0;
  }
  
+ /* --------------------------------
+  *		pq_flush_if_writable - flush pending output if writable
+  *
+  * Returns 1 if OK, 0 if pending output cannot be written without blocking,
+  * or EOF if trouble.
+  * --------------------------------
+  */
+ int
+ pq_flush_if_writable(void)
+ {
+ 	int			res;
+ 
+ 	/* No-op if reentrant call */
+ 	if (PqCommBusy)
+ 		return 0;
+ 	PqCommBusy = true;
+ 	res = internal_flush_if_writable();
+ 	PqCommBusy = false;
+ 	return res;
+ }
+ 
+ int
+ internal_flush_if_writable(void)
+ {
+ 	static int	last_reported_send_errno = 0;
+ 
+ 	char	   *bufptr = PqSendBuffer + PqSendStart;
+ 	char	   *bufend = PqSendBuffer + PqSendPointer;
+ 
+ 	while (bufptr < bufend)
+ 	{
+ 		int			r;
+ 
+ 		/* Temporarily put the socket into non-blocking mode */
+ #ifdef WIN32
+ 		pgwin32_noblock = 1;
+ #else
+ 		if (!pg_set_noblock(MyProcPort->sock))
+ 			ereport(ERROR,
+ 					(errmsg("could not set socket to non-blocking mode: %m")));
+ #endif
+ 		MyProcPort->noblock = true;
+ 		PG_TRY();
+ 		{
+ 			r = secure_write(MyProcPort, bufptr, bufend - bufptr);
+ 
+ 			if (r < 0)
+ 			{
+ 				/*
+ 				 * Ok if no data writable without blocking or interrupted (though
+ 				 * EINTR really shouldn't happen with a non-blocking socket).
+ 				 * Report other errors.
+ 				 */
+ 				if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
+ 					r = 0;
+ 				else
+ 				{
+ 					if (errno != last_reported_send_errno)
+ 					{
+ 						/*
+ 						 * Careful: an ereport() that tries to write to the
+ 						 * client would cause recursion to here, leading to
+ 						 * stack overflow and core dump!  This message must
+ 						 * go *only* to the postmaster log.
+ 						 *
+ 						 * If a client disconnects while we're in the midst
+ 						 * of output, we might write quite a bit of data before
+ 						 * we get to a safe query abort point.  So, suppress
+ 						 * duplicate log messages.
+ 						 */
+ 						last_reported_send_errno = errno;
+ 						ereport(COMMERROR,
+ 								(errcode_for_socket_access(),
+ 								 errmsg("could not send data to client: %m")));
+ 					}
+ 
+ 					/*
+ 					 * We drop the buffered data anyway so that processing can
+ 					 * continue, even though we'll probably quit soon.
+ 					 */
+ 					PqSendStart = PqSendPointer = 0;
+ 					r = EOF;
+ 				}
+ 			}
+ 			else if (r == 0)
+ 			{
+ 				/* EOF detected */
+ 				r = EOF;
+ 			}
+ 		}
+ 		PG_CATCH();
+ 		{
+ 			/*
+ 			 * The rest of the backend code assumes the socket is in blocking
+ 			 * mode, so treat failure as FATAL.
+ 			 */
+ #ifdef WIN32
+ 			pgwin32_noblock = 0;
+ #else
+ 			if (!pg_set_block(MyProcPort->sock))
+ 				ereport(FATAL,
+ 						(errmsg("could not set socket to blocking mode: %m")));
+ #endif
+ 			MyProcPort->noblock = false;
+ 			PG_RE_THROW();
+ 		}
+ 		PG_END_TRY();
+ #ifdef WIN32
+ 		pgwin32_noblock = 0;
+ #else
+ 		if (!pg_set_block(MyProcPort->sock))
+ 			ereport(FATAL,
+ 					(errmsg("could not set socket to blocking mode: %m")));
+ #endif
+ 		MyProcPort->noblock = false;
+ 
+ 		if (r == 0 || r == EOF)
+ 			return r;
+ 
+ 		last_reported_send_errno = 0;	/* reset after any successful send */
+ 		bufptr += r;
+ 		PqSendStart += r;
+ 	}
+ 
+ 	PqSendStart = PqSendPointer = 0;
+ 	return 1;
+ }
+ 
  
  /* --------------------------------
   * Message-level I/O routines begin here.
*** a/src/backend/port/unix_latch.c
--- b/src/backend/port/unix_latch.c
***************
*** 193,211 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
!  * was set, or 2 if the scoket became readable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
  	int			rc;
  	int			result = 0;
  
--- 193,214 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading or writing. Returns 0 if timeout was reached,
!  * 1 if the latch was set, 2 if the scoket became readable, or 3 if
!  * the socket became writable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
+ 	fd_set		output_mask;
  	int			rc;
  	int			result = 0;
  
***************
*** 241,254 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
--- 244,265 ----
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET && forRead)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		FD_ZERO(&output_mask);
! 		if (sock != PGINVALID_SOCKET && forWrite)
! 		{
! 			FD_SET(sock, &output_mask);
! 			if (sock > hifd)
! 				hifd = sock;
! 		}
! 
! 		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
***************
*** 263,273 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
  	}
  	waiting = false;
  
--- 274,291 ----
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && forRead &&
! 			FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
+ 		if (sock != PGINVALID_SOCKET && forWrite &&
+ 			FD_ISSET(sock, &output_mask))
+ 		{
+ 			result = 3;
+ 			break;		/* data writable in socket */
+ 		}
  	}
  	waiting = false;
  
*** a/src/backend/port/win32/socket.c
--- b/src/backend/port/win32/socket.c
***************
*** 14,20 ****
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() should operate in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
--- 14,21 ----
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() and pgwin32_send() should operate
!  * in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
***************
*** 399,404 **** pgwin32_send(SOCKET s, char *buf, int len, int flags)
--- 400,415 ----
  			return -1;
  		}
  
+ 		if (pgwin32_noblock)
+ 		{
+ 			/*
+ 			 * No data sent, and we are in "emulated non-blocking mode", so
+ 			 * return indicating that we'd block if we were to continue.
+ 			 */
+ 			errno = EWOULDBLOCK;
+ 			return -1;
+ 		}
+ 
  		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
  
  		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
*** a/src/backend/port/win32_latch.c
--- b/src/backend/port/win32_latch.c
***************
*** 85,95 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
--- 85,96 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
***************
*** 103,112 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET)
  	{
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, FD_READ);
  		events[numevents++] = sockevent;
  	}
  
--- 104,120 ----
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
+ 		int		flags = 0;
+ 
+ 		if (forRead)
+ 			flags |= FD_READ;
+ 		if (forWrite)
+ 			flags |= FD_WRITE;
+ 
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, flags);
  		events[numevents++] = sockevent;
  	}
  
***************
*** 139,146 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
  			Assert(sock != PGINVALID_SOCKET);
! 			result = 2;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
--- 147,165 ----
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
+ 			WSANETWORKEVENTS resEvents;
+ 
  			Assert(sock != PGINVALID_SOCKET);
! 
! 			ZeroMemory(&resEvents, sizeof(resEvents));
! 			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
! 				ereport(FATAL,
! 						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
! 
! 			if (forRead && resEvents.lNetworkEvents & FD_READ)
! 				result = 2;
! 			if (forWrite && resEvents.lNetworkEvents & FD_WRITE)
! 				result = 3;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
***************
*** 148,154 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	}
  
  	/* Clean up the handle we created for the socket */
! 		if (sock != PGINVALID_SOCKET)
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
--- 167,173 ----
  	}
  
  	/* Clean up the handle we created for the socket */
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 74,79 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 74,91 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 1000;	/* max sleep time between some actions */
+ int			replication_timeout = 0;	/* maximum time to send one WAL data message */
+ 
+ /*
+  * Buffer for WAL sending
+  *
+  * WalSndOutBuffer is a work area in which the output message is constructed.
+  * It's used in just so we can avoid re-palloc'ing the buffer on each cycle.
+  * It must be of size 6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
+  */
+ static char	   *WalSndOutBuffer;
+ static int		WalSndOutHead;		/* head of pending output */
+ static int		WalSndOutTail;		/* tail of pending output */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 95,100 **** static XLogRecPtr sentPtr = {0, 0};
--- 107,117 ----
   */
  static StringInfoData reply_message;
  
+ /*
+  * Timestamp of the last receipt of the reply from the standby.
+  */
+ static TimestampTz last_reply_timestamp;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 113,119 **** static int	WalSndLoop(void);
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(char *msgbuf, bool *caughtup);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
--- 130,136 ----
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(bool *caughtup, bool *pending);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
***************
*** 467,472 **** ProcessRepliesIfAny(void)
--- 484,490 ----
  {
  	unsigned char firstchar;
  	int			r;
+ 	int		received = false;
  
  	for (;;)
  	{
***************
*** 479,487 **** ProcessRepliesIfAny(void)
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)
  		{
! 			/* no data available without blocking */
  			return;
  		}
  
--- 497,510 ----
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)	/* no data available without blocking */
  		{
! 			/*
! 			 * Save the last reply timestamp if we've received at least
! 			 * one reply.
! 			 */
! 			if (received)
! 				last_reply_timestamp = GetCurrentTimestamp();
  			return;
  		}
  
***************
*** 493,498 **** ProcessRepliesIfAny(void)
--- 516,522 ----
  				 */
  			case 'd':
  				ProcessStandbyMessage();
+ 				received = true;
  				break;
  
  				/*
***************
*** 669,683 **** ProcessStandbyHSFeedbackMessage(void)
  static int
  WalSndLoop(void)
  {
- 	char	   *output_message;
  	bool		caughtup = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
--- 693,708 ----
  static int
  WalSndLoop(void)
  {
  	bool		caughtup = false;
+ 	bool		pending = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	WalSndOutBuffer = palloc(6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
! 	WalSndOutHead = WalSndOutTail = 0;
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
***************
*** 685,690 **** WalSndLoop(void)
--- 710,718 ----
  	 */
  	initStringInfo(&reply_message);
  
+ 	/* Initialize the last reply timestamp */
+ 	last_reply_timestamp = GetCurrentTimestamp();
+ 
  	/* Loop forever, unless we get an error */
  	for (;;)
  	{
***************
*** 708,717 **** WalSndLoop(void)
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup)
  				walsender_shutdown_requested = true;
  		}
  
--- 736,745 ----
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup && !pending)
  				walsender_shutdown_requested = true;
  		}
  
***************
*** 726,735 **** WalSndLoop(void)
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round, nap for the
! 		 * configured time before retrying.
  		 */
! 		if (caughtup)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
--- 754,764 ----
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round or could not
! 		 * flush pending WAL in output buffer because the socket was not
! 		 * writable, nap for the configured time before retrying.
  		 */
! 		if (caughtup || pending)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
***************
*** 740,764 **** WalSndLoop(void)
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  		}
  
--- 769,828 ----
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(&caughtup, &pending))
  				break;
! 			if ((caughtup || pending) && !got_SIGHUP && !walsender_ready_to_stop &&
! 					!walsender_shutdown_requested)
  			{
+ 				TimestampTz	finish_time;
+ 				long		sleeptime;
+ 
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
+ 				/* Reschedule replication timeout */
+ 				if (replication_timeout > 0)
+ 				{
+ 					long		secs;
+ 					int		usecs;
+ 
+ 					finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
+ 											replication_timeout);
+ 					TimestampDifference(GetCurrentTimestamp(),
+ 								finish_time, &secs, &usecs);
+ 					sleeptime = secs * 1000 + usecs / 1000;
+ 					if (WalSndDelay < sleeptime)
+ 						sleeptime = WalSndDelay;
+ 				}
+ 				else
+ 					sleeptime = WalSndDelay;
+ 
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  true, (WalSndOutTail > 0),
! 								  sleeptime * 1000L);
! 
! 				/* Check for replication timeout */
! 				if (replication_timeout > 0 && GetCurrentTimestamp() >= finish_time)
! 				{
! 					/*
! 					 * Since typically expiration of replication timeout means
! 					 * communication problem, we don't send the error message
! 					 * to the standby.
! 					 */
! 					ereport(COMMERROR,
! 							(errmsg("terminating walsender process due to replication timeout")));
! 					break;
! 				}
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  		}
  
***************
*** 986,1009 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
-  * msgbuf is a work area in which the output message is constructed.  It's
-  * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
-  * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
-  *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(char *msgbuf, bool *caughtup)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	XLogRecPtr	endptr;
  	Size		nbytes;
  	WalDataMessageHeader msghdr;
  
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
--- 1050,1097 ----
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
+  * If there is pending WAL in output buffer, *pending is set to true,
+  * otherwise *pending is set to false.
+  *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(bool *caughtup, bool *pending)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	static XLogRecPtr	endptr;
  	Size		nbytes;
+ 	uint32		n32;
+ 	int			res;
  	WalDataMessageHeader msghdr;
  
+ 	/* Attempt to flush pending WAL in output buffer */
+ 	if (*pending)
+ 	{
+ 		if (WalSndOutHead != WalSndOutTail)
+ 		{
+ 			res = pq_putbytes_if_writable(WalSndOutBuffer + WalSndOutHead,
+ 										  WalSndOutTail - WalSndOutHead);
+ 			if (res == EOF)
+ 				return false;
+ 			WalSndOutHead += res;
+ 			if (WalSndOutHead != WalSndOutTail)
+ 				return true;
+ 		}
+ 
+ 		res = pq_flush_if_writable();
+ 		if (res == EOF)
+ 			return false;
+ 		if (res == 0)
+ 			return true;
+ 
+ 		goto updt;
+ 	}
+ 
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
***************
*** 1072,1084 **** XLogSend(char *msgbuf, bool *caughtup)
  	/*
  	 * OK to read and send the slice.
  	 */
! 	msgbuf[0] = 'w';
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
--- 1160,1178 ----
  	/*
  	 * OK to read and send the slice.
  	 */
! 	WalSndOutBuffer[0] = 'd';
! 	WalSndOutBuffer[5] = 'w';
! 	WalSndOutHead = 0;
! 	WalSndOutTail = 6 + sizeof(WalDataMessageHeader) + nbytes;
! 
! 	n32 = htonl((uint32) WalSndOutTail - 1);
! 	memcpy(WalSndOutBuffer + 1, &n32, 4);
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(WalSndOutBuffer + 6 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
***************
*** 1088,1100 **** XLogSend(char *msgbuf, bool *caughtup)
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
  
! 	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
  
  	/* Flush pending output to the client */
! 	if (pq_flush())
  		return false;
  
  	sentPtr = endptr;
  
--- 1182,1215 ----
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(WalSndOutBuffer + 6, &msghdr, sizeof(WalDataMessageHeader));
  
! 	res = pq_putbytes_if_writable(WalSndOutBuffer, WalSndOutTail);
! 	if (res == EOF)
! 		return false;
! 
! 	WalSndOutHead = res;
! 	if (WalSndOutHead != WalSndOutTail)
! 	{
! 		*caughtup = false;
! 		*pending = true;
! 		return true;
! 	}
  
  	/* Flush pending output to the client */
! 	res = pq_flush_if_writable();
! 	if (res == EOF)
  		return false;
+ 	if (res == 0)
+ 	{
+ 		*caughtup = false;
+ 		*pending = true;
+ 		return true;
+ 	}
+ 
+ updt:
+ 	WalSndOutHead = WalSndOutTail = 0;
+ 	*pending = false;
  
  	sentPtr = endptr;
  
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1847,1852 **** static struct config_int ConfigureNamesInt[] =
--- 1847,1862 ----
  	},
  
  	{
+ 		{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
+ 			gettext_noop("Sets the maximum time to wait for WAL replication."),
+ 			NULL,
+ 			GUC_UNIT_MS
+ 		},
+ 		&replication_timeout,
+ 		0, 0, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
  			gettext_noop("Sets the delay in microseconds between transaction commit and "
  						 "flushing WAL to disk."),
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 191,196 ****
--- 191,197 ----
  #wal_sender_delay = 1s		# walsender cycle time, 1-10000 milliseconds
  #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
  #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+ #replication_timeout = 0 # in milliseconds, 0 is disabled
  
  # - Standby Servers -
  
*** a/src/include/libpq/libpq.h
--- b/src/include/libpq/libpq.h
***************
*** 59,65 **** extern int	pq_getbyte(void);
--- 59,67 ----
  extern int	pq_peekbyte(void);
  extern int	pq_getbyte_if_available(unsigned char *c);
  extern int	pq_putbytes(const char *s, size_t len);
+ extern int	pq_putbytes_if_writable(const char *s, size_t len);
  extern int	pq_flush(void);
+ extern int	pq_flush_if_writable(void);
  extern int	pq_putmessage(char msgtype, const char *s, size_t len);
  extern void pq_startcopyout(void);
  extern void pq_endcopyout(bool errorAbort);
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 70,75 **** extern volatile sig_atomic_t walsender_ready_to_stop;
--- 70,76 ----
  /* user-settable parameters */
  extern int	WalSndDelay;
  extern int	max_wal_senders;
+ extern int	replication_timeout;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
*** a/src/include/storage/latch.h
--- b/src/include/storage/latch.h
***************
*** 40,46 **** extern void OwnLatch(volatile Latch *latch);
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
--- 40,46 ----
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  bool forRead, bool forWrite, long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
#21Fujii Masao
masao.fujii@gmail.com
In reply to: Fujii Masao (#20)
1 attachment(s)
Re: Replication server timeout patch

On Sun, Feb 27, 2011 at 11:52 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

There are two things that I think are pretty clear.  If the receiver
has wal_receiver_status_interval=0, then we should ignore
replication_timeout for that connection.

The patch still doesn't check that wal_receiver_status_interval
is set up properly. I'll implement that later.

Done. I attached the updated patch.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Attachments:

replication_timeout_v3.patchapplication/octet-stream; name=replication_timeout_v3.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2015,2020 **** SET ENABLE_SEQSCAN TO OFF;
--- 2015,2044 ----
         </para>
        </listitem>
       </varlistentry>
+ 
+      <varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
+       <term><varname>replication_timeout</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Specifies the maximum time, in milliseconds, to wait for the reply
+         from the standby before terminating replication.  This is useful for
+         the primary server to detect the standby crash or network outage.
+         A value of zero (the default) turns this off.  This parameter can
+         only be set in the <filename>postgresql.conf</> file or on the server
+         command line.
+        </para>
+        <para>
+         To make the timeout work properly, <xref linkend="guc-wal-receiver-status-interval">
+         must be enabled on the standby, and its value must be less than the
+         value of <varname>replication_timeout</>.
+         If <varname>wal_receiver_status_interval</> is zero on the standby,
+         replication timeout is disabled for that connection.
+        </para>
+       </listitem>
+      </varlistentry>
       </variablelist>
      </sect2>
  
***************
*** 2125,2130 **** SET ENABLE_SEQSCAN TO OFF;
--- 2149,2159 ----
         the <filename>postgresql.conf</> file or on the server command line.
         The default value is 10 seconds.
        </para>
+       <para>
+        When <xref linkend="guc-replication-timeout"> is enabled on the primary,
+        <varname>wal_receiver_status_interval</> must be enabled, and its value
+        must be less than the value of <varname>replication_timeout</>.
+       </para>
        </listitem>
       </varlistentry>
  
*** a/doc/src/sgml/protocol.sgml
--- b/doc/src/sgml/protocol.sgml
***************
*** 1480,1485 **** The commands accepted in walsender mode are:
--- 1480,1533 ----
        <variablelist>
        <varlistentry>
        <term>
+           Standby parameter change (F)
+       </term>
+       <listitem>
+       <para>
+       <variablelist>
+       <varlistentry>
+       <term>
+           Byte1('g')
+       </term>
+       <listitem>
+       <para>
+           Identifies the message as a receiver parameter change.
+       </para>
+       </listitem>
+       </varlistentry>
+       <varlistentry>
+       <term>
+           Int32
+       </term>
+       <listitem>
+       <para>
+           The current value of wal_receiver_status_interval
+           in the standby.
+       </para>
+       </listitem>
+       </varlistentry>
+       <varlistentry>
+       <term>
+           Byte8
+       </term>
+       <listitem>
+       <para>
+           The server's system clock at the time of transmission,
+           given in TimestampTz format.
+       </para>
+       </listitem>
+       </varlistentry>
+       </variablelist>
+       </para>
+       </listitem>
+       </varlistentry>
+       </variablelist>
+      </para>
+ 
+      <para>
+       <variablelist>
+       <varlistentry>
+       <term>
            Standby status update (F)
        </term>
        <listitem>
*** a/src/backend/libpq/pqcomm.c
--- b/src/backend/libpq/pqcomm.c
***************
*** 56,61 ****
--- 56,63 ----
   *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
   *		pq_flush		- flush pending output
   *		pq_getbyte_if_available - get a byte if available without blocking
+  *		pq_putbytes_if_writable	- send bytes to connection if writable without blocking
+  *		pq_flush_if_writable	- flush pending output if writable without blocking
   *
   * message-level I/O (and old-style-COPY-OUT cruft):
   *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
***************
*** 112,117 **** static char sock_path[MAXPGPATH];
--- 114,120 ----
  
  static char PqSendBuffer[PQ_BUFFER_SIZE];
  static int	PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+ static int	PqSendStart;		/* Next index to send a byte in PqSendBuffer */
  
  static char PqRecvBuffer[PQ_BUFFER_SIZE];
  static int	PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
***************
*** 128,133 **** static bool DoingCopyOut;
--- 131,137 ----
  static void pq_close(int code, Datum arg);
  static int	internal_putbytes(const char *s, size_t len);
  static int	internal_flush(void);
+ static int	internal_flush_if_writable(void);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
***************
*** 1153,1158 **** internal_putbytes(const char *s, size_t len)
--- 1157,1212 ----
  }
  
  /* --------------------------------
+  *		pq_putbytes_if_writable - send bytes to connection (not flushed
+  *			until pq_flush), if writable
+  *
+  * Returns the number of bytes written without blocking, or EOF if trouble.
+  * --------------------------------
+  */
+ int
+ pq_putbytes_if_writable(const char *s, size_t len)
+ {
+ 	size_t		amount;
+ 	size_t		nwritten = 0;
+ 
+ 	/* Should not be called by old-style COPY OUT */
+ 	Assert(!DoingCopyOut);
+ 	/* No-op if reentrant call */
+ 	if (PqCommBusy)
+ 		return 0;
+ 	PqCommBusy = true;
+ 
+ 	while (len > 0)
+ 	{
+ 		/* If buffer is full, then flush it out */
+ 		if (PqSendPointer >= PQ_BUFFER_SIZE)
+ 		{
+ 			int		r;
+ 
+ 			r = internal_flush_if_writable();
+ 			if (r == 0)
+ 				break;
+ 			if (r == EOF)
+ 			{
+ 				PqCommBusy = false;
+ 				return r;
+ 			}
+ 		}
+ 		amount = PQ_BUFFER_SIZE - PqSendPointer;
+ 		if (amount > len)
+ 			amount = len;
+ 		memcpy(PqSendBuffer + PqSendPointer, s, amount);
+ 		PqSendPointer += amount;
+ 		s += amount;
+ 		len -= amount;
+ 		nwritten += amount;
+ 	}
+ 
+ 	PqCommBusy = false;
+ 	return (int) nwritten;
+ }
+ 
+ /* --------------------------------
   *		pq_flush		- flush pending output
   *
   *		returns 0 if OK, EOF if trouble
***************
*** 1224,1229 **** internal_flush(void)
--- 1278,1411 ----
  	return 0;
  }
  
+ /* --------------------------------
+  *		pq_flush_if_writable - flush pending output if writable
+  *
+  * Returns 1 if OK, 0 if pending output cannot be written without blocking,
+  * or EOF if trouble.
+  * --------------------------------
+  */
+ int
+ pq_flush_if_writable(void)
+ {
+ 	int			res;
+ 
+ 	/* No-op if reentrant call */
+ 	if (PqCommBusy)
+ 		return 0;
+ 	PqCommBusy = true;
+ 	res = internal_flush_if_writable();
+ 	PqCommBusy = false;
+ 	return res;
+ }
+ 
+ int
+ internal_flush_if_writable(void)
+ {
+ 	static int	last_reported_send_errno = 0;
+ 
+ 	char	   *bufptr = PqSendBuffer + PqSendStart;
+ 	char	   *bufend = PqSendBuffer + PqSendPointer;
+ 
+ 	while (bufptr < bufend)
+ 	{
+ 		int			r;
+ 
+ 		/* Temporarily put the socket into non-blocking mode */
+ #ifdef WIN32
+ 		pgwin32_noblock = 1;
+ #else
+ 		if (!pg_set_noblock(MyProcPort->sock))
+ 			ereport(ERROR,
+ 					(errmsg("could not set socket to non-blocking mode: %m")));
+ #endif
+ 		MyProcPort->noblock = true;
+ 		PG_TRY();
+ 		{
+ 			r = secure_write(MyProcPort, bufptr, bufend - bufptr);
+ 
+ 			if (r < 0)
+ 			{
+ 				/*
+ 				 * Ok if no data writable without blocking or interrupted (though
+ 				 * EINTR really shouldn't happen with a non-blocking socket).
+ 				 * Report other errors.
+ 				 */
+ 				if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
+ 					r = 0;
+ 				else
+ 				{
+ 					if (errno != last_reported_send_errno)
+ 					{
+ 						/*
+ 						 * Careful: an ereport() that tries to write to the
+ 						 * client would cause recursion to here, leading to
+ 						 * stack overflow and core dump!  This message must
+ 						 * go *only* to the postmaster log.
+ 						 *
+ 						 * If a client disconnects while we're in the midst
+ 						 * of output, we might write quite a bit of data before
+ 						 * we get to a safe query abort point.  So, suppress
+ 						 * duplicate log messages.
+ 						 */
+ 						last_reported_send_errno = errno;
+ 						ereport(COMMERROR,
+ 								(errcode_for_socket_access(),
+ 								 errmsg("could not send data to client: %m")));
+ 					}
+ 
+ 					/*
+ 					 * We drop the buffered data anyway so that processing can
+ 					 * continue, even though we'll probably quit soon.
+ 					 */
+ 					PqSendStart = PqSendPointer = 0;
+ 					r = EOF;
+ 				}
+ 			}
+ 			else if (r == 0)
+ 			{
+ 				/* EOF detected */
+ 				r = EOF;
+ 			}
+ 		}
+ 		PG_CATCH();
+ 		{
+ 			/*
+ 			 * The rest of the backend code assumes the socket is in blocking
+ 			 * mode, so treat failure as FATAL.
+ 			 */
+ #ifdef WIN32
+ 			pgwin32_noblock = 0;
+ #else
+ 			if (!pg_set_block(MyProcPort->sock))
+ 				ereport(FATAL,
+ 						(errmsg("could not set socket to blocking mode: %m")));
+ #endif
+ 			MyProcPort->noblock = false;
+ 			PG_RE_THROW();
+ 		}
+ 		PG_END_TRY();
+ #ifdef WIN32
+ 		pgwin32_noblock = 0;
+ #else
+ 		if (!pg_set_block(MyProcPort->sock))
+ 			ereport(FATAL,
+ 					(errmsg("could not set socket to blocking mode: %m")));
+ #endif
+ 		MyProcPort->noblock = false;
+ 
+ 		if (r == 0 || r == EOF)
+ 			return r;
+ 
+ 		last_reported_send_errno = 0;	/* reset after any successful send */
+ 		bufptr += r;
+ 		PqSendStart += r;
+ 	}
+ 
+ 	PqSendStart = PqSendPointer = 0;
+ 	return 1;
+ }
+ 
  
  /* --------------------------------
   * Message-level I/O routines begin here.
*** a/src/backend/port/unix_latch.c
--- b/src/backend/port/unix_latch.c
***************
*** 193,211 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
!  * was set, or 2 if the scoket became readable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
  	int			rc;
  	int			result = 0;
  
--- 193,214 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading or writing. Returns 0 if timeout was reached,
!  * 1 if the latch was set, 2 if the scoket became readable, or 3 if
!  * the socket became writable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
+ 	fd_set		output_mask;
  	int			rc;
  	int			result = 0;
  
***************
*** 241,254 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
--- 244,265 ----
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET && forRead)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		FD_ZERO(&output_mask);
! 		if (sock != PGINVALID_SOCKET && forWrite)
! 		{
! 			FD_SET(sock, &output_mask);
! 			if (sock > hifd)
! 				hifd = sock;
! 		}
! 
! 		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
***************
*** 263,273 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
  	}
  	waiting = false;
  
--- 274,291 ----
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && forRead &&
! 			FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
+ 		if (sock != PGINVALID_SOCKET && forWrite &&
+ 			FD_ISSET(sock, &output_mask))
+ 		{
+ 			result = 3;
+ 			break;		/* data writable in socket */
+ 		}
  	}
  	waiting = false;
  
*** a/src/backend/port/win32/socket.c
--- b/src/backend/port/win32/socket.c
***************
*** 14,20 ****
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() should operate in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
--- 14,21 ----
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() and pgwin32_send() should operate
!  * in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
***************
*** 399,404 **** pgwin32_send(SOCKET s, char *buf, int len, int flags)
--- 400,415 ----
  			return -1;
  		}
  
+ 		if (pgwin32_noblock)
+ 		{
+ 			/*
+ 			 * No data sent, and we are in "emulated non-blocking mode", so
+ 			 * return indicating that we'd block if we were to continue.
+ 			 */
+ 			errno = EWOULDBLOCK;
+ 			return -1;
+ 		}
+ 
  		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
  
  		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
*** a/src/backend/port/win32_latch.c
--- b/src/backend/port/win32_latch.c
***************
*** 85,95 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
--- 85,96 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
***************
*** 103,112 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET)
  	{
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, FD_READ);
  		events[numevents++] = sockevent;
  	}
  
--- 104,120 ----
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
+ 		int		flags = 0;
+ 
+ 		if (forRead)
+ 			flags |= FD_READ;
+ 		if (forWrite)
+ 			flags |= FD_WRITE;
+ 
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, flags);
  		events[numevents++] = sockevent;
  	}
  
***************
*** 139,146 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
  			Assert(sock != PGINVALID_SOCKET);
! 			result = 2;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
--- 147,165 ----
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
+ 			WSANETWORKEVENTS resEvents;
+ 
  			Assert(sock != PGINVALID_SOCKET);
! 
! 			ZeroMemory(&resEvents, sizeof(resEvents));
! 			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
! 				ereport(FATAL,
! 						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
! 
! 			if (forRead && resEvents.lNetworkEvents & FD_READ)
! 				result = 2;
! 			if (forWrite && resEvents.lNetworkEvents & FD_WRITE)
! 				result = 3;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
***************
*** 148,154 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	}
  
  	/* Clean up the handle we created for the socket */
! 		if (sock != PGINVALID_SOCKET)
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
--- 167,173 ----
  	}
  
  	/* Clean up the handle we created for the socket */
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
*** a/src/backend/replication/walreceiver.c
--- b/src/backend/replication/walreceiver.c
***************
*** 125,130 **** static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr);
--- 125,131 ----
  static void XLogWalRcvFlush(bool dying);
  static void XLogWalRcvSendReply(void);
  static void XLogWalRcvSendHSFeedback(void);
+ static void XLogWalRcvSendGUCChange(void);
  
  /* Signal handlers */
  static void WalRcvSigHupHandler(SIGNAL_ARGS);
***************
*** 276,281 **** WalReceiverMain(void)
--- 277,288 ----
  	walrcv_connect(conninfo, startpoint);
  	DisableWalRcvImmediateExit();
  
+ 	/*
+ 	 * Report the important parameters for streaming replication to
+ 	 * the primary.
+ 	 */
+ 	XLogWalRcvSendGUCChange();
+ 
  	/* Loop until end-of-streaming or error */
  	for (;;)
  	{
***************
*** 303,310 **** WalReceiverMain(void)
--- 310,325 ----
  
  		if (got_SIGHUP)
  		{
+ 			int	save_wal_receiver_status_interval =
+ 				wal_receiver_status_interval;
+ 
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/* If any of important parameters have changed, report them */
+ 			if (save_wal_receiver_status_interval !=
+ 					wal_receiver_status_interval)
+ 				XLogWalRcvSendGUCChange();
  		}
  
  		/* Wait a while for data to arrive */
***************
*** 702,704 **** XLogWalRcvSendHSFeedback(void)
--- 717,746 ----
  	memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage));
  	walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1);
  }
+ 
+ /*
+  * Send parameter change message to primary, plus the current time.
+  */
+ static void
+ XLogWalRcvSendGUCChange(void)
+ {
+ 	char			buf[sizeof(StandbyGUCChangeMessage) + 1];
+ 	TimestampTz		now;
+ 	StandbyGUCChangeMessage	guc_change_message;
+ 
+ 	/* Get current timestamp. */
+ 	now = GetCurrentTimestamp();
+ 
+ 	/* Construct a new message */
+ 	guc_change_message.wal_receiver_status_interval =
+ 		wal_receiver_status_interval;
+ 	guc_change_message.sendTime = now;
+ 
+ 	elog(DEBUG2, "sending parameter change wal_receiver_status_interval %d",
+ 			 wal_receiver_status_interval);
+ 
+ 	/* Prepend with the message type and send it. */
+ 	buf[0] = 'g';
+ 	memcpy(&buf[1], &guc_change_message, sizeof(StandbyGUCChangeMessage));
+ 	walrcv_send(buf, sizeof(StandbyGUCChangeMessage) + 1);
+ }
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 74,79 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 74,91 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 1000;	/* max sleep time between some actions */
+ int			replication_timeout = 0;	/* maximum time to send one WAL data message */
+ 
+ /*
+  * Buffer for WAL sending
+  *
+  * WalSndOutBuffer is a work area in which the output message is constructed.
+  * It's used in just so we can avoid re-palloc'ing the buffer on each cycle.
+  * It must be of size 6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
+  */
+ static char	   *WalSndOutBuffer;
+ static int		WalSndOutHead;		/* head of pending output */
+ static int		WalSndOutTail;		/* tail of pending output */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 95,100 **** static XLogRecPtr sentPtr = {0, 0};
--- 107,123 ----
   */
  static StringInfoData reply_message;
  
+ /*
+  * Timestamp of the last receipt of the reply from the standby.
+  */
+ static TimestampTz last_reply_timestamp;
+ 
+ /*
+  * The value of wal_receiver_status_interval on the standby.
+  * If this is zero, we disable replication timeout.
+  */
+ static int	standby_wal_receiver_status_interval = 10;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 113,124 **** static int	WalSndLoop(void);
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(char *msgbuf, bool *caughtup);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
  static void ProcessStandbyReplyMessage(void);
  static void ProcessStandbyHSFeedbackMessage(void);
  static void ProcessRepliesIfAny(void);
  
  
--- 136,149 ----
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(bool *caughtup, bool *pending);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
  static void ProcessStandbyReplyMessage(void);
  static void ProcessStandbyHSFeedbackMessage(void);
+ static void ProcessStandbyGUCChangeMessage(void);
+ static void ValidateReplicationTimeout(bool standby);
  static void ProcessRepliesIfAny(void);
  
  
***************
*** 214,219 **** WalSndHandshake(void)
--- 239,250 ----
  		{
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/*
+ 			 * Don't need to verify replication_timeout here because we
+ 			 * don't receive any parameter change message until we enter
+ 			 * streaming mode.
+ 			 */
  		}
  
  		if (firstchar != EOF)
***************
*** 467,472 **** ProcessRepliesIfAny(void)
--- 498,504 ----
  {
  	unsigned char firstchar;
  	int			r;
+ 	int		received = false;
  
  	for (;;)
  	{
***************
*** 479,487 **** ProcessRepliesIfAny(void)
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)
  		{
! 			/* no data available without blocking */
  			return;
  		}
  
--- 511,524 ----
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)	/* no data available without blocking */
  		{
! 			/*
! 			 * Save the last reply timestamp if we've received at least
! 			 * one reply.
! 			 */
! 			if (received)
! 				last_reply_timestamp = GetCurrentTimestamp();
  			return;
  		}
  
***************
*** 493,498 **** ProcessRepliesIfAny(void)
--- 530,536 ----
  				 */
  			case 'd':
  				ProcessStandbyMessage();
+ 				received = true;
  				break;
  
  				/*
***************
*** 511,517 **** ProcessRepliesIfAny(void)
  }
  
  /*
!  * Process a status update message received from standby.
   */
  static void
  ProcessStandbyMessage(void)
--- 549,555 ----
  }
  
  /*
!  * Process a message received from standby.
   */
  static void
  ProcessStandbyMessage(void)
***************
*** 547,552 **** ProcessStandbyMessage(void)
--- 585,594 ----
  			ProcessStandbyHSFeedbackMessage();
  			break;
  
+ 		case 'g':
+ 			ProcessStandbyGUCChangeMessage();
+ 			break;
+ 
  		default:
  			ereport(COMMERROR,
  					(errcode(ERRCODE_PROTOCOL_VIOLATION),
***************
*** 665,683 **** ProcessStandbyHSFeedbackMessage(void)
  	}
  }
  
  /* Main loop of walsender process */
  static int
  WalSndLoop(void)
  {
- 	char	   *output_message;
  	bool		caughtup = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
--- 707,805 ----
  	}
  }
  
+ /*
+  * Parameter change report from standby
+  */
+ static void
+ ProcessStandbyGUCChangeMessage(void)
+ {
+ 	StandbyGUCChangeMessage	reply;
+ 
+ 	pq_copymsgbytes(&reply_message, (char *) &reply, sizeof(StandbyGUCChangeMessage));
+ 
+ 	elog(DEBUG2, "standby's wal_receiver_status_interval %d",
+ 			 reply.wal_receiver_status_interval);
+ 
+ 	/* If we don't use replication timeout, we don't need to do any more */
+ 	if (replication_timeout <= 0)
+ 		return;
+ 
+ 	/*
+ 	 * If wal_receiver_status_interval is zero in the standby,
+ 	 * we ignore replication timeout for that connection.
+ 	 */
+ 	if (standby_wal_receiver_status_interval > 0 &&
+ 			reply.wal_receiver_status_interval <= 0)
+ 	{
+ 		ereport(LOG,
+ 						(errmsg("replication timeout is disabled because wal_receiver_status_interval is zero on the standby")));
+ 	}
+ 	else if (standby_wal_receiver_status_interval <= 0 &&
+ 					 reply.wal_receiver_status_interval > 0)
+ 	{
+ 		ereport(LOG,
+ 						(errmsg("replication timeout is enabled because wal_receiver_status_interval is greater than zero on the standby")));
+ 	}
+ 
+ 	standby_wal_receiver_status_interval = reply.wal_receiver_status_interval;
+ 
+ 	/* Verify whether replication_timeout is large enough */
+ 	ValidateReplicationTimeout(true);
+ }
+ 
+ /*
+  * Verify whether replication_timeout is large enough for the timeout.
+  *
+  * This is called when either replication_timeout is changed on the
+  * master or wal_receiver_status_interval is changed on the standby.
+  * In the former case, 'standby' must be set to FALSE, otherwise TRUE.
+  */
+ static void
+ ValidateReplicationTimeout(bool standby)
+ {
+ 	static bool	skip_validation = true;
+ 
+ 	/*
+ 	 * If we've not received any parameter change message from the standby
+ 	 * yet, we cannot verify replication_timeout properly. In that case,
+ 	 * we postpone the validation until the message has arrived. This can
+ 	 * happen only when replication_timeout is changed by SIGHUP signal
+ 	 * before any parameter change message arrives.
+ 	 */
+ 	if (standby)
+ 		skip_validation = false;
+ 
+ 	if (skip_validation)
+ 		return;
+ 
+ 	/*
+ 	 * Emit WARNING message if replication_timeout in the primary
+ 	 * is less than wal_receiver_status_interval in the standby
+ 	 * because unexpected timeout can happen in that case.
+ 	 */
+ 	if (replication_timeout > 0 &&
+ 			standby_wal_receiver_status_interval > 0 &&
+ 			replication_timeout <= standby_wal_receiver_status_interval * 1000)
+ 		ereport(WARNING,
+ 						(errmsg("replication can be terminated unexpectedly because replication_timeout (%d milliseconds) on the master is less than wal_receiver_status_interval (%d seconds) on the standby",
+ 										replication_timeout, standby_wal_receiver_status_interval),
+ 						 errhint("Either increase replication_timeout on the master, or decrease wal_receiver_status_interval on the standby.")));
+ }
+ 
  /* Main loop of walsender process */
  static int
  WalSndLoop(void)
  {
  	bool		caughtup = false;
+ 	bool		pending = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	WalSndOutBuffer = palloc(6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
! 	WalSndOutHead = WalSndOutTail = 0;
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
***************
*** 685,690 **** WalSndLoop(void)
--- 807,815 ----
  	 */
  	initStringInfo(&reply_message);
  
+ 	/* Initialize the last reply timestamp */
+ 	last_reply_timestamp = GetCurrentTimestamp();
+ 
  	/* Loop forever, unless we get an error */
  	for (;;)
  	{
***************
*** 700,705 **** WalSndLoop(void)
--- 825,833 ----
  		{
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/* Verify whether replication_timeout is large enough */
+ 			ValidateReplicationTimeout(false);
  		}
  
  		/*
***************
*** 708,717 **** WalSndLoop(void)
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup)
  				walsender_shutdown_requested = true;
  		}
  
--- 836,845 ----
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup && !pending)
  				walsender_shutdown_requested = true;
  		}
  
***************
*** 726,735 **** WalSndLoop(void)
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round, nap for the
! 		 * configured time before retrying.
  		 */
! 		if (caughtup)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
--- 854,864 ----
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round or could not
! 		 * flush pending WAL in output buffer because the socket was not
! 		 * writable, nap for the configured time before retrying.
  		 */
! 		if (caughtup || pending)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
***************
*** 740,764 **** WalSndLoop(void)
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  		}
  
--- 869,931 ----
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(&caughtup, &pending))
  				break;
! 			if ((caughtup || pending) && !got_SIGHUP && !walsender_ready_to_stop &&
! 					!walsender_shutdown_requested)
  			{
+ 				TimestampTz	finish_time;
+ 				long		sleeptime;
+ 
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
+ 				/* Reschedule replication timeout */
+ 				if (replication_timeout > 0 &&
+ 						standby_wal_receiver_status_interval > 0)
+ 				{
+ 					long		secs;
+ 					int		usecs;
+ 
+ 					finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
+ 											replication_timeout);
+ 					TimestampDifference(GetCurrentTimestamp(),
+ 								finish_time, &secs, &usecs);
+ 					sleeptime = secs * 1000 + usecs / 1000;
+ 					if (WalSndDelay < sleeptime)
+ 						sleeptime = WalSndDelay;
+ 				}
+ 				else
+ 					sleeptime = WalSndDelay;
+ 
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  true, (WalSndOutTail > 0),
! 								  sleeptime * 1000L);
! 
! 				/* Check for replication timeout */
! 				if (replication_timeout > 0 &&
! 						standby_wal_receiver_status_interval > 0 &&
! 						GetCurrentTimestamp() >= finish_time)
! 				{
! 					/*
! 					 * Since typically expiration of replication timeout means
! 					 * communication problem, we don't send the error message
! 					 * to the standby.
! 					 */
! 					ereport(COMMERROR,
! 							(errmsg("terminating walsender process due to replication timeout")));
! 					break;
! 				}
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  		}
  
***************
*** 986,1009 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
-  * msgbuf is a work area in which the output message is constructed.  It's
-  * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
-  * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
-  *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(char *msgbuf, bool *caughtup)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	XLogRecPtr	endptr;
  	Size		nbytes;
  	WalDataMessageHeader msghdr;
  
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
--- 1153,1200 ----
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
+  * If there is pending WAL in output buffer, *pending is set to true,
+  * otherwise *pending is set to false.
+  *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(bool *caughtup, bool *pending)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	static XLogRecPtr	endptr;
  	Size		nbytes;
+ 	uint32		n32;
+ 	int			res;
  	WalDataMessageHeader msghdr;
  
+ 	/* Attempt to flush pending WAL in output buffer */
+ 	if (*pending)
+ 	{
+ 		if (WalSndOutHead != WalSndOutTail)
+ 		{
+ 			res = pq_putbytes_if_writable(WalSndOutBuffer + WalSndOutHead,
+ 										  WalSndOutTail - WalSndOutHead);
+ 			if (res == EOF)
+ 				return false;
+ 			WalSndOutHead += res;
+ 			if (WalSndOutHead != WalSndOutTail)
+ 				return true;
+ 		}
+ 
+ 		res = pq_flush_if_writable();
+ 		if (res == EOF)
+ 			return false;
+ 		if (res == 0)
+ 			return true;
+ 
+ 		goto updt;
+ 	}
+ 
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
***************
*** 1072,1084 **** XLogSend(char *msgbuf, bool *caughtup)
  	/*
  	 * OK to read and send the slice.
  	 */
! 	msgbuf[0] = 'w';
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
--- 1263,1281 ----
  	/*
  	 * OK to read and send the slice.
  	 */
! 	WalSndOutBuffer[0] = 'd';
! 	WalSndOutBuffer[5] = 'w';
! 	WalSndOutHead = 0;
! 	WalSndOutTail = 6 + sizeof(WalDataMessageHeader) + nbytes;
! 
! 	n32 = htonl((uint32) WalSndOutTail - 1);
! 	memcpy(WalSndOutBuffer + 1, &n32, 4);
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(WalSndOutBuffer + 6 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
***************
*** 1088,1100 **** XLogSend(char *msgbuf, bool *caughtup)
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
  
! 	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
  
  	/* Flush pending output to the client */
! 	if (pq_flush())
  		return false;
  
  	sentPtr = endptr;
  
--- 1285,1318 ----
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(WalSndOutBuffer + 6, &msghdr, sizeof(WalDataMessageHeader));
! 
! 	res = pq_putbytes_if_writable(WalSndOutBuffer, WalSndOutTail);
! 	if (res == EOF)
! 		return false;
  
! 	WalSndOutHead = res;
! 	if (WalSndOutHead != WalSndOutTail)
! 	{
! 		*caughtup = false;
! 		*pending = true;
! 		return true;
! 	}
  
  	/* Flush pending output to the client */
! 	res = pq_flush_if_writable();
! 	if (res == EOF)
  		return false;
+ 	if (res == 0)
+ 	{
+ 		*caughtup = false;
+ 		*pending = true;
+ 		return true;
+ 	}
+ 
+ updt:
+ 	WalSndOutHead = WalSndOutTail = 0;
+ 	*pending = false;
  
  	sentPtr = endptr;
  
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1847,1852 **** static struct config_int ConfigureNamesInt[] =
--- 1847,1862 ----
  	},
  
  	{
+ 		{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
+ 			gettext_noop("Sets the maximum time to wait for WAL replication."),
+ 			NULL,
+ 			GUC_UNIT_MS
+ 		},
+ 		&replication_timeout,
+ 		0, 0, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
  			gettext_noop("Sets the delay in microseconds between transaction commit and "
  						 "flushing WAL to disk."),
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 191,196 ****
--- 191,197 ----
  #wal_sender_delay = 1s		# walsender cycle time, 1-10000 milliseconds
  #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
  #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+ #replication_timeout = 0 # in milliseconds, 0 is disabled
  
  # - Standby Servers -
  
*** a/src/include/libpq/libpq.h
--- b/src/include/libpq/libpq.h
***************
*** 59,65 **** extern int	pq_getbyte(void);
--- 59,67 ----
  extern int	pq_peekbyte(void);
  extern int	pq_getbyte_if_available(unsigned char *c);
  extern int	pq_putbytes(const char *s, size_t len);
+ extern int	pq_putbytes_if_writable(const char *s, size_t len);
  extern int	pq_flush(void);
+ extern int	pq_flush_if_writable(void);
  extern int	pq_putmessage(char msgtype, const char *s, size_t len);
  extern void pq_startcopyout(void);
  extern void pq_endcopyout(bool errorAbort);
*** a/src/include/replication/walprotocol.h
--- b/src/include/replication/walprotocol.h
***************
*** 81,86 **** typedef struct
--- 81,104 ----
  } StandbyHSFeedbackMessage;
  
  /*
+  * GUC parameter change report from standby (message type 'g').  This is wrapped
+  * within a CopyData message at the FE/BE protocol level.
+  *
+  * Note that the data length is not specified here.
+  */
+ typedef struct
+ {
+ 	/*
+ 	 * Only change of important parameters for streaming replication needs
+ 	 * to be reported.
+ 	 */
+ 	int			wal_receiver_status_interval;
+ 
+ 	/* Sender's system clock at the time of transmission */
+ 	TimestampTz sendTime;
+ } StandbyGUCChangeMessage;
+ 
+ /*
   * Maximum data payload in a WAL data message.	Must be >= XLOG_BLCKSZ.
   *
   * We don't have a good idea of what a good value would be; there's some
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 70,75 **** extern volatile sig_atomic_t walsender_ready_to_stop;
--- 70,76 ----
  /* user-settable parameters */
  extern int	WalSndDelay;
  extern int	max_wal_senders;
+ extern int	replication_timeout;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
*** a/src/include/storage/latch.h
--- b/src/include/storage/latch.h
***************
*** 40,46 **** extern void OwnLatch(volatile Latch *latch);
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
--- 40,46 ----
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  bool forRead, bool forWrite, long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
#22Robert Haas
robertmhaas@gmail.com
In reply to: Fujii Masao (#21)
Re: Replication server timeout patch

On Mon, Feb 28, 2011 at 8:08 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sun, Feb 27, 2011 at 11:52 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

There are two things that I think are pretty clear.  If the receiver
has wal_receiver_status_interval=0, then we should ignore
replication_timeout for that connection.

The patch still doesn't check that wal_receiver_status_interval
is set up properly. I'll implement that later.

Done. I attached the updated patch.

Why does internal_flush_if_writable compute bufptr differently from
internal_flush? And shouldn't it be static?

It seems to me that this ought to be refactored so that you don't
duplicate so much code. Maybe static int internal_flush(bool
nonblocking).

I don't think that the while (bufptr < bufend) loop needs to contain
the code to set and clear the nonblocking state. You could do the
whole loop with nonblocking mode turned on and then reenable it just
once at the end. Besides possibly being clearer, that would be more
efficient and leave less room for unexpected failures.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#23Fujii Masao
masao.fujii@gmail.com
In reply to: Robert Haas (#22)
Re: Replication server timeout patch

On Sun, Mar 6, 2011 at 3:23 AM, Robert Haas <robertmhaas@gmail.com> wrote:

On Mon, Feb 28, 2011 at 8:08 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sun, Feb 27, 2011 at 11:52 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

There are two things that I think are pretty clear.  If the receiver
has wal_receiver_status_interval=0, then we should ignore
replication_timeout for that connection.

The patch still doesn't check that wal_receiver_status_interval
is set up properly. I'll implement that later.

Done. I attached the updated patch.

Why does internal_flush_if_writable compute bufptr differently from
internal_flush?  And shouldn't it be static?

It seems to me that this ought to be refactored so that you don't
duplicate so much code.  Maybe static int internal_flush(bool
nonblocking).

I don't think that the while (bufptr < bufend) loop needs to contain
the code to set and clear the nonblocking state.  You could do the
whole loop with nonblocking mode turned on and then reenable it just
once at the end.  Besides possibly being clearer, that would be more
efficient and leave less room for unexpected failures.

All these comments seem to make sense. Will fix. Thanks!

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#24Fujii Masao
masao.fujii@gmail.com
In reply to: Fujii Masao (#23)
1 attachment(s)
Re: Replication server timeout patch

On Sun, Mar 6, 2011 at 5:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Why does internal_flush_if_writable compute bufptr differently from
internal_flush?  And shouldn't it be static?

It seems to me that this ought to be refactored so that you don't
duplicate so much code.  Maybe static int internal_flush(bool
nonblocking).

I don't think that the while (bufptr < bufend) loop needs to contain
the code to set and clear the nonblocking state.  You could do the
whole loop with nonblocking mode turned on and then reenable it just
once at the end.  Besides possibly being clearer, that would be more
efficient and leave less room for unexpected failures.

All these comments seem to make sense. Will fix. Thanks!

Done. I attached the updated patch.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Attachments:

replication_timeout_v4.patchtext/x-diff; charset=US-ASCII; name=replication_timeout_v4.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2015,2020 **** SET ENABLE_SEQSCAN TO OFF;
--- 2015,2044 ----
         </para>
        </listitem>
       </varlistentry>
+ 
+      <varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
+       <term><varname>replication_timeout</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Specifies the maximum time, in milliseconds, to wait for the reply
+         from the standby before terminating replication.  This is useful for
+         the primary server to detect the standby crash or network outage.
+         A value of zero (the default) turns this off.  This parameter can
+         only be set in the <filename>postgresql.conf</> file or on the server
+         command line.
+        </para>
+        <para>
+         To make the timeout work properly, <xref linkend="guc-wal-receiver-status-interval">
+         must be enabled on the standby, and its value must be less than the
+         value of <varname>replication_timeout</>.
+         If <varname>wal_receiver_status_interval</> is zero on the standby,
+         replication timeout is disabled for that connection.
+        </para>
+       </listitem>
+      </varlistentry>
       </variablelist>
      </sect2>
  
***************
*** 2125,2130 **** SET ENABLE_SEQSCAN TO OFF;
--- 2149,2159 ----
         the <filename>postgresql.conf</> file or on the server command line.
         The default value is 10 seconds.
        </para>
+       <para>
+        When <xref linkend="guc-replication-timeout"> is enabled on the primary,
+        <varname>wal_receiver_status_interval</> must be enabled, and its value
+        must be less than the value of <varname>replication_timeout</>.
+       </para>
        </listitem>
       </varlistentry>
  
*** a/doc/src/sgml/protocol.sgml
--- b/doc/src/sgml/protocol.sgml
***************
*** 1480,1485 **** The commands accepted in walsender mode are:
--- 1480,1533 ----
        <variablelist>
        <varlistentry>
        <term>
+           Standby parameter change (F)
+       </term>
+       <listitem>
+       <para>
+       <variablelist>
+       <varlistentry>
+       <term>
+           Byte1('g')
+       </term>
+       <listitem>
+       <para>
+           Identifies the message as a receiver parameter change.
+       </para>
+       </listitem>
+       </varlistentry>
+       <varlistentry>
+       <term>
+           Int32
+       </term>
+       <listitem>
+       <para>
+           The current value of wal_receiver_status_interval
+           in the standby.
+       </para>
+       </listitem>
+       </varlistentry>
+       <varlistentry>
+       <term>
+           Byte8
+       </term>
+       <listitem>
+       <para>
+           The server's system clock at the time of transmission,
+           given in TimestampTz format.
+       </para>
+       </listitem>
+       </varlistentry>
+       </variablelist>
+       </para>
+       </listitem>
+       </varlistentry>
+       </variablelist>
+      </para>
+ 
+      <para>
+       <variablelist>
+       <varlistentry>
+       <term>
            Standby status update (F)
        </term>
        <listitem>
*** a/src/backend/libpq/pqcomm.c
--- b/src/backend/libpq/pqcomm.c
***************
*** 56,61 ****
--- 56,64 ----
   *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
   *		pq_flush		- flush pending output
   *		pq_getbyte_if_available - get a byte if available without blocking
+  *		pq_putbytes_if_writable	- send bytes to connection if writable without blocking
+  *		pq_flush_if_writable	- flush pending output if writable without blocking
+  *		pq_set_nonblocking	- set socket blocking/non-blocking
   *
   * message-level I/O (and old-style-COPY-OUT cruft):
   *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
***************
*** 112,117 **** static char sock_path[MAXPGPATH];
--- 115,121 ----
  
  static char PqSendBuffer[PQ_BUFFER_SIZE];
  static int	PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+ static int	PqSendStart;		/* Next index to send a byte in PqSendBuffer */
  
  static char PqRecvBuffer[PQ_BUFFER_SIZE];
  static int	PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
***************
*** 126,133 **** static bool DoingCopyOut;
  
  /* Internal functions */
  static void pq_close(int code, Datum arg);
! static int	internal_putbytes(const char *s, size_t len);
! static int	internal_flush(void);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
--- 130,138 ----
  
  /* Internal functions */
  static void pq_close(int code, Datum arg);
! static int	internal_putbytes(const char *s, size_t len, bool nonblocking);
! static int	internal_flush(bool nonblocking);
! static void pq_set_nonblocking(bool nonblocking, int emode);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
***************
*** 142,148 **** static int	Setup_AF_UNIX(void);
  void
  pq_init(void)
  {
! 	PqSendPointer = PqRecvPointer = PqRecvLength = 0;
  	PqCommBusy = false;
  	DoingCopyOut = false;
  	on_proc_exit(pq_close, 0);
--- 147,153 ----
  void
  pq_init(void)
  {
! 	PqSendPointer = PqSendStart = PqRecvPointer = PqRecvLength = 0;
  	PqCommBusy = false;
  	DoingCopyOut = false;
  	on_proc_exit(pq_close, 0);
***************
*** 846,859 **** pq_getbyte_if_available(unsigned char *c)
  	}
  
  	/* Temporarily put the socket into non-blocking mode */
! #ifdef WIN32
! 	pgwin32_noblock = 1;
! #else
! 	if (!pg_set_noblock(MyProcPort->sock))
! 		ereport(ERROR,
! 				(errmsg("could not set socket to non-blocking mode: %m")));
! #endif
! 	MyProcPort->noblock = true;
  	PG_TRY();
  	{
  		r = secure_read(MyProcPort, c, 1);
--- 851,857 ----
  	}
  
  	/* Temporarily put the socket into non-blocking mode */
! 	pq_set_nonblocking(true, ERROR);
  	PG_TRY();
  	{
  		r = secure_read(MyProcPort, c, 1);
***************
*** 892,916 **** pq_getbyte_if_available(unsigned char *c)
  		 * The rest of the backend code assumes the socket is in blocking
  		 * mode, so treat failure as FATAL.
  		 */
! #ifdef WIN32
! 		pgwin32_noblock = 0;
! #else
! 		if (!pg_set_block(MyProcPort->sock))
! 			ereport(FATAL,
! 					(errmsg("could not set socket to blocking mode: %m")));
! #endif
! 		MyProcPort->noblock = false;
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
! #ifdef WIN32
! 	pgwin32_noblock = 0;
! #else
! 	if (!pg_set_block(MyProcPort->sock))
! 		ereport(FATAL,
! 				(errmsg("could not set socket to blocking mode: %m")));
! #endif
! 	MyProcPort->noblock = false;
  
  	return r;
  }
--- 890,900 ----
  		 * The rest of the backend code assumes the socket is in blocking
  		 * mode, so treat failure as FATAL.
  		 */
! 		pq_set_nonblocking(false, FATAL);
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
! 	pq_set_nonblocking(false, FATAL);
  
  	return r;
  }
***************
*** 1125,1146 **** pq_putbytes(const char *s, size_t len)
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_putbytes(s, len);
  	PqCommBusy = false;
  	return res;
  }
  
  static int
! internal_putbytes(const char *s, size_t len)
  {
  	size_t		amount;
  
  	while (len > 0)
  	{
  		/* If buffer is full, then flush it out */
  		if (PqSendPointer >= PQ_BUFFER_SIZE)
! 			if (internal_flush())
  				return EOF;
  		amount = PQ_BUFFER_SIZE - PqSendPointer;
  		if (amount > len)
  			amount = len;
--- 1109,1169 ----
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_putbytes(s, len, false);
! 	PqCommBusy = false;
! 	return (res == EOF) ? EOF : 0;
! }
! 
! /* --------------------------------
!  *		pq_putbytes_if_writable - send bytes to connection (not flushed
!  *			until pq_flush), if writable
!  *
!  * Returns the number of bytes written without blocking, or EOF if trouble.
!  * --------------------------------
!  */
! int
! pq_putbytes_if_writable(const char *s, size_t len)
! {
! 	int			res;
! 
! 	/* Should not be called by old-style COPY OUT */
! 	Assert(!DoingCopyOut);
! 	/* No-op if reentrant call */
! 	if (PqCommBusy)
! 		return 0;
! 	PqCommBusy = true;
! 	res = internal_putbytes(s, len, true);
  	PqCommBusy = false;
  	return res;
  }
  
+ /* --------------------------------
+  *		internal_putbytes - send bytes to connection (not flushed
+  *			until pq_flush)
+  *
+  * Returns the number of bytes written (can be less than len only if
+  * nonblocking is true), or EOF if trouble.
+  * --------------------------------
+  */
  static int
! internal_putbytes(const char *s, size_t len, bool nonblocking)
  {
  	size_t		amount;
+ 	size_t		nwritten = 0;
  
  	while (len > 0)
  	{
  		/* If buffer is full, then flush it out */
  		if (PqSendPointer >= PQ_BUFFER_SIZE)
! 		{
! 			int		r;
! 
! 			r = internal_flush(nonblocking);
! 			if (r == 0)	/* Only possible if nonblocking is true */
! 				break;
! 			if (r == EOF)
  				return EOF;
+ 		}
  		amount = PQ_BUFFER_SIZE - PqSendPointer;
  		if (amount > len)
  			amount = len;
***************
*** 1148,1155 **** internal_putbytes(const char *s, size_t len)
  		PqSendPointer += amount;
  		s += amount;
  		len -= amount;
  	}
! 	return 0;
  }
  
  /* --------------------------------
--- 1171,1179 ----
  		PqSendPointer += amount;
  		s += amount;
  		len -= amount;
+ 		nwritten += amount;
  	}
! 	return (int) nwritten;
  }
  
  /* --------------------------------
***************
*** 1167,1227 **** pq_flush(void)
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_flush();
  	PqCommBusy = false;
  	return res;
  }
  
  static int
! internal_flush(void)
  {
  	static int	last_reported_send_errno = 0;
  
! 	char	   *bufptr = PqSendBuffer;
  	char	   *bufend = PqSendBuffer + PqSendPointer;
  
! 	while (bufptr < bufend)
  	{
! 		int			r;
! 
! 		r = secure_write(MyProcPort, bufptr, bufend - bufptr);
! 
! 		if (r <= 0)
  		{
! 			if (errno == EINTR)
! 				continue;		/* Ok if we were interrupted */
  
! 			/*
! 			 * Careful: an ereport() that tries to write to the client would
! 			 * cause recursion to here, leading to stack overflow and core
! 			 * dump!  This message must go *only* to the postmaster log.
! 			 *
! 			 * If a client disconnects while we're in the midst of output, we
! 			 * might write quite a bit of data before we get to a safe query
! 			 * abort point.  So, suppress duplicate log messages.
! 			 */
! 			if (errno != last_reported_send_errno)
  			{
! 				last_reported_send_errno = errno;
! 				ereport(COMMERROR,
! 						(errcode_for_socket_access(),
! 						 errmsg("could not send data to client: %m")));
  			}
  
! 			/*
! 			 * We drop the buffered data anyway so that processing can
! 			 * continue, even though we'll probably quit soon.
! 			 */
! 			PqSendPointer = 0;
! 			return EOF;
  		}
- 
- 		last_reported_send_errno = 0;	/* reset after any successful send */
- 		bufptr += r;
  	}
  
! 	PqSendPointer = 0;
! 	return 0;
  }
  
  
--- 1191,1352 ----
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_flush(false);
! 	PqCommBusy = false;
! 	return (res == EOF) ? EOF : 0;
! }
! 
! /* --------------------------------
!  *		pq_flush_if_writable - flush pending output if writable
!  *
!  * Returns 1 if OK, 0 if pending output cannot be written without blocking,
!  * or EOF if trouble.
!  * --------------------------------
!  */
! int
! pq_flush_if_writable(void)
! {
! 	int			res;
! 
! 	/* No-op if reentrant call */
! 	if (PqCommBusy)
! 		return 0;
! 	PqCommBusy = true;
! 	res = internal_flush(true);
  	PqCommBusy = false;
  	return res;
  }
  
+ /* --------------------------------
+  *		internal_flush - flush pending output
+  *
+  * Returns 1 if OK, 0 if pending output cannot be written without blocking
+  * (only possible if nonblocking is true), or EOF if trouble.
+  * --------------------------------
+  */
  static int
! internal_flush(bool nonblocking)
  {
  	static int	last_reported_send_errno = 0;
+ 	int		r;
  
! 	char	   *bufptr = PqSendBuffer + PqSendStart;
  	char	   *bufend = PqSendBuffer + PqSendPointer;
  
! 	/* Temporarily put the socket into non-blocking mode */
! 	if (nonblocking)
! 		pq_set_nonblocking(true, ERROR);
! 	PG_TRY();
  	{
! 		while (bufptr < bufend)
  		{
! 			r = secure_write(MyProcPort, bufptr, bufend - bufptr);
  
! 			if (r <= 0)
  			{
! 				/* Ok if we were interrupted in blocking mode */
! 				if (!nonblocking && errno == EINTR)
! 					continue;
! 
! 				if (nonblocking)
! 				{
! 					if (r == 0)
! 						r = EOF;	/* EOF detected */
! 					else if (errno == EAGAIN ||
! 							 errno == EWOULDBLOCK ||
! 							 errno == EINTR)
! 					{
! 						/*
! 						 * Ok if no data writable without blocking or
! 						 * interrupted (though EINTR really shouldn't
! 						 * happen with a non-blocking socket). Report
! 						 * other errors.
! 						 */
! 						r = 0;
! 					}
! 					break;
! 				}
! 
! 				/*
! 				 * Careful: an ereport() that tries to write to the
! 				 * client would cause recursion to here, leading to
! 				 * stack overflow and core dump!  This message must
! 				 * go *only* to the postmaster log.
! 				 *
! 				 * If a client disconnects while we're in the midst
! 				 * of output, we might write quite a bit of data before
! 				 * we get to a safe query abort point.  So, suppress
! 				 * duplicate log messages.
! 				 */
! 				if (errno != last_reported_send_errno)
! 				{
! 					last_reported_send_errno = errno;
! 					ereport(COMMERROR,
! 							(errcode_for_socket_access(),
! 							 errmsg("could not send data to client: %m")));
! 				}
! 
! 				/*
! 				 * We drop the buffered data anyway so that processing can
! 				 * continue, even though we'll probably quit soon.
! 				 */
! 				PqSendStart = PqSendPointer = 0;
! 				r = EOF;
! 				break;
  			}
  
! 			last_reported_send_errno = 0;	/* reset after any successful send */
! 			bufptr += r;
! 			PqSendStart += r;
  		}
  	}
+ 	PG_CATCH();
+ 	{
+ 		/*
+ 		 * The rest of the backend code assumes the socket is in blocking
+ 		 * mode, so treat failure as FATAL.
+ 		 */
+ 		if (nonblocking)
+ 			pq_set_nonblocking(false, FATAL);
+ 	}
+ 	PG_END_TRY();
+ 	if (nonblocking)
+ 		pq_set_nonblocking(false, FATAL);
  
! 	if (r == 0 || r == EOF)
! 		return r;
! 
! 	PqSendStart = PqSendPointer = 0;
! 	return 1;
! }
! 
! /* --------------------------------
!  *		pq_set_nonblocking - set socket blocking/non-blocking
!  *
!  * Sets the socket non-blocking if nonblocking is TRUE, or sets it
!  * blocking otherwise.
!  * --------------------------------
!  */
! static
! void pq_set_nonblocking(bool nonblocking, int emode)
! {
! #ifdef WIN32
! 	pgwin32_noblock = nonblocking ? 1 : 0;
! #else
! 	if (nonblocking)
! 	{
! 		if (!pg_set_noblock(MyProcPort->sock))
! 			ereport(emode,
! 					(errmsg("could not set socket to non-blocking mode: %m")));
! 	}
! 	else
! 	{
! 		if (!pg_set_block(MyProcPort->sock))
! 			ereport(emode,
! 					(errmsg("could not set socket to blocking mode: %m")));
! 	}
! #endif
! 	MyProcPort->noblock = nonblocking;
  }
  
  
***************
*** 1265,1281 **** pq_putmessage(char msgtype, const char *s, size_t len)
  		return 0;
  	PqCommBusy = true;
  	if (msgtype)
! 		if (internal_putbytes(&msgtype, 1))
  			goto fail;
  	if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 3)
  	{
  		uint32		n32;
  
  		n32 = htonl((uint32) (len + 4));
! 		if (internal_putbytes((char *) &n32, 4))
  			goto fail;
  	}
! 	if (internal_putbytes(s, len))
  		goto fail;
  	PqCommBusy = false;
  	return 0;
--- 1390,1406 ----
  		return 0;
  	PqCommBusy = true;
  	if (msgtype)
! 		if (internal_putbytes(&msgtype, 1, false) == EOF)
  			goto fail;
  	if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 3)
  	{
  		uint32		n32;
  
  		n32 = htonl((uint32) (len + 4));
! 		if (internal_putbytes((char *) &n32, 4, false) == EOF)
  			goto fail;
  	}
! 	if (internal_putbytes(s, len, false) == EOF)
  		goto fail;
  	PqCommBusy = false;
  	return 0;
*** a/src/backend/port/unix_latch.c
--- b/src/backend/port/unix_latch.c
***************
*** 193,211 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
!  * was set, or 2 if the scoket became readable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
  	int			rc;
  	int			result = 0;
  
--- 193,214 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading or writing. Returns 0 if timeout was reached,
!  * 1 if the latch was set, 2 if the scoket became readable, or 3 if
!  * the socket became writable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
+ 	fd_set		output_mask;
  	int			rc;
  	int			result = 0;
  
***************
*** 241,254 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
--- 244,265 ----
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET && forRead)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		FD_ZERO(&output_mask);
! 		if (sock != PGINVALID_SOCKET && forWrite)
! 		{
! 			FD_SET(sock, &output_mask);
! 			if (sock > hifd)
! 				hifd = sock;
! 		}
! 
! 		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
***************
*** 263,273 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
  	}
  	waiting = false;
  
--- 274,291 ----
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && forRead &&
! 			FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
+ 		if (sock != PGINVALID_SOCKET && forWrite &&
+ 			FD_ISSET(sock, &output_mask))
+ 		{
+ 			result = 3;
+ 			break;		/* data writable in socket */
+ 		}
  	}
  	waiting = false;
  
*** a/src/backend/port/win32/socket.c
--- b/src/backend/port/win32/socket.c
***************
*** 14,20 ****
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() should operate in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
--- 14,21 ----
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() and pgwin32_send() should operate
!  * in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
***************
*** 399,404 **** pgwin32_send(SOCKET s, char *buf, int len, int flags)
--- 400,415 ----
  			return -1;
  		}
  
+ 		if (pgwin32_noblock)
+ 		{
+ 			/*
+ 			 * No data sent, and we are in "emulated non-blocking mode", so
+ 			 * return indicating that we'd block if we were to continue.
+ 			 */
+ 			errno = EWOULDBLOCK;
+ 			return -1;
+ 		}
+ 
  		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
  
  		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
*** a/src/backend/port/win32_latch.c
--- b/src/backend/port/win32_latch.c
***************
*** 85,95 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
--- 85,96 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
***************
*** 103,112 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET)
  	{
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, FD_READ);
  		events[numevents++] = sockevent;
  	}
  
--- 104,120 ----
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
+ 		int		flags = 0;
+ 
+ 		if (forRead)
+ 			flags |= FD_READ;
+ 		if (forWrite)
+ 			flags |= FD_WRITE;
+ 
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, flags);
  		events[numevents++] = sockevent;
  	}
  
***************
*** 139,146 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
  			Assert(sock != PGINVALID_SOCKET);
! 			result = 2;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
--- 147,165 ----
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
+ 			WSANETWORKEVENTS resEvents;
+ 
  			Assert(sock != PGINVALID_SOCKET);
! 
! 			ZeroMemory(&resEvents, sizeof(resEvents));
! 			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
! 				ereport(FATAL,
! 						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
! 
! 			if (forRead && resEvents.lNetworkEvents & FD_READ)
! 				result = 2;
! 			if (forWrite && resEvents.lNetworkEvents & FD_WRITE)
! 				result = 3;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
***************
*** 148,154 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	}
  
  	/* Clean up the handle we created for the socket */
! 		if (sock != PGINVALID_SOCKET)
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
--- 167,173 ----
  	}
  
  	/* Clean up the handle we created for the socket */
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
*** a/src/backend/replication/walreceiver.c
--- b/src/backend/replication/walreceiver.c
***************
*** 125,130 **** static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr);
--- 125,131 ----
  static void XLogWalRcvFlush(bool dying);
  static void XLogWalRcvSendReply(void);
  static void XLogWalRcvSendHSFeedback(void);
+ static void XLogWalRcvSendGUCChange(void);
  
  /* Signal handlers */
  static void WalRcvSigHupHandler(SIGNAL_ARGS);
***************
*** 276,281 **** WalReceiverMain(void)
--- 277,288 ----
  	walrcv_connect(conninfo, startpoint);
  	DisableWalRcvImmediateExit();
  
+ 	/*
+ 	 * Report the important parameters for streaming replication to
+ 	 * the primary.
+ 	 */
+ 	XLogWalRcvSendGUCChange();
+ 
  	/* Loop until end-of-streaming or error */
  	for (;;)
  	{
***************
*** 303,310 **** WalReceiverMain(void)
--- 310,325 ----
  
  		if (got_SIGHUP)
  		{
+ 			int	save_wal_receiver_status_interval =
+ 				wal_receiver_status_interval;
+ 
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/* If any of important parameters have changed, report them */
+ 			if (save_wal_receiver_status_interval !=
+ 					wal_receiver_status_interval)
+ 				XLogWalRcvSendGUCChange();
  		}
  
  		/* Wait a while for data to arrive */
***************
*** 705,707 **** XLogWalRcvSendHSFeedback(void)
--- 720,749 ----
  	memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage));
  	walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1);
  }
+ 
+ /*
+  * Send parameter change message to primary, plus the current time.
+  */
+ static void
+ XLogWalRcvSendGUCChange(void)
+ {
+ 	char			buf[sizeof(StandbyGUCChangeMessage) + 1];
+ 	TimestampTz		now;
+ 	StandbyGUCChangeMessage	guc_change_message;
+ 
+ 	/* Get current timestamp. */
+ 	now = GetCurrentTimestamp();
+ 
+ 	/* Construct a new message */
+ 	guc_change_message.wal_receiver_status_interval =
+ 		wal_receiver_status_interval;
+ 	guc_change_message.sendTime = now;
+ 
+ 	elog(DEBUG2, "sending parameter change wal_receiver_status_interval %d",
+ 			 wal_receiver_status_interval);
+ 
+ 	/* Prepend with the message type and send it. */
+ 	buf[0] = 'g';
+ 	memcpy(&buf[1], &guc_change_message, sizeof(StandbyGUCChangeMessage));
+ 	walrcv_send(buf, sizeof(StandbyGUCChangeMessage) + 1);
+ }
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 74,79 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 74,91 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 1000;	/* max sleep time between some actions */
+ int			replication_timeout = 0;	/* maximum time to send one WAL data message */
+ 
+ /*
+  * Buffer for WAL sending
+  *
+  * WalSndOutBuffer is a work area in which the output message is constructed.
+  * It's used in just so we can avoid re-palloc'ing the buffer on each cycle.
+  * It must be of size 6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
+  */
+ static char	   *WalSndOutBuffer;
+ static int		WalSndOutHead;		/* head of pending output */
+ static int		WalSndOutTail;		/* tail of pending output */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 95,100 **** static XLogRecPtr sentPtr = {0, 0};
--- 107,123 ----
   */
  static StringInfoData reply_message;
  
+ /*
+  * Timestamp of the last receipt of the reply from the standby.
+  */
+ static TimestampTz last_reply_timestamp;
+ 
+ /*
+  * The value of wal_receiver_status_interval on the standby.
+  * If this is zero, we disable replication timeout.
+  */
+ static int	standby_wal_receiver_status_interval = 10;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 113,124 **** static int	WalSndLoop(void);
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(char *msgbuf, bool *caughtup);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
  static void ProcessStandbyReplyMessage(void);
  static void ProcessStandbyHSFeedbackMessage(void);
  static void ProcessRepliesIfAny(void);
  
  
--- 136,149 ----
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(bool *caughtup, bool *pending);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
  static void ProcessStandbyReplyMessage(void);
  static void ProcessStandbyHSFeedbackMessage(void);
+ static void ProcessStandbyGUCChangeMessage(void);
+ static void ValidateReplicationTimeout(bool standby);
  static void ProcessRepliesIfAny(void);
  
  
***************
*** 214,219 **** WalSndHandshake(void)
--- 239,250 ----
  		{
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/*
+ 			 * Don't need to verify replication_timeout here because we
+ 			 * don't receive any parameter change message until we enter
+ 			 * streaming mode.
+ 			 */
  		}
  
  		if (firstchar != EOF)
***************
*** 467,472 **** ProcessRepliesIfAny(void)
--- 498,504 ----
  {
  	unsigned char firstchar;
  	int			r;
+ 	int		received = false;
  
  	for (;;)
  	{
***************
*** 479,487 **** ProcessRepliesIfAny(void)
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)
  		{
! 			/* no data available without blocking */
  			return;
  		}
  
--- 511,524 ----
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)	/* no data available without blocking */
  		{
! 			/*
! 			 * Save the last reply timestamp if we've received at least
! 			 * one reply.
! 			 */
! 			if (received)
! 				last_reply_timestamp = GetCurrentTimestamp();
  			return;
  		}
  
***************
*** 493,498 **** ProcessRepliesIfAny(void)
--- 530,536 ----
  				 */
  			case 'd':
  				ProcessStandbyMessage();
+ 				received = true;
  				break;
  
  				/*
***************
*** 511,517 **** ProcessRepliesIfAny(void)
  }
  
  /*
!  * Process a status update message received from standby.
   */
  static void
  ProcessStandbyMessage(void)
--- 549,555 ----
  }
  
  /*
!  * Process a message received from standby.
   */
  static void
  ProcessStandbyMessage(void)
***************
*** 547,552 **** ProcessStandbyMessage(void)
--- 585,594 ----
  			ProcessStandbyHSFeedbackMessage();
  			break;
  
+ 		case 'g':
+ 			ProcessStandbyGUCChangeMessage();
+ 			break;
+ 
  		default:
  			ereport(COMMERROR,
  					(errcode(ERRCODE_PROTOCOL_VIOLATION),
***************
*** 665,683 **** ProcessStandbyHSFeedbackMessage(void)
  	}
  }
  
  /* Main loop of walsender process */
  static int
  WalSndLoop(void)
  {
- 	char	   *output_message;
  	bool		caughtup = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
--- 707,805 ----
  	}
  }
  
+ /*
+  * Parameter change report from standby
+  */
+ static void
+ ProcessStandbyGUCChangeMessage(void)
+ {
+ 	StandbyGUCChangeMessage	reply;
+ 
+ 	pq_copymsgbytes(&reply_message, (char *) &reply, sizeof(StandbyGUCChangeMessage));
+ 
+ 	elog(DEBUG2, "standby's wal_receiver_status_interval %d",
+ 			 reply.wal_receiver_status_interval);
+ 
+ 	/* If we don't use replication timeout, we don't need to do any more */
+ 	if (replication_timeout <= 0)
+ 		return;
+ 
+ 	/*
+ 	 * If wal_receiver_status_interval is zero in the standby,
+ 	 * we ignore replication timeout for that connection.
+ 	 */
+ 	if (standby_wal_receiver_status_interval > 0 &&
+ 			reply.wal_receiver_status_interval <= 0)
+ 	{
+ 		ereport(LOG,
+ 						(errmsg("replication timeout is disabled because wal_receiver_status_interval is zero on the standby")));
+ 	}
+ 	else if (standby_wal_receiver_status_interval <= 0 &&
+ 					 reply.wal_receiver_status_interval > 0)
+ 	{
+ 		ereport(LOG,
+ 						(errmsg("replication timeout is enabled because wal_receiver_status_interval is greater than zero on the standby")));
+ 	}
+ 
+ 	standby_wal_receiver_status_interval = reply.wal_receiver_status_interval;
+ 
+ 	/* Verify whether replication_timeout is large enough */
+ 	ValidateReplicationTimeout(true);
+ }
+ 
+ /*
+  * Verify whether replication_timeout is large enough for the timeout.
+  *
+  * This is called when either replication_timeout is changed on the
+  * master or wal_receiver_status_interval is changed on the standby.
+  * In the former case, 'standby' must be set to FALSE, otherwise TRUE.
+  */
+ static void
+ ValidateReplicationTimeout(bool standby)
+ {
+ 	static bool	skip_validation = true;
+ 
+ 	/*
+ 	 * If we've not received any parameter change message from the standby
+ 	 * yet, we cannot verify replication_timeout properly. In that case,
+ 	 * we postpone the validation until the message has arrived. This can
+ 	 * happen only when replication_timeout is changed by SIGHUP signal
+ 	 * before any parameter change message arrives.
+ 	 */
+ 	if (standby)
+ 		skip_validation = false;
+ 
+ 	if (skip_validation)
+ 		return;
+ 
+ 	/*
+ 	 * Emit WARNING message if replication_timeout in the primary
+ 	 * is less than wal_receiver_status_interval in the standby
+ 	 * because unexpected timeout can happen in that case.
+ 	 */
+ 	if (replication_timeout > 0 &&
+ 			standby_wal_receiver_status_interval > 0 &&
+ 			replication_timeout <= standby_wal_receiver_status_interval * 1000)
+ 		ereport(WARNING,
+ 						(errmsg("replication can be terminated unexpectedly because replication_timeout (%d milliseconds) on the master is less than wal_receiver_status_interval (%d seconds) on the standby",
+ 										replication_timeout, standby_wal_receiver_status_interval),
+ 						 errhint("Either increase replication_timeout on the master, or decrease wal_receiver_status_interval on the standby.")));
+ }
+ 
  /* Main loop of walsender process */
  static int
  WalSndLoop(void)
  {
  	bool		caughtup = false;
+ 	bool		pending = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	WalSndOutBuffer = palloc(6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
! 	WalSndOutHead = WalSndOutTail = 0;
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
***************
*** 685,690 **** WalSndLoop(void)
--- 807,815 ----
  	 */
  	initStringInfo(&reply_message);
  
+ 	/* Initialize the last reply timestamp */
+ 	last_reply_timestamp = GetCurrentTimestamp();
+ 
  	/* Loop forever, unless we get an error */
  	for (;;)
  	{
***************
*** 700,705 **** WalSndLoop(void)
--- 825,833 ----
  		{
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/* Verify whether replication_timeout is large enough */
+ 			ValidateReplicationTimeout(false);
  		}
  
  		/*
***************
*** 708,717 **** WalSndLoop(void)
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup)
  				walsender_shutdown_requested = true;
  		}
  
--- 836,845 ----
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup && !pending)
  				walsender_shutdown_requested = true;
  		}
  
***************
*** 726,735 **** WalSndLoop(void)
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round, nap for the
! 		 * configured time before retrying.
  		 */
! 		if (caughtup)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
--- 854,864 ----
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round or could not
! 		 * flush pending WAL in output buffer because the socket was not
! 		 * writable, nap for the configured time before retrying.
  		 */
! 		if (caughtup || pending)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
***************
*** 740,764 **** WalSndLoop(void)
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  		}
  
--- 869,931 ----
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(&caughtup, &pending))
  				break;
! 			if ((caughtup || pending) && !got_SIGHUP && !walsender_ready_to_stop &&
! 					!walsender_shutdown_requested)
  			{
+ 				TimestampTz	finish_time;
+ 				long		sleeptime;
+ 
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
+ 				/* Reschedule replication timeout */
+ 				if (replication_timeout > 0 &&
+ 						standby_wal_receiver_status_interval > 0)
+ 				{
+ 					long		secs;
+ 					int		usecs;
+ 
+ 					finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
+ 											replication_timeout);
+ 					TimestampDifference(GetCurrentTimestamp(),
+ 								finish_time, &secs, &usecs);
+ 					sleeptime = secs * 1000 + usecs / 1000;
+ 					if (WalSndDelay < sleeptime)
+ 						sleeptime = WalSndDelay;
+ 				}
+ 				else
+ 					sleeptime = WalSndDelay;
+ 
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  true, (WalSndOutTail > 0),
! 								  sleeptime * 1000L);
! 
! 				/* Check for replication timeout */
! 				if (replication_timeout > 0 &&
! 						standby_wal_receiver_status_interval > 0 &&
! 						GetCurrentTimestamp() >= finish_time)
! 				{
! 					/*
! 					 * Since typically expiration of replication timeout means
! 					 * communication problem, we don't send the error message
! 					 * to the standby.
! 					 */
! 					ereport(COMMERROR,
! 							(errmsg("terminating walsender process due to replication timeout")));
! 					break;
! 				}
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  		}
  
***************
*** 986,1009 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
-  * msgbuf is a work area in which the output message is constructed.  It's
-  * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
-  * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
-  *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(char *msgbuf, bool *caughtup)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	XLogRecPtr	endptr;
  	Size		nbytes;
  	WalDataMessageHeader msghdr;
  
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
--- 1153,1200 ----
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
+  * If there is pending WAL in output buffer, *pending is set to true,
+  * otherwise *pending is set to false.
+  *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(bool *caughtup, bool *pending)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	static XLogRecPtr	endptr;
  	Size		nbytes;
+ 	uint32		n32;
+ 	int			res;
  	WalDataMessageHeader msghdr;
  
+ 	/* Attempt to flush pending WAL in output buffer */
+ 	if (*pending)
+ 	{
+ 		if (WalSndOutHead != WalSndOutTail)
+ 		{
+ 			res = pq_putbytes_if_writable(WalSndOutBuffer + WalSndOutHead,
+ 										  WalSndOutTail - WalSndOutHead);
+ 			if (res == EOF)
+ 				return false;
+ 			WalSndOutHead += res;
+ 			if (WalSndOutHead != WalSndOutTail)
+ 				return true;
+ 		}
+ 
+ 		res = pq_flush_if_writable();
+ 		if (res == EOF)
+ 			return false;
+ 		if (res == 0)
+ 			return true;
+ 
+ 		goto updt;
+ 	}
+ 
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
***************
*** 1072,1084 **** XLogSend(char *msgbuf, bool *caughtup)
  	/*
  	 * OK to read and send the slice.
  	 */
! 	msgbuf[0] = 'w';
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
--- 1263,1281 ----
  	/*
  	 * OK to read and send the slice.
  	 */
! 	WalSndOutBuffer[0] = 'd';
! 	WalSndOutBuffer[5] = 'w';
! 	WalSndOutHead = 0;
! 	WalSndOutTail = 6 + sizeof(WalDataMessageHeader) + nbytes;
! 
! 	n32 = htonl((uint32) WalSndOutTail - 1);
! 	memcpy(WalSndOutBuffer + 1, &n32, 4);
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(WalSndOutBuffer + 6 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
***************
*** 1088,1100 **** XLogSend(char *msgbuf, bool *caughtup)
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
  
! 	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
  
  	/* Flush pending output to the client */
! 	if (pq_flush())
  		return false;
  
  	sentPtr = endptr;
  
--- 1285,1318 ----
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(WalSndOutBuffer + 6, &msghdr, sizeof(WalDataMessageHeader));
! 
! 	res = pq_putbytes_if_writable(WalSndOutBuffer, WalSndOutTail);
! 	if (res == EOF)
! 		return false;
  
! 	WalSndOutHead = res;
! 	if (WalSndOutHead != WalSndOutTail)
! 	{
! 		*caughtup = false;
! 		*pending = true;
! 		return true;
! 	}
  
  	/* Flush pending output to the client */
! 	res = pq_flush_if_writable();
! 	if (res == EOF)
  		return false;
+ 	if (res == 0)
+ 	{
+ 		*caughtup = false;
+ 		*pending = true;
+ 		return true;
+ 	}
+ 
+ updt:
+ 	WalSndOutHead = WalSndOutTail = 0;
+ 	*pending = false;
  
  	sentPtr = endptr;
  
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1847,1852 **** static struct config_int ConfigureNamesInt[] =
--- 1847,1862 ----
  	},
  
  	{
+ 		{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
+ 			gettext_noop("Sets the maximum time to wait for WAL replication."),
+ 			NULL,
+ 			GUC_UNIT_MS
+ 		},
+ 		&replication_timeout,
+ 		0, 0, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
  			gettext_noop("Sets the delay in microseconds between transaction commit and "
  						 "flushing WAL to disk."),
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 191,196 ****
--- 191,197 ----
  #wal_sender_delay = 1s		# walsender cycle time, 1-10000 milliseconds
  #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
  #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+ #replication_timeout = 0 # in milliseconds, 0 is disabled
  
  # - Standby Servers -
  
*** a/src/include/libpq/libpq.h
--- b/src/include/libpq/libpq.h
***************
*** 59,65 **** extern int	pq_getbyte(void);
--- 59,67 ----
  extern int	pq_peekbyte(void);
  extern int	pq_getbyte_if_available(unsigned char *c);
  extern int	pq_putbytes(const char *s, size_t len);
+ extern int	pq_putbytes_if_writable(const char *s, size_t len);
  extern int	pq_flush(void);
+ extern int	pq_flush_if_writable(void);
  extern int	pq_putmessage(char msgtype, const char *s, size_t len);
  extern void pq_startcopyout(void);
  extern void pq_endcopyout(bool errorAbort);
*** a/src/include/replication/walprotocol.h
--- b/src/include/replication/walprotocol.h
***************
*** 81,86 **** typedef struct
--- 81,104 ----
  } StandbyHSFeedbackMessage;
  
  /*
+  * GUC parameter change report from standby (message type 'g').  This is wrapped
+  * within a CopyData message at the FE/BE protocol level.
+  *
+  * Note that the data length is not specified here.
+  */
+ typedef struct
+ {
+ 	/*
+ 	 * Only change of important parameters for streaming replication needs
+ 	 * to be reported.
+ 	 */
+ 	int			wal_receiver_status_interval;
+ 
+ 	/* Sender's system clock at the time of transmission */
+ 	TimestampTz sendTime;
+ } StandbyGUCChangeMessage;
+ 
+ /*
   * Maximum data payload in a WAL data message.	Must be >= XLOG_BLCKSZ.
   *
   * We don't have a good idea of what a good value would be; there's some
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 70,75 **** extern volatile sig_atomic_t walsender_ready_to_stop;
--- 70,76 ----
  /* user-settable parameters */
  extern int	WalSndDelay;
  extern int	max_wal_senders;
+ extern int	replication_timeout;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
*** a/src/include/storage/latch.h
--- b/src/include/storage/latch.h
***************
*** 40,46 **** extern void OwnLatch(volatile Latch *latch);
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
--- 40,46 ----
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  bool forRead, bool forWrite, long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
#25Fujii Masao
masao.fujii@gmail.com
In reply to: Fujii Masao (#24)
1 attachment(s)
Re: Replication server timeout patch

On Sun, Mar 6, 2011 at 11:10 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sun, Mar 6, 2011 at 5:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Why does internal_flush_if_writable compute bufptr differently from
internal_flush?  And shouldn't it be static?

It seems to me that this ought to be refactored so that you don't
duplicate so much code.  Maybe static int internal_flush(bool
nonblocking).

I don't think that the while (bufptr < bufend) loop needs to contain
the code to set and clear the nonblocking state.  You could do the
whole loop with nonblocking mode turned on and then reenable it just
once at the end.  Besides possibly being clearer, that would be more
efficient and leave less room for unexpected failures.

All these comments seem to make sense. Will fix. Thanks!

Done. I attached the updated patch.

I rebased the patch against current git master.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Attachments:

replication_timeout_v5.patchapplication/octet-stream; name=replication_timeout_v5.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2015,2020 **** SET ENABLE_SEQSCAN TO OFF;
--- 2015,2044 ----
         </para>
        </listitem>
       </varlistentry>
+ 
+      <varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
+       <term><varname>replication_timeout</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Specifies the maximum time, in milliseconds, to wait for the reply
+         from the standby before terminating replication.  This is useful for
+         the primary server to detect the standby crash or network outage.
+         A value of zero (the default) turns this off.  This parameter can
+         only be set in the <filename>postgresql.conf</> file or on the server
+         command line.
+        </para>
+        <para>
+         To make the timeout work properly, <xref linkend="guc-wal-receiver-status-interval">
+         must be enabled on the standby, and its value must be less than the
+         value of <varname>replication_timeout</>.
+         If <varname>wal_receiver_status_interval</> is zero on the standby,
+         replication timeout is disabled for that connection.
+        </para>
+       </listitem>
+      </varlistentry>
       </variablelist>
      </sect2>
  
***************
*** 2211,2216 **** SET ENABLE_SEQSCAN TO OFF;
--- 2235,2245 ----
         the <filename>postgresql.conf</> file or on the server command line.
         The default value is 10 seconds.
        </para>
+       <para>
+        When <xref linkend="guc-replication-timeout"> is enabled on the primary,
+        <varname>wal_receiver_status_interval</> must be enabled, and its value
+        must be less than the value of <varname>replication_timeout</>.
+       </para>
        </listitem>
       </varlistentry>
  
*** a/doc/src/sgml/protocol.sgml
--- b/doc/src/sgml/protocol.sgml
***************
*** 1480,1485 **** The commands accepted in walsender mode are:
--- 1480,1533 ----
        <variablelist>
        <varlistentry>
        <term>
+           Standby parameter change (F)
+       </term>
+       <listitem>
+       <para>
+       <variablelist>
+       <varlistentry>
+       <term>
+           Byte1('g')
+       </term>
+       <listitem>
+       <para>
+           Identifies the message as a receiver parameter change.
+       </para>
+       </listitem>
+       </varlistentry>
+       <varlistentry>
+       <term>
+           Int32
+       </term>
+       <listitem>
+       <para>
+           The current value of wal_receiver_status_interval
+           in the standby.
+       </para>
+       </listitem>
+       </varlistentry>
+       <varlistentry>
+       <term>
+           Byte8
+       </term>
+       <listitem>
+       <para>
+           The server's system clock at the time of transmission,
+           given in TimestampTz format.
+       </para>
+       </listitem>
+       </varlistentry>
+       </variablelist>
+       </para>
+       </listitem>
+       </varlistentry>
+       </variablelist>
+      </para>
+ 
+      <para>
+       <variablelist>
+       <varlistentry>
+       <term>
            Standby status update (F)
        </term>
        <listitem>
*** a/src/backend/libpq/pqcomm.c
--- b/src/backend/libpq/pqcomm.c
***************
*** 56,61 ****
--- 56,64 ----
   *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
   *		pq_flush		- flush pending output
   *		pq_getbyte_if_available - get a byte if available without blocking
+  *		pq_putbytes_if_writable	- send bytes to connection if writable without blocking
+  *		pq_flush_if_writable	- flush pending output if writable without blocking
+  *		pq_set_nonblocking	- set socket blocking/non-blocking
   *
   * message-level I/O (and old-style-COPY-OUT cruft):
   *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
***************
*** 112,117 **** static char sock_path[MAXPGPATH];
--- 115,121 ----
  
  static char PqSendBuffer[PQ_BUFFER_SIZE];
  static int	PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+ static int	PqSendStart;		/* Next index to send a byte in PqSendBuffer */
  
  static char PqRecvBuffer[PQ_BUFFER_SIZE];
  static int	PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
***************
*** 127,133 **** static bool DoingCopyOut;
  /* Internal functions */
  static void pq_close(int code, Datum arg);
  static int	internal_putbytes(const char *s, size_t len);
! static int	internal_flush(void);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
--- 131,138 ----
  /* Internal functions */
  static void pq_close(int code, Datum arg);
  static int	internal_putbytes(const char *s, size_t len);
! static int	internal_flush(bool nonblocking);
! static void pq_set_nonblocking(bool nonblocking, int emode);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
***************
*** 142,148 **** static int	Setup_AF_UNIX(void);
  void
  pq_init(void)
  {
! 	PqSendPointer = PqRecvPointer = PqRecvLength = 0;
  	PqCommBusy = false;
  	DoingCopyOut = false;
  	on_proc_exit(pq_close, 0);
--- 147,153 ----
  void
  pq_init(void)
  {
! 	PqSendPointer = PqSendStart = PqRecvPointer = PqRecvLength = 0;
  	PqCommBusy = false;
  	DoingCopyOut = false;
  	on_proc_exit(pq_close, 0);
***************
*** 846,859 **** pq_getbyte_if_available(unsigned char *c)
  	}
  
  	/* Temporarily put the socket into non-blocking mode */
! #ifdef WIN32
! 	pgwin32_noblock = 1;
! #else
! 	if (!pg_set_noblock(MyProcPort->sock))
! 		ereport(ERROR,
! 				(errmsg("could not set socket to non-blocking mode: %m")));
! #endif
! 	MyProcPort->noblock = true;
  	PG_TRY();
  	{
  		r = secure_read(MyProcPort, c, 1);
--- 851,857 ----
  	}
  
  	/* Temporarily put the socket into non-blocking mode */
! 	pq_set_nonblocking(true, ERROR);
  	PG_TRY();
  	{
  		r = secure_read(MyProcPort, c, 1);
***************
*** 892,916 **** pq_getbyte_if_available(unsigned char *c)
  		 * The rest of the backend code assumes the socket is in blocking
  		 * mode, so treat failure as FATAL.
  		 */
! #ifdef WIN32
! 		pgwin32_noblock = 0;
! #else
! 		if (!pg_set_block(MyProcPort->sock))
! 			ereport(FATAL,
! 					(errmsg("could not set socket to blocking mode: %m")));
! #endif
! 		MyProcPort->noblock = false;
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
! #ifdef WIN32
! 	pgwin32_noblock = 0;
! #else
! 	if (!pg_set_block(MyProcPort->sock))
! 		ereport(FATAL,
! 				(errmsg("could not set socket to blocking mode: %m")));
! #endif
! 	MyProcPort->noblock = false;
  
  	return r;
  }
--- 890,900 ----
  		 * The rest of the backend code assumes the socket is in blocking
  		 * mode, so treat failure as FATAL.
  		 */
! 		pq_set_nonblocking(false, FATAL);
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
! 	pq_set_nonblocking(false, FATAL);
  
  	return r;
  }
***************
*** 1139,1145 **** internal_putbytes(const char *s, size_t len)
  	{
  		/* If buffer is full, then flush it out */
  		if (PqSendPointer >= PQ_BUFFER_SIZE)
! 			if (internal_flush())
  				return EOF;
  		amount = PQ_BUFFER_SIZE - PqSendPointer;
  		if (amount > len)
--- 1123,1129 ----
  	{
  		/* If buffer is full, then flush it out */
  		if (PqSendPointer >= PQ_BUFFER_SIZE)
! 			if (internal_flush(false) == EOF)
  				return EOF;
  		amount = PQ_BUFFER_SIZE - PqSendPointer;
  		if (amount > len)
***************
*** 1153,1158 **** internal_putbytes(const char *s, size_t len)
--- 1137,1192 ----
  }
  
  /* --------------------------------
+  *		pq_putbytes_if_writable - send bytes to connection (not flushed
+  *			until pq_flush), if writable
+  *
+  * Returns the number of bytes written without blocking, or EOF if trouble.
+  * --------------------------------
+  */
+ int
+ pq_putbytes_if_writable(const char *s, size_t len)
+ {
+ 	size_t		amount;
+ 	size_t		nwritten = 0;
+ 
+ 	/* Should not be called by old-style COPY OUT */
+ 	Assert(!DoingCopyOut);
+ 	/* No-op if reentrant call */
+ 	if (PqCommBusy)
+ 		return 0;
+ 	PqCommBusy = true;
+ 
+ 	while (len > 0)
+ 	{
+ 		/* If buffer is full, then flush it out */
+ 		if (PqSendPointer >= PQ_BUFFER_SIZE)
+ 		{
+ 			int		r;
+ 
+ 			r = internal_flush(true);
+ 			if (r == 0)
+ 				break;
+ 			if (r == EOF)
+ 			{
+ 				PqCommBusy = false;
+ 				return r;
+ 			}
+ 		}
+ 		amount = PQ_BUFFER_SIZE - PqSendPointer;
+ 		if (amount > len)
+ 			amount = len;
+ 		memcpy(PqSendBuffer + PqSendPointer, s, amount);
+ 		PqSendPointer += amount;
+ 		s += amount;
+ 		len -= amount;
+ 		nwritten += amount;
+ 	}
+ 
+ 	PqCommBusy = false;
+ 	return (int) nwritten;
+ }
+ 
+ /* --------------------------------
   *		pq_flush		- flush pending output
   *
   *		returns 0 if OK, EOF if trouble
***************
*** 1167,1227 **** pq_flush(void)
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_flush();
  	PqCommBusy = false;
! 	return res;
  }
  
  static int
! internal_flush(void)
  {
  	static int	last_reported_send_errno = 0;
  
! 	char	   *bufptr = PqSendBuffer;
  	char	   *bufend = PqSendBuffer + PqSendPointer;
  
! 	while (bufptr < bufend)
  	{
! 		int			r;
! 
! 		r = secure_write(MyProcPort, bufptr, bufend - bufptr);
! 
! 		if (r <= 0)
  		{
! 			if (errno == EINTR)
! 				continue;		/* Ok if we were interrupted */
  
! 			/*
! 			 * Careful: an ereport() that tries to write to the client would
! 			 * cause recursion to here, leading to stack overflow and core
! 			 * dump!  This message must go *only* to the postmaster log.
! 			 *
! 			 * If a client disconnects while we're in the midst of output, we
! 			 * might write quite a bit of data before we get to a safe query
! 			 * abort point.  So, suppress duplicate log messages.
! 			 */
! 			if (errno != last_reported_send_errno)
  			{
! 				last_reported_send_errno = errno;
! 				ereport(COMMERROR,
! 						(errcode_for_socket_access(),
! 						 errmsg("could not send data to client: %m")));
  			}
  
! 			/*
! 			 * We drop the buffered data anyway so that processing can
! 			 * continue, even though we'll probably quit soon.
! 			 */
! 			PqSendPointer = 0;
! 			return EOF;
  		}
- 
- 		last_reported_send_errno = 0;	/* reset after any successful send */
- 		bufptr += r;
  	}
  
! 	PqSendPointer = 0;
! 	return 0;
  }
  
  
--- 1201,1362 ----
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_flush(false);
  	PqCommBusy = false;
! 	return (res == 1) ? 0 : EOF;
  }
  
+ /* --------------------------------
+  *		internal_flush - flush pending output
+  *
+  * Returns 1 if OK, 0 if pending output cannot be written without blocking
+  * (only possible nonblocking is true), or EOF if trouble.
+  * --------------------------------
+  */
  static int
! internal_flush(bool nonblocking)
  {
  	static int	last_reported_send_errno = 0;
+ 	int		r;
  
! 	char	   *bufptr = PqSendBuffer + PqSendStart;
  	char	   *bufend = PqSendBuffer + PqSendPointer;
  
! 	/* Temporarily put the socket into non-blocking mode */
! 	if (nonblocking)
! 		pq_set_nonblocking(true, ERROR);
! 	PG_TRY();
  	{
! 		while (bufptr < bufend)
  		{
! 			r = secure_write(MyProcPort, bufptr, bufend - bufptr);
  
! 			if (r <= 0)
  			{
! 				/* Ok if we were interrupted in blocking mode */
! 				if (!nonblocking && errno == EINTR)
! 					continue;
! 
! 				if (nonblocking)
! 				{
! 					if (r == 0)
! 						r = EOF;	/* EOF detected */
! 					else if (errno == EAGAIN ||
! 							 errno == EWOULDBLOCK ||
! 							 errno == EINTR)
! 					{
! 						/*
! 						 * Ok if no data writable without blocking or
! 						 * interrupted (though EINTR really shouldn't
! 						 * happen with a non-blocking socket). Report
! 						 * other errors.
! 						 */
! 						r = 0;
! 					}
! 					break;
! 				}
! 
! 				/*
! 				 * Careful: an ereport() that tries to write to the
! 				 * client would cause recursion to here, leading to
! 				 * stack overflow and core dump!  This message must
! 				 * go *only* to the postmaster log.
! 				 *
! 				 * If a client disconnects while we're in the midst
! 				 * of output, we might write quite a bit of data before
! 				 * we get to a safe query abort point.  So, suppress
! 				 * duplicate log messages.
! 				 */
! 				if (errno != last_reported_send_errno)
! 				{
! 					last_reported_send_errno = errno;
! 					ereport(COMMERROR,
! 							(errcode_for_socket_access(),
! 							 errmsg("could not send data to client: %m")));
! 				}
! 
! 				/*
! 				 * We drop the buffered data anyway so that processing can
! 				 * continue, even though we'll probably quit soon.
! 				 */
! 				PqSendStart = PqSendPointer = 0;
! 				r = EOF;
! 				break;
  			}
  
! 			last_reported_send_errno = 0;	/* reset after any successful send */
! 			bufptr += r;
! 			PqSendStart += r;
  		}
  	}
+ 	PG_CATCH();
+ 	{
+ 		/*
+ 		 * The rest of the backend code assumes the socket is in blocking
+ 		 * mode, so treat failure as FATAL.
+ 		 */
+ 		if (nonblocking)
+ 			pq_set_nonblocking(false, FATAL);
+ 	}
+ 	PG_END_TRY();
+ 	if (nonblocking)
+ 		pq_set_nonblocking(false, FATAL);
  
! 	if (r == 0 || r == EOF)
! 		return r;
! 
! 	PqSendStart = PqSendPointer = 0;
! 	return 1;
! }
! 
! /* --------------------------------
!  *		pq_flush_if_writable - flush pending output if writable
!  *
!  * Returns 1 if OK, 0 if pending output cannot be written without blocking,
!  * or EOF if trouble.
!  * --------------------------------
!  */
! int
! pq_flush_if_writable(void)
! {
! 	int			res;
! 
! 	/* No-op if reentrant call */
! 	if (PqCommBusy)
! 		return 0;
! 	PqCommBusy = true;
! 	res = internal_flush(true);
! 	PqCommBusy = false;
! 	return res;
! }
! 
! /* --------------------------------
!  *		pq_set_nonblocking - set socket blocking/non-blocking
!  *
!  * Sets the socket non-blocking if nonblocking is TRUE, or sets it
!  * blocking otherwise.
!  * --------------------------------
!  */
! static
! void pq_set_nonblocking(bool nonblocking, int emode)
! {
! #ifdef WIN32
! 	pgwin32_noblock = nonblocking ? 1 : 0;
! #else
! 	if (nonblocking)
! 	{
! 		if (!pg_set_noblock(MyProcPort->sock))
! 			ereport(emode,
! 					(errmsg("could not set socket to non-blocking mode: %m")));
! 	}
! 	else
! 	{
! 		if (!pg_set_block(MyProcPort->sock))
! 			ereport(emode,
! 					(errmsg("could not set socket to blocking mode: %m")));
! 	}
! #endif
! 	MyProcPort->noblock = nonblocking;
  }
  
  
*** a/src/backend/port/unix_latch.c
--- b/src/backend/port/unix_latch.c
***************
*** 193,211 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
!  * was set, or 2 if the scoket became readable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
  	int			rc;
  	int			result = 0;
  
--- 193,214 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading or writing. Returns 0 if timeout was reached,
!  * 1 if the latch was set, 2 if the scoket became readable, or 3 if
!  * the socket became writable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
+ 	fd_set		output_mask;
  	int			rc;
  	int			result = 0;
  
***************
*** 241,254 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
--- 244,265 ----
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET && forRead)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		FD_ZERO(&output_mask);
! 		if (sock != PGINVALID_SOCKET && forWrite)
! 		{
! 			FD_SET(sock, &output_mask);
! 			if (sock > hifd)
! 				hifd = sock;
! 		}
! 
! 		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
***************
*** 263,273 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
  	}
  	waiting = false;
  
--- 274,291 ----
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && forRead &&
! 			FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
  		}
+ 		if (sock != PGINVALID_SOCKET && forWrite &&
+ 			FD_ISSET(sock, &output_mask))
+ 		{
+ 			result = 3;
+ 			break;		/* data writable in socket */
+ 		}
  	}
  	waiting = false;
  
*** a/src/backend/port/win32/socket.c
--- b/src/backend/port/win32/socket.c
***************
*** 14,20 ****
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() should operate in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
--- 14,21 ----
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() and pgwin32_send() should operate
!  * in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
***************
*** 399,404 **** pgwin32_send(SOCKET s, char *buf, int len, int flags)
--- 400,415 ----
  			return -1;
  		}
  
+ 		if (pgwin32_noblock)
+ 		{
+ 			/*
+ 			 * No data sent, and we are in "emulated non-blocking mode", so
+ 			 * return indicating that we'd block if we were to continue.
+ 			 */
+ 			errno = EWOULDBLOCK;
+ 			return -1;
+ 		}
+ 
  		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
  
  		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
*** a/src/backend/port/win32_latch.c
--- b/src/backend/port/win32_latch.c
***************
*** 85,95 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
--- 85,96 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
***************
*** 103,112 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET)
  	{
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, FD_READ);
  		events[numevents++] = sockevent;
  	}
  
--- 104,120 ----
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
+ 		int		flags = 0;
+ 
+ 		if (forRead)
+ 			flags |= FD_READ;
+ 		if (forWrite)
+ 			flags |= FD_WRITE;
+ 
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, flags);
  		events[numevents++] = sockevent;
  	}
  
***************
*** 139,146 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
  			Assert(sock != PGINVALID_SOCKET);
! 			result = 2;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
--- 147,165 ----
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
+ 			WSANETWORKEVENTS resEvents;
+ 
  			Assert(sock != PGINVALID_SOCKET);
! 
! 			ZeroMemory(&resEvents, sizeof(resEvents));
! 			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
! 				ereport(FATAL,
! 						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
! 
! 			if (forRead && resEvents.lNetworkEvents & FD_READ)
! 				result = 2;
! 			if (forWrite && resEvents.lNetworkEvents & FD_WRITE)
! 				result = 3;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
***************
*** 148,154 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	}
  
  	/* Clean up the handle we created for the socket */
! 		if (sock != PGINVALID_SOCKET)
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
--- 167,173 ----
  	}
  
  	/* Clean up the handle we created for the socket */
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
*** a/src/backend/replication/walreceiver.c
--- b/src/backend/replication/walreceiver.c
***************
*** 125,130 **** static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr);
--- 125,131 ----
  static void XLogWalRcvFlush(bool dying);
  static void XLogWalRcvSendReply(void);
  static void XLogWalRcvSendHSFeedback(void);
+ static void XLogWalRcvSendGUCChange(void);
  
  /* Signal handlers */
  static void WalRcvSigHupHandler(SIGNAL_ARGS);
***************
*** 276,281 **** WalReceiverMain(void)
--- 277,288 ----
  	walrcv_connect(conninfo, startpoint);
  	DisableWalRcvImmediateExit();
  
+ 	/*
+ 	 * Report the important parameters for streaming replication to
+ 	 * the primary.
+ 	 */
+ 	XLogWalRcvSendGUCChange();
+ 
  	/* Loop until end-of-streaming or error */
  	for (;;)
  	{
***************
*** 303,310 **** WalReceiverMain(void)
--- 310,325 ----
  
  		if (got_SIGHUP)
  		{
+ 			int	save_wal_receiver_status_interval =
+ 				wal_receiver_status_interval;
+ 
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/* If any of important parameters have changed, report them */
+ 			if (save_wal_receiver_status_interval !=
+ 					wal_receiver_status_interval)
+ 				XLogWalRcvSendGUCChange();
  		}
  
  		/* Wait a while for data to arrive */
***************
*** 704,706 **** XLogWalRcvSendHSFeedback(void)
--- 719,748 ----
  	memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage));
  	walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1);
  }
+ 
+ /*
+  * Send parameter change message to primary, plus the current time.
+  */
+ static void
+ XLogWalRcvSendGUCChange(void)
+ {
+ 	char			buf[sizeof(StandbyGUCChangeMessage) + 1];
+ 	TimestampTz		now;
+ 	StandbyGUCChangeMessage	guc_change_message;
+ 
+ 	/* Get current timestamp. */
+ 	now = GetCurrentTimestamp();
+ 
+ 	/* Construct a new message */
+ 	guc_change_message.wal_receiver_status_interval =
+ 		wal_receiver_status_interval;
+ 	guc_change_message.sendTime = now;
+ 
+ 	elog(DEBUG2, "sending parameter change wal_receiver_status_interval %d",
+ 			 wal_receiver_status_interval);
+ 
+ 	/* Prepend with the message type and send it. */
+ 	buf[0] = 'g';
+ 	memcpy(&buf[1], &guc_change_message, sizeof(StandbyGUCChangeMessage));
+ 	walrcv_send(buf, sizeof(StandbyGUCChangeMessage) + 1);
+ }
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 74,79 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 74,91 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 1000;	/* max sleep time between some actions */
+ int			replication_timeout = 0;	/* maximum time to send one WAL data message */
+ 
+ /*
+  * Buffer for WAL sending
+  *
+  * WalSndOutBuffer is a work area in which the output message is constructed.
+  * It's used in just so we can avoid re-palloc'ing the buffer on each cycle.
+  * It must be of size 6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
+  */
+ static char	   *WalSndOutBuffer;
+ static int		WalSndOutHead;		/* head of pending output */
+ static int		WalSndOutTail;		/* tail of pending output */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 95,100 **** static XLogRecPtr sentPtr = {0, 0};
--- 107,123 ----
   */
  static StringInfoData reply_message;
  
+ /*
+  * Timestamp of the last receipt of the reply from the standby.
+  */
+ static TimestampTz last_reply_timestamp;
+ 
+ /*
+  * The value of wal_receiver_status_interval on the standby.
+  * If this is zero, we disable replication timeout.
+  */
+ static int	standby_wal_receiver_status_interval = 10;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 113,124 **** static int	WalSndLoop(void);
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(char *msgbuf, bool *caughtup);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
  static void ProcessStandbyReplyMessage(void);
  static void ProcessStandbyHSFeedbackMessage(void);
  static void ProcessRepliesIfAny(void);
  
  
--- 136,149 ----
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(bool *caughtup, bool *pending);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
  static void ProcessStandbyReplyMessage(void);
  static void ProcessStandbyHSFeedbackMessage(void);
+ static void ProcessStandbyGUCChangeMessage(void);
+ static void ValidateReplicationTimeout(bool standby);
  static void ProcessRepliesIfAny(void);
  
  
***************
*** 216,221 **** WalSndHandshake(void)
--- 241,252 ----
  		{
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
+ 
+ 			/*
+ 			 * Don't need to verify replication_timeout here because we
+ 			 * don't receive any parameter change message until we enter
+ 			 * streaming mode.
+ 			 */
  		}
  
  		if (firstchar != EOF)
***************
*** 469,474 **** ProcessRepliesIfAny(void)
--- 500,506 ----
  {
  	unsigned char firstchar;
  	int			r;
+ 	int		received = false;
  
  	for (;;)
  	{
***************
*** 481,489 **** ProcessRepliesIfAny(void)
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)
  		{
! 			/* no data available without blocking */
  			return;
  		}
  
--- 513,526 ----
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)	/* no data available without blocking */
  		{
! 			/*
! 			 * Save the last reply timestamp if we've received at least
! 			 * one reply.
! 			 */
! 			if (received)
! 				last_reply_timestamp = GetCurrentTimestamp();
  			return;
  		}
  
***************
*** 495,500 **** ProcessRepliesIfAny(void)
--- 532,538 ----
  				 */
  			case 'd':
  				ProcessStandbyMessage();
+ 				received = true;
  				break;
  
  				/*
***************
*** 513,519 **** ProcessRepliesIfAny(void)
  }
  
  /*
!  * Process a status update message received from standby.
   */
  static void
  ProcessStandbyMessage(void)
--- 551,557 ----
  }
  
  /*
!  * Process a message received from standby.
   */
  static void
  ProcessStandbyMessage(void)
***************
*** 549,554 **** ProcessStandbyMessage(void)
--- 587,596 ----
  			ProcessStandbyHSFeedbackMessage();
  			break;
  
+ 		case 'g':
+ 			ProcessStandbyGUCChangeMessage();
+ 			break;
+ 
  		default:
  			ereport(COMMERROR,
  					(errcode(ERRCODE_PROTOCOL_VIOLATION),
***************
*** 669,687 **** ProcessStandbyHSFeedbackMessage(void)
  	}
  }
  
  /* Main loop of walsender process */
  static int
  WalSndLoop(void)
  {
- 	char	   *output_message;
  	bool		caughtup = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
--- 711,809 ----
  	}
  }
  
+ /*
+  * Parameter change report from standby
+  */
+ static void
+ ProcessStandbyGUCChangeMessage(void)
+ {
+ 	StandbyGUCChangeMessage	reply;
+ 
+ 	pq_copymsgbytes(&reply_message, (char *) &reply, sizeof(StandbyGUCChangeMessage));
+ 
+ 	elog(DEBUG2, "standby's wal_receiver_status_interval %d",
+ 			 reply.wal_receiver_status_interval);
+ 
+ 	/* If we don't use replication timeout, we don't need to do any more */
+ 	if (replication_timeout <= 0)
+ 		return;
+ 
+ 	/*
+ 	 * If wal_receiver_status_interval is zero in the standby,
+ 	 * we ignore replication timeout for that connection.
+ 	 */
+ 	if (standby_wal_receiver_status_interval > 0 &&
+ 			reply.wal_receiver_status_interval <= 0)
+ 	{
+ 		ereport(LOG,
+ 						(errmsg("replication timeout is disabled because wal_receiver_status_interval is zero on the standby")));
+ 	}
+ 	else if (standby_wal_receiver_status_interval <= 0 &&
+ 					 reply.wal_receiver_status_interval > 0)
+ 	{
+ 		ereport(LOG,
+ 						(errmsg("replication timeout is enabled because wal_receiver_status_interval is greater than zero on the standby")));
+ 	}
+ 
+ 	standby_wal_receiver_status_interval = reply.wal_receiver_status_interval;
+ 
+ 	/* Verify whether replication_timeout is large enough */
+ 	ValidateReplicationTimeout(true);
+ }
+ 
+ /*
+  * Verify whether replication_timeout is large enough for the timeout.
+  *
+  * This is called when either replication_timeout is changed on the
+  * master or wal_receiver_status_interval is changed on the standby.
+  * In the former case, 'standby' must be set to FALSE, otherwise TRUE.
+  */
+ static void
+ ValidateReplicationTimeout(bool standby)
+ {
+ 	static bool	skip_validation = true;
+ 
+ 	/*
+ 	 * If we've not received any parameter change message from the standby
+ 	 * yet, we cannot verify replication_timeout properly. In that case,
+ 	 * we postpone the validation until the message has arrived. This can
+ 	 * happen only when replication_timeout is changed by SIGHUP signal
+ 	 * before any parameter change message arrives.
+ 	 */
+ 	if (standby)
+ 		skip_validation = false;
+ 
+ 	if (skip_validation)
+ 		return;
+ 
+ 	/*
+ 	 * Emit WARNING message if replication_timeout in the primary
+ 	 * is less than wal_receiver_status_interval in the standby
+ 	 * because unexpected timeout can happen in that case.
+ 	 */
+ 	if (replication_timeout > 0 &&
+ 			standby_wal_receiver_status_interval > 0 &&
+ 			replication_timeout <= standby_wal_receiver_status_interval * 1000)
+ 		ereport(WARNING,
+ 						(errmsg("replication can be terminated unexpectedly because replication_timeout (%d milliseconds) on the master is less than wal_receiver_status_interval (%d seconds) on the standby",
+ 										replication_timeout, standby_wal_receiver_status_interval),
+ 						 errhint("Either increase replication_timeout on the master, or decrease wal_receiver_status_interval on the standby.")));
+ }
+ 
  /* Main loop of walsender process */
  static int
  WalSndLoop(void)
  {
  	bool		caughtup = false;
+ 	bool		pending = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	WalSndOutBuffer = palloc(6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
! 	WalSndOutHead = WalSndOutTail = 0;
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
***************
*** 689,694 **** WalSndLoop(void)
--- 811,819 ----
  	 */
  	initStringInfo(&reply_message);
  
+ 	/* Initialize the last reply timestamp */
+ 	last_reply_timestamp = GetCurrentTimestamp();
+ 
  	/* Loop forever, unless we get an error */
  	for (;;)
  	{
***************
*** 705,710 **** WalSndLoop(void)
--- 830,838 ----
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  			SyncRepInitConfig();
+ 
+ 			/* Verify whether replication_timeout is large enough */
+ 			ValidateReplicationTimeout(false);
  		}
  
  		/*
***************
*** 713,722 **** WalSndLoop(void)
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup)
  				walsender_shutdown_requested = true;
  		}
  
--- 841,850 ----
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup && !pending)
  				walsender_shutdown_requested = true;
  		}
  
***************
*** 731,740 **** WalSndLoop(void)
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round, nap for the
! 		 * configured time before retrying.
  		 */
! 		if (caughtup)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
--- 859,869 ----
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round or could not
! 		 * flush pending WAL in output buffer because the socket was not
! 		 * writable, nap for the configured time before retrying.
  		 */
! 		if (caughtup || pending)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
***************
*** 745,769 **** WalSndLoop(void)
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  		}
  
--- 874,936 ----
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(&caughtup, &pending))
  				break;
! 			if ((caughtup || pending) && !got_SIGHUP && !walsender_ready_to_stop &&
! 					!walsender_shutdown_requested)
  			{
+ 				TimestampTz	finish_time;
+ 				long		sleeptime;
+ 
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
+ 				/* Reschedule replication timeout */
+ 				if (replication_timeout > 0 &&
+ 						standby_wal_receiver_status_interval > 0)
+ 				{
+ 					long		secs;
+ 					int		usecs;
+ 
+ 					finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
+ 											replication_timeout);
+ 					TimestampDifference(GetCurrentTimestamp(),
+ 								finish_time, &secs, &usecs);
+ 					sleeptime = secs * 1000 + usecs / 1000;
+ 					if (WalSndDelay < sleeptime)
+ 						sleeptime = WalSndDelay;
+ 				}
+ 				else
+ 					sleeptime = WalSndDelay;
+ 
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  true, (WalSndOutTail > 0),
! 								  sleeptime * 1000L);
! 
! 				/* Check for replication timeout */
! 				if (replication_timeout > 0 &&
! 						standby_wal_receiver_status_interval > 0 &&
! 						GetCurrentTimestamp() >= finish_time)
! 				{
! 					/*
! 					 * Since typically expiration of replication timeout means
! 					 * communication problem, we don't send the error message
! 					 * to the standby.
! 					 */
! 					ereport(COMMERROR,
! 							(errmsg("terminating walsender process due to replication timeout")));
! 					break;
! 				}
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  		}
  
***************
*** 996,1019 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
-  * msgbuf is a work area in which the output message is constructed.  It's
-  * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
-  * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
-  *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(char *msgbuf, bool *caughtup)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	XLogRecPtr	endptr;
  	Size		nbytes;
  	WalDataMessageHeader msghdr;
  
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
--- 1163,1210 ----
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
+  * If there is pending WAL in output buffer, *pending is set to true,
+  * otherwise *pending is set to false.
+  *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(bool *caughtup, bool *pending)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	static XLogRecPtr	endptr;
  	Size		nbytes;
+ 	uint32		n32;
+ 	int			res;
  	WalDataMessageHeader msghdr;
  
+ 	/* Attempt to flush pending WAL in output buffer */
+ 	if (*pending)
+ 	{
+ 		if (WalSndOutHead != WalSndOutTail)
+ 		{
+ 			res = pq_putbytes_if_writable(WalSndOutBuffer + WalSndOutHead,
+ 										  WalSndOutTail - WalSndOutHead);
+ 			if (res == EOF)
+ 				return false;
+ 			WalSndOutHead += res;
+ 			if (WalSndOutHead != WalSndOutTail)
+ 				return true;
+ 		}
+ 
+ 		res = pq_flush_if_writable();
+ 		if (res == EOF)
+ 			return false;
+ 		if (res == 0)
+ 			return true;
+ 
+ 		goto updt;
+ 	}
+ 
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
***************
*** 1082,1094 **** XLogSend(char *msgbuf, bool *caughtup)
  	/*
  	 * OK to read and send the slice.
  	 */
! 	msgbuf[0] = 'w';
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
--- 1273,1291 ----
  	/*
  	 * OK to read and send the slice.
  	 */
! 	WalSndOutBuffer[0] = 'd';
! 	WalSndOutBuffer[5] = 'w';
! 	WalSndOutHead = 0;
! 	WalSndOutTail = 6 + sizeof(WalDataMessageHeader) + nbytes;
! 
! 	n32 = htonl((uint32) WalSndOutTail - 1);
! 	memcpy(WalSndOutBuffer + 1, &n32, 4);
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(WalSndOutBuffer + 6 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
***************
*** 1098,1110 **** XLogSend(char *msgbuf, bool *caughtup)
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
  
! 	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
  
  	/* Flush pending output to the client */
! 	if (pq_flush())
  		return false;
  
  	sentPtr = endptr;
  
--- 1295,1328 ----
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(WalSndOutBuffer + 6, &msghdr, sizeof(WalDataMessageHeader));
! 
! 	res = pq_putbytes_if_writable(WalSndOutBuffer, WalSndOutTail);
! 	if (res == EOF)
! 		return false;
  
! 	WalSndOutHead = res;
! 	if (WalSndOutHead != WalSndOutTail)
! 	{
! 		*caughtup = false;
! 		*pending = true;
! 		return true;
! 	}
  
  	/* Flush pending output to the client */
! 	res = pq_flush_if_writable();
! 	if (res == EOF)
  		return false;
+ 	if (res == 0)
+ 	{
+ 		*caughtup = false;
+ 		*pending = true;
+ 		return true;
+ 	}
+ 
+ updt:
+ 	WalSndOutHead = WalSndOutTail = 0;
+ 	*pending = false;
  
  	sentPtr = endptr;
  
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1856,1861 **** static struct config_int ConfigureNamesInt[] =
--- 1856,1871 ----
  	},
  
  	{
+ 		{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
+ 			gettext_noop("Sets the maximum time to wait for WAL replication."),
+ 			NULL,
+ 			GUC_UNIT_MS
+ 		},
+ 		&replication_timeout,
+ 		0, 0, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
  			gettext_noop("Sets the delay in microseconds between transaction commit and "
  						 "flushing WAL to disk."),
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 200,205 ****
--- 200,206 ----
  #wal_sender_delay = 1s		# walsender cycle time, 1-10000 milliseconds
  #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
  #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+ #replication_timeout = 0 # in milliseconds, 0 is disabled
  
  # - Standby Servers -
  
*** a/src/include/libpq/libpq.h
--- b/src/include/libpq/libpq.h
***************
*** 59,65 **** extern int	pq_getbyte(void);
--- 59,67 ----
  extern int	pq_peekbyte(void);
  extern int	pq_getbyte_if_available(unsigned char *c);
  extern int	pq_putbytes(const char *s, size_t len);
+ extern int	pq_putbytes_if_writable(const char *s, size_t len);
  extern int	pq_flush(void);
+ extern int	pq_flush_if_writable(void);
  extern int	pq_putmessage(char msgtype, const char *s, size_t len);
  extern void pq_startcopyout(void);
  extern void pq_endcopyout(bool errorAbort);
*** a/src/include/replication/walprotocol.h
--- b/src/include/replication/walprotocol.h
***************
*** 81,86 **** typedef struct
--- 81,104 ----
  } StandbyHSFeedbackMessage;
  
  /*
+  * GUC parameter change report from standby (message type 'g').  This is wrapped
+  * within a CopyData message at the FE/BE protocol level.
+  *
+  * Note that the data length is not specified here.
+  */
+ typedef struct
+ {
+ 	/*
+ 	 * Only change of important parameters for streaming replication needs
+ 	 * to be reported.
+ 	 */
+ 	int			wal_receiver_status_interval;
+ 
+ 	/* Sender's system clock at the time of transmission */
+ 	TimestampTz sendTime;
+ } StandbyGUCChangeMessage;
+ 
+ /*
   * Maximum data payload in a WAL data message.	Must be >= XLOG_BLCKSZ.
   *
   * We don't have a good idea of what a good value would be; there's some
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 92,97 **** extern volatile sig_atomic_t walsender_ready_to_stop;
--- 92,98 ----
  /* user-settable parameters */
  extern int	WalSndDelay;
  extern int	max_wal_senders;
+ extern int	replication_timeout;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
*** a/src/include/storage/latch.h
--- b/src/include/storage/latch.h
***************
*** 40,46 **** extern void OwnLatch(volatile Latch *latch);
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
--- 40,46 ----
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  bool forRead, bool forWrite, long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
#26Fujii Masao
masao.fujii@gmail.com
In reply to: Fujii Masao (#25)
Re: Replication server timeout patch

On Mon, Mar 7, 2011 at 8:47 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sun, Mar 6, 2011 at 11:10 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sun, Mar 6, 2011 at 5:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Why does internal_flush_if_writable compute bufptr differently from
internal_flush?  And shouldn't it be static?

It seems to me that this ought to be refactored so that you don't
duplicate so much code.  Maybe static int internal_flush(bool
nonblocking).

I don't think that the while (bufptr < bufend) loop needs to contain
the code to set and clear the nonblocking state.  You could do the
whole loop with nonblocking mode turned on and then reenable it just
once at the end.  Besides possibly being clearer, that would be more
efficient and leave less room for unexpected failures.

All these comments seem to make sense. Will fix. Thanks!

Done. I attached the updated patch.

I rebased the patch against current git master.

I added this replication timeout patch into next CF.

I explain why this feature is required for the future review;

Without this feature, walsender might unexpectedly remain for a while when
the standby crashes or the network outage happens. TCP keepalive can
improve this situation to a certain extent, but it's not perfect. Remaining
walsender can cause some problems.

For example, when hot_standby_feedback is enabled, such a remaining
walsender would prevent oldest xmin from advancing and interfere with
vacuuming on the master. For example, when you use synchronous
replication and walsender in SYNC mode gets stuck, any synchronous
standby candidate cannot switch to SYNC mode until that walsender exits,
and all the transactions would pause.

This feature causes walsender to exit when there is no reply from the
standby before the replication timeout expires. Then we can avoid the
above problems.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#27Robert Haas
robertmhaas@gmail.com
In reply to: Fujii Masao (#26)
Re: Replication server timeout patch

On Fri, Mar 11, 2011 at 8:14 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Mon, Mar 7, 2011 at 8:47 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sun, Mar 6, 2011 at 11:10 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Sun, Mar 6, 2011 at 5:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Why does internal_flush_if_writable compute bufptr differently from
internal_flush?  And shouldn't it be static?

It seems to me that this ought to be refactored so that you don't
duplicate so much code.  Maybe static int internal_flush(bool
nonblocking).

I don't think that the while (bufptr < bufend) loop needs to contain
the code to set and clear the nonblocking state.  You could do the
whole loop with nonblocking mode turned on and then reenable it just
once at the end.  Besides possibly being clearer, that would be more
efficient and leave less room for unexpected failures.

All these comments seem to make sense. Will fix. Thanks!

Done. I attached the updated patch.

I rebased the patch against current git master.

I added this replication timeout patch into next CF.

I explain why this feature is required for the future review;

Without this feature, walsender might unexpectedly remain for a while when
the standby crashes or the network outage happens. TCP keepalive can
improve this situation to a certain extent, but it's not perfect. Remaining
walsender can cause some problems.

For example, when hot_standby_feedback is enabled, such a remaining
walsender would prevent oldest xmin from advancing and interfere with
vacuuming on the master. For example, when you use synchronous
replication and walsender in SYNC mode gets stuck, any synchronous
standby candidate cannot switch to SYNC mode until that walsender exits,
and all the transactions would pause.

This feature causes walsender to exit when there is no reply from the
standby before the replication timeout expires. Then we can avoid the
above problems.

I think we should consider making this change for 9.1. This is a real
wart, and it's going to become even more of a problem with sync rep, I
think.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#28Fujii Masao
masao.fujii@gmail.com
In reply to: Robert Haas (#27)
Re: Replication server timeout patch

On Fri, Mar 11, 2011 at 10:18 PM, Robert Haas <robertmhaas@gmail.com> wrote:

I added this replication timeout patch into next CF.

I explain why this feature is required for the future review;

Without this feature, walsender might unexpectedly remain for a while when
the standby crashes or the network outage happens. TCP keepalive can
improve this situation to a certain extent, but it's not perfect. Remaining
walsender can cause some problems.

For example, when hot_standby_feedback is enabled, such a remaining
walsender would prevent oldest xmin from advancing and interfere with
vacuuming on the master. For example, when you use synchronous
replication and walsender in SYNC mode gets stuck, any synchronous
standby candidate cannot switch to SYNC mode until that walsender exits,
and all the transactions would pause.

This feature causes walsender to exit when there is no reply from the
standby before the replication timeout expires. Then we can avoid the
above problems.

I think we should consider making this change for 9.1.  This is a real
wart, and it's going to become even more of a problem with sync rep, I
think.

Yeah, that's a welcome! Please feel free to review the patch.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#29Bruce Momjian
bruce@momjian.us
In reply to: Fujii Masao (#28)
Re: Replication server timeout patch

Fujii Masao wrote:

On Fri, Mar 11, 2011 at 10:18 PM, Robert Haas <robertmhaas@gmail.com> wrote:

I added this replication timeout patch into next CF.

I explain why this feature is required for the future review;

Without this feature, walsender might unexpectedly remain for a while when
the standby crashes or the network outage happens. TCP keepalive can
improve this situation to?a certain extent, but it's not perfect. Remaining
walsender can cause some problems.

For example, when hot_standby_feedback is enabled, such a remaining
walsender would prevent oldest xmin from advancing and interfere with
vacuuming on the master. For example, when you use synchronous
replication and walsender in SYNC mode gets stuck, any synchronous
standby candidate cannot switch to SYNC mode until that walsender exits,
and all the transactions would pause.

This feature causes walsender to exit when there is no reply from the
standby before the replication timeout expires. Then we can avoid the
above problems.

I think we should consider making this change for 9.1. ?This is a real
wart, and it's going to become even more of a problem with sync rep, I
think.

Yeah, that's a welcome! Please feel free to review the patch.

It is already in the next commitfest, so if someone wants to add it as
an open 9.1 item, go ahead. I am unclear of this so I am not adding it.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ It's impossible for everything to be true. +

#30Robert Haas
robertmhaas@gmail.com
In reply to: Fujii Masao (#28)
Re: Replication server timeout patch

On Fri, Mar 11, 2011 at 8:29 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

I think we should consider making this change for 9.1.  This is a real
wart, and it's going to become even more of a problem with sync rep, I
think.

Yeah, that's a welcome! Please feel free to review the patch.

I discussed this with Heikki on IM.

I think we should rip all the GUC change stuff out of this patch and
just decree that if you set a timeout, you get a timeout. If you set
this inconsistently with wal_receiver_status_interval, then you'll get
lots of disconnects. But that's your problem. This may seem a little
unfriendly, but the logic in here is quite complex and still isn't
going to really provide that much protection against bad
configurations. The only realistic alternative I see is to define
replication_timeout as a multiple of the client's
wal_receiver_status_interval, but that seems quite annoyingly
unfriendly. A single replication_timeout that applies to all slaves
doesn't cover every configuration someone might want, but it's simple
and easy to understand and should cover 95% of cases. If we find that
it's really necessary to be able to customize it further, then we
might go the route of adding the much-discussed standby registration
stuff, where there's a separate config file or system table where you
can stipulate that when a walsender with application_name=foo
connects, you want it to get wal_receiver_status_interval=$FOO. But I
think that complexity can certainly wait until 9.2 or later.

I also think that the default for replication_timeout should not be 0.
Something like 60s seems about right. That way, if you just use the
default settings, you'll get pretty sane behavior - a connectivity
hiccup that lasts more than a minute will bounce the client. We've
already gotten reports of people who thought they were replicating
when they really weren't, and had to fiddle with settings and struggle
to try to make it robust. This should make things a lot nicer for
people out of the box, but it won't if it's disabled out of the box.

On another note, there doesn't appear to be any need to change the
return value of WaitLatchOrSocket().

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#31Fujii Masao
masao.fujii@gmail.com
In reply to: Robert Haas (#30)
Re: Replication server timeout patch

On Sat, Mar 12, 2011 at 4:34 AM, Robert Haas <robertmhaas@gmail.com> wrote:

On Fri, Mar 11, 2011 at 8:29 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

I think we should consider making this change for 9.1.  This is a real
wart, and it's going to become even more of a problem with sync rep, I
think.

Yeah, that's a welcome! Please feel free to review the patch.

I discussed this with Heikki on IM.

I think we should rip all the GUC change stuff out of this patch and
just decree that if you set a timeout, you get a timeout.  If you set
this inconsistently with wal_receiver_status_interval, then you'll get
lots of disconnects.  But that's your problem.  This may seem a little
unfriendly, but the logic in here is quite complex and still isn't
going to really provide that much protection against bad
configurations.  The only realistic alternative I see is to define
replication_timeout as a multiple of the client's
wal_receiver_status_interval, but that seems quite annoyingly
unfriendly.  A single replication_timeout that applies to all slaves
doesn't cover every configuration someone might want, but it's simple
and easy to understand and should cover 95% of cases.  If we find that
it's really necessary to be able to customize it further, then we
might go the route of adding the much-discussed standby registration
stuff, where there's a separate config file or system table where you
can stipulate that when a walsender with application_name=foo
connects, you want it to get wal_receiver_status_interval=$FOO.  But I
think that complexity can certainly wait until 9.2 or later.

I also think that the default for replication_timeout should not be 0.
 Something like 60s seems about right.  That way, if you just use the
default settings, you'll get pretty sane behavior - a connectivity
hiccup that lasts more than a minute will bounce the client.  We've
already gotten reports of people who thought they were replicating
when they really weren't, and had to fiddle with settings and struggle
to try to make it robust.  This should make things a lot nicer for
people out of the box, but it won't if it's disabled out of the box.

On another note, there doesn't appear to be any need to change the
return value of WaitLatchOrSocket().

Agreed. I'll change the patch.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#32Fujii Masao
masao.fujii@gmail.com
In reply to: Fujii Masao (#31)
1 attachment(s)
Re: Replication server timeout patch

On Wed, Mar 16, 2011 at 4:49 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Agreed. I'll change the patch.

Done. I attached the updated patch.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Attachments:

replication_timeout_v6.patchapplication/octet-stream; name=replication_timeout_v6.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 2017,2022 **** SET ENABLE_SEQSCAN TO OFF;
--- 2017,2044 ----
         </para>
        </listitem>
       </varlistentry>
+ 
+      <varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
+       <term><varname>replication_timeout</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>replication_timeout</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Specifies the maximum time, in milliseconds, to wait for the reply
+         from the standby before terminating replication.  This is useful for
+         the primary server to detect the standby crash or network outage.
+         A value of zero turns this off.  This parameter can only be set in
+         the <filename>postgresql.conf</> file or on the server command line.
+         The default value is 60 seconds.
+        </para>
+        <para>
+         To make the timeout work properly, <xref linkend="guc-wal-receiver-status-interval">
+         must be enabled on the standby, and its value must be less than the
+         value of <varname>replication_timeout</>.
+        </para>
+       </listitem>
+      </varlistentry>
       </variablelist>
      </sect2>
  
***************
*** 2215,2220 **** SET ENABLE_SEQSCAN TO OFF;
--- 2237,2247 ----
         the <filename>postgresql.conf</> file or on the server command line.
         The default value is 10 seconds.
        </para>
+       <para>
+        When <xref linkend="guc-replication-timeout"> is enabled on the primary,
+        <varname>wal_receiver_status_interval</> must be enabled, and its value
+        must be less than the value of <varname>replication_timeout</>.
+       </para>
        </listitem>
       </varlistentry>
  
*** a/src/backend/libpq/pqcomm.c
--- b/src/backend/libpq/pqcomm.c
***************
*** 56,61 ****
--- 56,64 ----
   *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
   *		pq_flush		- flush pending output
   *		pq_getbyte_if_available - get a byte if available without blocking
+  *		pq_putbytes_if_writable	- send bytes to connection if writable without blocking
+  *		pq_flush_if_writable	- flush pending output if writable without blocking
+  *		pq_set_nonblocking	- set socket blocking/non-blocking
   *
   * message-level I/O (and old-style-COPY-OUT cruft):
   *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
***************
*** 112,117 **** static char sock_path[MAXPGPATH];
--- 115,121 ----
  
  static char PqSendBuffer[PQ_BUFFER_SIZE];
  static int	PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+ static int	PqSendStart;		/* Next index to send a byte in PqSendBuffer */
  
  static char PqRecvBuffer[PQ_BUFFER_SIZE];
  static int	PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
***************
*** 127,133 **** static bool DoingCopyOut;
  /* Internal functions */
  static void pq_close(int code, Datum arg);
  static int	internal_putbytes(const char *s, size_t len);
! static int	internal_flush(void);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
--- 131,138 ----
  /* Internal functions */
  static void pq_close(int code, Datum arg);
  static int	internal_putbytes(const char *s, size_t len);
! static int	internal_flush(bool nonblocking);
! static void pq_set_nonblocking(bool nonblocking, int emode);
  
  #ifdef HAVE_UNIX_SOCKETS
  static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
***************
*** 142,148 **** static int	Setup_AF_UNIX(void);
  void
  pq_init(void)
  {
! 	PqSendPointer = PqRecvPointer = PqRecvLength = 0;
  	PqCommBusy = false;
  	DoingCopyOut = false;
  	on_proc_exit(pq_close, 0);
--- 147,153 ----
  void
  pq_init(void)
  {
! 	PqSendPointer = PqSendStart = PqRecvPointer = PqRecvLength = 0;
  	PqCommBusy = false;
  	DoingCopyOut = false;
  	on_proc_exit(pq_close, 0);
***************
*** 846,859 **** pq_getbyte_if_available(unsigned char *c)
  	}
  
  	/* Temporarily put the socket into non-blocking mode */
! #ifdef WIN32
! 	pgwin32_noblock = 1;
! #else
! 	if (!pg_set_noblock(MyProcPort->sock))
! 		ereport(ERROR,
! 				(errmsg("could not set socket to non-blocking mode: %m")));
! #endif
! 	MyProcPort->noblock = true;
  	PG_TRY();
  	{
  		r = secure_read(MyProcPort, c, 1);
--- 851,857 ----
  	}
  
  	/* Temporarily put the socket into non-blocking mode */
! 	pq_set_nonblocking(true, ERROR);
  	PG_TRY();
  	{
  		r = secure_read(MyProcPort, c, 1);
***************
*** 892,916 **** pq_getbyte_if_available(unsigned char *c)
  		 * The rest of the backend code assumes the socket is in blocking
  		 * mode, so treat failure as FATAL.
  		 */
! #ifdef WIN32
! 		pgwin32_noblock = 0;
! #else
! 		if (!pg_set_block(MyProcPort->sock))
! 			ereport(FATAL,
! 					(errmsg("could not set socket to blocking mode: %m")));
! #endif
! 		MyProcPort->noblock = false;
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
! #ifdef WIN32
! 	pgwin32_noblock = 0;
! #else
! 	if (!pg_set_block(MyProcPort->sock))
! 		ereport(FATAL,
! 				(errmsg("could not set socket to blocking mode: %m")));
! #endif
! 	MyProcPort->noblock = false;
  
  	return r;
  }
--- 890,900 ----
  		 * The rest of the backend code assumes the socket is in blocking
  		 * mode, so treat failure as FATAL.
  		 */
! 		pq_set_nonblocking(false, FATAL);
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
! 	pq_set_nonblocking(false, FATAL);
  
  	return r;
  }
***************
*** 1139,1145 **** internal_putbytes(const char *s, size_t len)
  	{
  		/* If buffer is full, then flush it out */
  		if (PqSendPointer >= PQ_BUFFER_SIZE)
! 			if (internal_flush())
  				return EOF;
  		amount = PQ_BUFFER_SIZE - PqSendPointer;
  		if (amount > len)
--- 1123,1129 ----
  	{
  		/* If buffer is full, then flush it out */
  		if (PqSendPointer >= PQ_BUFFER_SIZE)
! 			if (internal_flush(false) == EOF)
  				return EOF;
  		amount = PQ_BUFFER_SIZE - PqSendPointer;
  		if (amount > len)
***************
*** 1153,1158 **** internal_putbytes(const char *s, size_t len)
--- 1137,1192 ----
  }
  
  /* --------------------------------
+  *		pq_putbytes_if_writable - send bytes to connection (not flushed
+  *			until pq_flush), if writable
+  *
+  * Returns the number of bytes written without blocking, or EOF if trouble.
+  * --------------------------------
+  */
+ int
+ pq_putbytes_if_writable(const char *s, size_t len)
+ {
+ 	size_t		amount;
+ 	size_t		nwritten = 0;
+ 
+ 	/* Should not be called by old-style COPY OUT */
+ 	Assert(!DoingCopyOut);
+ 	/* No-op if reentrant call */
+ 	if (PqCommBusy)
+ 		return 0;
+ 	PqCommBusy = true;
+ 
+ 	while (len > 0)
+ 	{
+ 		/* If buffer is full, then flush it out */
+ 		if (PqSendPointer >= PQ_BUFFER_SIZE)
+ 		{
+ 			int		r;
+ 
+ 			r = internal_flush(true);
+ 			if (r == 0)
+ 				break;
+ 			if (r == EOF)
+ 			{
+ 				PqCommBusy = false;
+ 				return r;
+ 			}
+ 		}
+ 		amount = PQ_BUFFER_SIZE - PqSendPointer;
+ 		if (amount > len)
+ 			amount = len;
+ 		memcpy(PqSendBuffer + PqSendPointer, s, amount);
+ 		PqSendPointer += amount;
+ 		s += amount;
+ 		len -= amount;
+ 		nwritten += amount;
+ 	}
+ 
+ 	PqCommBusy = false;
+ 	return (int) nwritten;
+ }
+ 
+ /* --------------------------------
   *		pq_flush		- flush pending output
   *
   *		returns 0 if OK, EOF if trouble
***************
*** 1167,1227 **** pq_flush(void)
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_flush();
  	PqCommBusy = false;
! 	return res;
  }
  
  static int
! internal_flush(void)
  {
  	static int	last_reported_send_errno = 0;
  
! 	char	   *bufptr = PqSendBuffer;
  	char	   *bufend = PqSendBuffer + PqSendPointer;
  
! 	while (bufptr < bufend)
  	{
! 		int			r;
! 
! 		r = secure_write(MyProcPort, bufptr, bufend - bufptr);
! 
! 		if (r <= 0)
  		{
! 			if (errno == EINTR)
! 				continue;		/* Ok if we were interrupted */
  
! 			/*
! 			 * Careful: an ereport() that tries to write to the client would
! 			 * cause recursion to here, leading to stack overflow and core
! 			 * dump!  This message must go *only* to the postmaster log.
! 			 *
! 			 * If a client disconnects while we're in the midst of output, we
! 			 * might write quite a bit of data before we get to a safe query
! 			 * abort point.  So, suppress duplicate log messages.
! 			 */
! 			if (errno != last_reported_send_errno)
  			{
! 				last_reported_send_errno = errno;
! 				ereport(COMMERROR,
! 						(errcode_for_socket_access(),
! 						 errmsg("could not send data to client: %m")));
  			}
  
! 			/*
! 			 * We drop the buffered data anyway so that processing can
! 			 * continue, even though we'll probably quit soon.
! 			 */
! 			PqSendPointer = 0;
! 			return EOF;
  		}
- 
- 		last_reported_send_errno = 0;	/* reset after any successful send */
- 		bufptr += r;
  	}
  
! 	PqSendPointer = 0;
! 	return 0;
  }
  
  
--- 1201,1362 ----
  	if (PqCommBusy)
  		return 0;
  	PqCommBusy = true;
! 	res = internal_flush(false);
  	PqCommBusy = false;
! 	return (res == 1) ? 0 : EOF;
  }
  
+ /* --------------------------------
+  *		internal_flush - flush pending output
+  *
+  * Returns 1 if OK, 0 if pending output cannot be written without blocking
+  * (only possible nonblocking is true), or EOF if trouble.
+  * --------------------------------
+  */
  static int
! internal_flush(bool nonblocking)
  {
  	static int	last_reported_send_errno = 0;
+ 	int		r;
  
! 	char	   *bufptr = PqSendBuffer + PqSendStart;
  	char	   *bufend = PqSendBuffer + PqSendPointer;
  
! 	/* Temporarily put the socket into non-blocking mode */
! 	if (nonblocking)
! 		pq_set_nonblocking(true, ERROR);
! 	PG_TRY();
  	{
! 		while (bufptr < bufend)
  		{
! 			r = secure_write(MyProcPort, bufptr, bufend - bufptr);
  
! 			if (r <= 0)
  			{
! 				/* Ok if we were interrupted in blocking mode */
! 				if (!nonblocking && errno == EINTR)
! 					continue;
! 
! 				if (nonblocking)
! 				{
! 					if (r == 0)
! 						r = EOF;	/* EOF detected */
! 					else if (errno == EAGAIN ||
! 							 errno == EWOULDBLOCK ||
! 							 errno == EINTR)
! 					{
! 						/*
! 						 * Ok if no data writable without blocking or
! 						 * interrupted (though EINTR really shouldn't
! 						 * happen with a non-blocking socket). Report
! 						 * other errors.
! 						 */
! 						r = 0;
! 					}
! 					break;
! 				}
! 
! 				/*
! 				 * Careful: an ereport() that tries to write to the
! 				 * client would cause recursion to here, leading to
! 				 * stack overflow and core dump!  This message must
! 				 * go *only* to the postmaster log.
! 				 *
! 				 * If a client disconnects while we're in the midst
! 				 * of output, we might write quite a bit of data before
! 				 * we get to a safe query abort point.  So, suppress
! 				 * duplicate log messages.
! 				 */
! 				if (errno != last_reported_send_errno)
! 				{
! 					last_reported_send_errno = errno;
! 					ereport(COMMERROR,
! 							(errcode_for_socket_access(),
! 							 errmsg("could not send data to client: %m")));
! 				}
! 
! 				/*
! 				 * We drop the buffered data anyway so that processing can
! 				 * continue, even though we'll probably quit soon.
! 				 */
! 				PqSendStart = PqSendPointer = 0;
! 				r = EOF;
! 				break;
  			}
  
! 			last_reported_send_errno = 0;	/* reset after any successful send */
! 			bufptr += r;
! 			PqSendStart += r;
  		}
  	}
+ 	PG_CATCH();
+ 	{
+ 		/*
+ 		 * The rest of the backend code assumes the socket is in blocking
+ 		 * mode, so treat failure as FATAL.
+ 		 */
+ 		if (nonblocking)
+ 			pq_set_nonblocking(false, FATAL);
+ 	}
+ 	PG_END_TRY();
+ 	if (nonblocking)
+ 		pq_set_nonblocking(false, FATAL);
  
! 	if (r == 0 || r == EOF)
! 		return r;
! 
! 	PqSendStart = PqSendPointer = 0;
! 	return 1;
! }
! 
! /* --------------------------------
!  *		pq_flush_if_writable - flush pending output if writable
!  *
!  * Returns 1 if OK, 0 if pending output cannot be written without blocking,
!  * or EOF if trouble.
!  * --------------------------------
!  */
! int
! pq_flush_if_writable(void)
! {
! 	int			res;
! 
! 	/* No-op if reentrant call */
! 	if (PqCommBusy)
! 		return 0;
! 	PqCommBusy = true;
! 	res = internal_flush(true);
! 	PqCommBusy = false;
! 	return res;
! }
! 
! /* --------------------------------
!  *		pq_set_nonblocking - set socket blocking/non-blocking
!  *
!  * Sets the socket non-blocking if nonblocking is TRUE, or sets it
!  * blocking otherwise.
!  * --------------------------------
!  */
! static
! void pq_set_nonblocking(bool nonblocking, int emode)
! {
! #ifdef WIN32
! 	pgwin32_noblock = nonblocking ? 1 : 0;
! #else
! 	if (nonblocking)
! 	{
! 		if (!pg_set_noblock(MyProcPort->sock))
! 			ereport(emode,
! 					(errmsg("could not set socket to non-blocking mode: %m")));
! 	}
! 	else
! 	{
! 		if (!pg_set_block(MyProcPort->sock))
! 			ereport(emode,
! 					(errmsg("could not set socket to blocking mode: %m")));
! 	}
! #endif
! 	MyProcPort->noblock = nonblocking;
  }
  
  
*** a/src/backend/port/unix_latch.c
--- b/src/backend/port/unix_latch.c
***************
*** 193,211 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
!  * was set, or 2 if the scoket became readable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
  	int			rc;
  	int			result = 0;
  
--- 193,213 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  /*
   * Like WaitLatch, but will also return when there's data available in
!  * 'sock' for reading or writing. Returns 0 if timeout was reached,
!  * 1 if the latch was set, 2 if the scoket became readable or writable.
   */
  int
! WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	struct timeval tv, *tvp = NULL;
  	fd_set		input_mask;
+ 	fd_set		output_mask;
  	int			rc;
  	int			result = 0;
  
***************
*** 241,254 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
--- 243,264 ----
  		FD_ZERO(&input_mask);
  		FD_SET(selfpipe_readfd, &input_mask);
  		hifd = selfpipe_readfd;
! 		if (sock != PGINVALID_SOCKET && forRead)
  		{
  			FD_SET(sock, &input_mask);
  			if (sock > hifd)
  				hifd = sock;
  		}
  
! 		FD_ZERO(&output_mask);
! 		if (sock != PGINVALID_SOCKET && forWrite)
! 		{
! 			FD_SET(sock, &output_mask);
! 			if (sock > hifd)
! 				hifd = sock;
! 		}
! 
! 		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
  		if (rc < 0)
  		{
  			if (errno == EINTR)
***************
*** 263,269 **** WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
  		{
  			result = 2;
  			break;		/* data available in socket */
--- 273,281 ----
  			result = 0;
  			break;
  		}
! 		if (sock != PGINVALID_SOCKET &&
! 			((forRead && FD_ISSET(sock, &input_mask)) ||
! 			 (forWrite && FD_ISSET(sock, &output_mask))))
  		{
  			result = 2;
  			break;		/* data available in socket */
*** a/src/backend/port/win32/socket.c
--- b/src/backend/port/win32/socket.c
***************
*** 14,20 ****
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() should operate in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
--- 14,21 ----
  #include "postgres.h"
  
  /*
!  * Indicate if pgwin32_recv() and pgwin32_send() should operate
!  * in non-blocking mode.
   *
   * Since the socket emulation layer always sets the actual socket to
   * non-blocking mode in order to be able to deliver signals, we must
***************
*** 399,404 **** pgwin32_send(SOCKET s, char *buf, int len, int flags)
--- 400,415 ----
  			return -1;
  		}
  
+ 		if (pgwin32_noblock)
+ 		{
+ 			/*
+ 			 * No data sent, and we are in "emulated non-blocking mode", so
+ 			 * return indicating that we'd block if we were to continue.
+ 			 */
+ 			errno = EWOULDBLOCK;
+ 			return -1;
+ 		}
+ 
  		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
  
  		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
*** a/src/backend/port/win32_latch.c
--- b/src/backend/port/win32_latch.c
***************
*** 85,95 **** DisownLatch(volatile Latch *latch)
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
--- 85,96 ----
  bool
  WaitLatch(volatile Latch *latch, long timeout)
  {
! 	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
  }
  
  int
! WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
! 				  bool forWrite, long timeout)
  {
  	DWORD		rc;
  	HANDLE		events[3];
***************
*** 103,112 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET)
  	{
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, FD_READ);
  		events[numevents++] = sockevent;
  	}
  
--- 104,120 ----
  	events[0] = latchevent;
  	events[1] = pgwin32_signal_event;
  	numevents = 2;
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
+ 		int		flags = 0;
+ 
+ 		if (forRead)
+ 			flags |= FD_READ;
+ 		if (forWrite)
+ 			flags |= FD_WRITE;
+ 
  		sockevent = WSACreateEvent();
! 		WSAEventSelect(sock, sockevent, flags);
  		events[numevents++] = sockevent;
  	}
  
***************
*** 139,146 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
  			Assert(sock != PGINVALID_SOCKET);
! 			result = 2;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
--- 147,164 ----
  			pgwin32_dispatch_queued_signals();
  		else if (rc == WAIT_OBJECT_0 + 2)
  		{
+ 			WSANETWORKEVENTS resEvents;
+ 
  			Assert(sock != PGINVALID_SOCKET);
! 
! 			ZeroMemory(&resEvents, sizeof(resEvents));
! 			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
! 				ereport(FATAL,
! 						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
! 
! 			if ((forRead && resEvents.lNetworkEvents & FD_READ) ||
! 				(forWrite && resEvents.lNetworkEvents & FD_WRITE))
! 				result = 2;
  			break;
  		}
  		else if (rc != WAIT_OBJECT_0)
***************
*** 148,154 **** WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
  	}
  
  	/* Clean up the handle we created for the socket */
! 		if (sock != PGINVALID_SOCKET)
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
--- 166,172 ----
  	}
  
  	/* Clean up the handle we created for the socket */
! 	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
  	{
  		WSAEventSelect(sock, sockevent, 0);
  		WSACloseEvent(sockevent);
*** a/src/backend/replication/walsender.c
--- b/src/backend/replication/walsender.c
***************
*** 74,79 **** bool		am_walsender = false;		/* Am I a walsender process ? */
--- 74,91 ----
  /* User-settable parameters for walsender */
  int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
  int			WalSndDelay = 1000;	/* max sleep time between some actions */
+ int			replication_timeout = 60 * 1000;	/* maximum time to send one WAL data message */
+ 
+ /*
+  * Buffer for WAL sending
+  *
+  * WalSndOutBuffer is a work area in which the output message is constructed.
+  * It's used in just so we can avoid re-palloc'ing the buffer on each cycle.
+  * It must be of size 6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
+  */
+ static char	   *WalSndOutBuffer;
+ static int		WalSndOutHead;		/* head of pending output */
+ static int		WalSndOutTail;		/* tail of pending output */
  
  /*
   * These variables are used similarly to openLogFile/Id/Seg/Off,
***************
*** 95,100 **** static XLogRecPtr sentPtr = {0, 0};
--- 107,117 ----
   */
  static StringInfoData reply_message;
  
+ /*
+  * Timestamp of the last receipt of the reply from the standby.
+  */
+ static TimestampTz last_reply_timestamp;
+ 
  /* Flags set by signal handlers for later service in main loop */
  static volatile sig_atomic_t got_SIGHUP = false;
  volatile sig_atomic_t walsender_shutdown_requested = false;
***************
*** 113,119 **** static int	WalSndLoop(void);
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(char *msgbuf, bool *caughtup);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
--- 130,136 ----
  static void InitWalSnd(void);
  static void WalSndHandshake(void);
  static void WalSndKill(int code, Datum arg);
! static bool XLogSend(bool *caughtup, bool *pending);
  static void IdentifySystem(void);
  static void StartReplication(StartReplicationCmd * cmd);
  static void ProcessStandbyMessage(void);
***************
*** 469,474 **** ProcessRepliesIfAny(void)
--- 486,492 ----
  {
  	unsigned char firstchar;
  	int			r;
+ 	int		received = false;
  
  	for (;;)
  	{
***************
*** 481,489 **** ProcessRepliesIfAny(void)
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)
  		{
! 			/* no data available without blocking */
  			return;
  		}
  
--- 499,512 ----
  					 errmsg("unexpected EOF on standby connection")));
  			proc_exit(0);
  		}
! 		if (r == 0)	/* no data available without blocking */
  		{
! 			/*
! 			 * Save the last reply timestamp if we've received at least
! 			 * one reply.
! 			 */
! 			if (received)
! 				last_reply_timestamp = GetCurrentTimestamp();
  			return;
  		}
  
***************
*** 495,500 **** ProcessRepliesIfAny(void)
--- 518,524 ----
  				 */
  			case 'd':
  				ProcessStandbyMessage();
+ 				received = true;
  				break;
  
  				/*
***************
*** 673,687 **** ProcessStandbyHSFeedbackMessage(void)
  static int
  WalSndLoop(void)
  {
- 	char	   *output_message;
  	bool		caughtup = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
--- 697,712 ----
  static int
  WalSndLoop(void)
  {
  	bool		caughtup = false;
+ 	bool		pending = false;
  
  	/*
  	 * Allocate buffer that will be used for each output message.  We do this
  	 * just once to reduce palloc overhead.  The buffer must be made large
  	 * enough for maximum-sized messages.
  	 */
! 	WalSndOutBuffer = palloc(6 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);
! 	WalSndOutHead = WalSndOutTail = 0;
  
  	/*
  	 * Allocate buffer that will be used for processing reply messages.  As
***************
*** 689,694 **** WalSndLoop(void)
--- 714,722 ----
  	 */
  	initStringInfo(&reply_message);
  
+ 	/* Initialize the last reply timestamp */
+ 	last_reply_timestamp = GetCurrentTimestamp();
+ 
  	/* Loop forever, unless we get an error */
  	for (;;)
  	{
***************
*** 713,722 **** WalSndLoop(void)
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup)
  				walsender_shutdown_requested = true;
  		}
  
--- 741,750 ----
  		 */
  		if (walsender_ready_to_stop)
  		{
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  			ProcessRepliesIfAny();
! 			if (caughtup && !pending)
  				walsender_shutdown_requested = true;
  		}
  
***************
*** 731,740 **** WalSndLoop(void)
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round, nap for the
! 		 * configured time before retrying.
  		 */
! 		if (caughtup)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
--- 759,769 ----
  		}
  
  		/*
! 		 * If we had sent all accumulated WAL in last round or could not
! 		 * flush pending WAL in output buffer because the socket was not
! 		 * writable, nap for the configured time before retrying.
  		 */
! 		if (caughtup || pending)
  		{
  			/*
  			 * Even if we wrote all the WAL that was available when we started
***************
*** 745,769 **** WalSndLoop(void)
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(output_message, &caughtup))
  				break;
! 			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
  			{
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  WalSndDelay * 1000L);
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(output_message, &caughtup))
  				break;
  		}
  
--- 774,834 ----
  			 */
  			ResetLatch(&MyWalSnd->latch);
  
! 			if (!XLogSend(&caughtup, &pending))
  				break;
! 			if ((caughtup || pending) && !got_SIGHUP && !walsender_ready_to_stop &&
! 					!walsender_shutdown_requested)
  			{
+ 				TimestampTz	finish_time;
+ 				long		sleeptime;
+ 
  				/*
  				 * XXX: We don't really need the periodic wakeups anymore,
  				 * WaitLatchOrSocket should reliably wake up as soon as
  				 * something interesting happens.
  				 */
  
+ 				/* Reschedule replication timeout */
+ 				if (replication_timeout > 0)
+ 				{
+ 					long		secs;
+ 					int		usecs;
+ 
+ 					finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
+ 											replication_timeout);
+ 					TimestampDifference(GetCurrentTimestamp(),
+ 								finish_time, &secs, &usecs);
+ 					sleeptime = secs * 1000 + usecs / 1000;
+ 					if (WalSndDelay < sleeptime)
+ 						sleeptime = WalSndDelay;
+ 				}
+ 				else
+ 					sleeptime = WalSndDelay;
+ 
  				/* Sleep */
  				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
! 								  true, (WalSndOutTail > 0),
! 								  sleeptime * 1000L);
! 
! 				/* Check for replication timeout */
! 				if (replication_timeout > 0 &&
! 					GetCurrentTimestamp() >= finish_time)
! 				{
! 					/*
! 					 * Since typically expiration of replication timeout means
! 					 * communication problem, we don't send the error message
! 					 * to the standby.
! 					 */
! 					ereport(COMMERROR,
! 							(errmsg("terminating walsender process due to replication timeout")));
! 					break;
! 				}
  			}
  		}
  		else
  		{
  			/* Attempt to send the log once every loop */
! 			if (!XLogSend(&caughtup, &pending))
  				break;
  		}
  
***************
*** 996,1019 **** XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
-  * msgbuf is a work area in which the output message is constructed.  It's
-  * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
-  * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
-  *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(char *msgbuf, bool *caughtup)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	XLogRecPtr	endptr;
  	Size		nbytes;
  	WalDataMessageHeader msghdr;
  
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
--- 1061,1108 ----
   * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
   * but not yet sent to the client, and send it.
   *
   * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
   * *caughtup is set to false.
   *
+  * If there is pending WAL in output buffer, *pending is set to true,
+  * otherwise *pending is set to false.
+  *
   * Returns true if OK, false if trouble.
   */
  static bool
! XLogSend(bool *caughtup, bool *pending)
  {
  	XLogRecPtr	SendRqstPtr;
  	XLogRecPtr	startptr;
! 	static XLogRecPtr	endptr;
  	Size		nbytes;
+ 	uint32		n32;
+ 	int			res;
  	WalDataMessageHeader msghdr;
  
+ 	/* Attempt to flush pending WAL in output buffer */
+ 	if (*pending)
+ 	{
+ 		if (WalSndOutHead != WalSndOutTail)
+ 		{
+ 			res = pq_putbytes_if_writable(WalSndOutBuffer + WalSndOutHead,
+ 										  WalSndOutTail - WalSndOutHead);
+ 			if (res == EOF)
+ 				return false;
+ 			WalSndOutHead += res;
+ 			if (WalSndOutHead != WalSndOutTail)
+ 				return true;
+ 		}
+ 
+ 		res = pq_flush_if_writable();
+ 		if (res == EOF)
+ 			return false;
+ 		if (res == 0)
+ 			return true;
+ 
+ 		goto updt;
+ 	}
+ 
  	/*
  	 * Attempt to send all data that's already been written out and fsync'd to
  	 * disk.  We cannot go further than what's been written out given the
***************
*** 1082,1094 **** XLogSend(char *msgbuf, bool *caughtup)
  	/*
  	 * OK to read and send the slice.
  	 */
! 	msgbuf[0] = 'w';
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
--- 1171,1189 ----
  	/*
  	 * OK to read and send the slice.
  	 */
! 	WalSndOutBuffer[0] = 'd';
! 	WalSndOutBuffer[5] = 'w';
! 	WalSndOutHead = 0;
! 	WalSndOutTail = 6 + sizeof(WalDataMessageHeader) + nbytes;
! 
! 	n32 = htonl((uint32) WalSndOutTail - 1);
! 	memcpy(WalSndOutBuffer + 1, &n32, 4);
  
  	/*
  	 * Read the log directly into the output buffer to avoid extra memcpy
  	 * calls.
  	 */
! 	XLogRead(WalSndOutBuffer + 6 + sizeof(WalDataMessageHeader), startptr, nbytes);
  
  	/*
  	 * We fill the message header last so that the send timestamp is taken as
***************
*** 1098,1110 **** XLogSend(char *msgbuf, bool *caughtup)
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
  
! 	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
  
  	/* Flush pending output to the client */
! 	if (pq_flush())
  		return false;
  
  	sentPtr = endptr;
  
--- 1193,1226 ----
  	msghdr.walEnd = SendRqstPtr;
  	msghdr.sendTime = GetCurrentTimestamp();
  
! 	memcpy(WalSndOutBuffer + 6, &msghdr, sizeof(WalDataMessageHeader));
  
! 	res = pq_putbytes_if_writable(WalSndOutBuffer, WalSndOutTail);
! 	if (res == EOF)
! 		return false;
! 
! 	WalSndOutHead = res;
! 	if (WalSndOutHead != WalSndOutTail)
! 	{
! 		*caughtup = false;
! 		*pending = true;
! 		return true;
! 	}
  
  	/* Flush pending output to the client */
! 	res = pq_flush_if_writable();
! 	if (res == EOF)
  		return false;
+ 	if (res == 0)
+ 	{
+ 		*caughtup = false;
+ 		*pending = true;
+ 		return true;
+ 	}
+ 
+ updt:
+ 	WalSndOutHead = WalSndOutTail = 0;
+ 	*pending = false;
  
  	sentPtr = endptr;
  
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 1856,1861 **** static struct config_int ConfigureNamesInt[] =
--- 1856,1871 ----
  	},
  
  	{
+ 		{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
+ 			gettext_noop("Sets the maximum time to wait for WAL replication."),
+ 			NULL,
+ 			GUC_UNIT_MS
+ 		},
+ 		&replication_timeout,
+ 		60 * 1000, 0, INT_MAX, NULL, NULL
+ 	},
+ 
+ 	{
  		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
  			gettext_noop("Sets the delay in microseconds between transaction commit and "
  						 "flushing WAL to disk."),
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 200,205 ****
--- 200,206 ----
  #wal_sender_delay = 1s		# walsender cycle time, 1-10000 milliseconds
  #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
  #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+ #replication_timeout = 60s # in milliseconds, 0 is disabled
  
  # - Standby Servers -
  
*** a/src/include/libpq/libpq.h
--- b/src/include/libpq/libpq.h
***************
*** 59,65 **** extern int	pq_getbyte(void);
--- 59,67 ----
  extern int	pq_peekbyte(void);
  extern int	pq_getbyte_if_available(unsigned char *c);
  extern int	pq_putbytes(const char *s, size_t len);
+ extern int	pq_putbytes_if_writable(const char *s, size_t len);
  extern int	pq_flush(void);
+ extern int	pq_flush_if_writable(void);
  extern int	pq_putmessage(char msgtype, const char *s, size_t len);
  extern void pq_startcopyout(void);
  extern void pq_endcopyout(bool errorAbort);
*** a/src/include/replication/walsender.h
--- b/src/include/replication/walsender.h
***************
*** 91,96 **** extern volatile sig_atomic_t walsender_ready_to_stop;
--- 91,97 ----
  /* user-settable parameters */
  extern int	WalSndDelay;
  extern int	max_wal_senders;
+ extern int	replication_timeout;
  
  extern int	WalSenderMain(void);
  extern void WalSndSignals(void);
*** a/src/include/storage/latch.h
--- b/src/include/storage/latch.h
***************
*** 40,46 **** extern void OwnLatch(volatile Latch *latch);
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
--- 40,46 ----
  extern void DisownLatch(volatile Latch *latch);
  extern bool WaitLatch(volatile Latch *latch, long timeout);
  extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
! 				  bool forRead, bool forWrite, long timeout);
  extern void SetLatch(volatile Latch *latch);
  extern void ResetLatch(volatile Latch *latch);
  #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
#33Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#32)
1 attachment(s)
Re: Replication server timeout patch

On 16.03.2011 11:11, Fujii Masao wrote:

On Wed, Mar 16, 2011 at 4:49 PM, Fujii Masao<masao.fujii@gmail.com> wrote:

Agreed. I'll change the patch.

Done. I attached the updated patch.

I don't much like the API for this. Walsender shouldn't need to know
about the details of the FE/BE protocol, pq_putbytes_if_available()
seems too low level to be useful.

I think a better API would be to have a non-blocking version of
pq_putmessage(). We can make the output buffer in pqcomm.c resizeable,
so that when the message doesn't fit in the output buffer in
pq_putmessage(), the buffer is enlarged instead of trying to flush it.

Attached is a patch using that approach. This is a much smaller patch,
and easier to understand. I'm not totally happy with the walsender main
loop, it seems to work as it is, but the logic has become quite
complicated. Ideas welcome on how to simplify that.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

Attachments:

replication_timeout_v7.patchtext/x-diff; name=replication_timeout_v7.patchDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e0ebee6..3192ef7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2019,6 +2019,28 @@ SET ENABLE_SEQSCAN TO OFF;
        </para>
       </listitem>
      </varlistentry>
+
+     <varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
+      <term><varname>replication_timeout</varname> (<type>integer</type>)</term>
+      <indexterm>
+       <primary><varname>replication_timeout</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Specifies the maximum time, in milliseconds, to wait for the reply
+        from the standby before terminating replication.  This is useful for
+        the primary server to detect the standby crash or network outage.
+        A value of zero turns this off.  This parameter can only be set in
+        the <filename>postgresql.conf</> file or on the server command line.
+        The default value is 60 seconds.
+       </para>
+       <para>
+        To make the timeout work properly, <xref linkend="guc-wal-receiver-status-interval">
+        must be enabled on the standby, and its value must be less than the
+        value of <varname>replication_timeout</>.
+       </para>
+      </listitem>
+     </varlistentry>
      </variablelist>
     </sect2>
 
@@ -2216,6 +2238,11 @@ SET ENABLE_SEQSCAN TO OFF;
        the <filename>postgresql.conf</> file or on the server command line.
        The default value is 10 seconds.
       </para>
+      <para>
+       When <xref linkend="guc-replication-timeout"> is enabled on the primary,
+       <varname>wal_receiver_status_interval</> must be enabled, and its value
+       must be less than the value of <varname>replication_timeout</>.
+      </para>
       </listitem>
      </varlistentry>
 
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 3c7b05b..b6dc8cc 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -56,9 +56,11 @@
  *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
  *		pq_flush		- flush pending output
  *		pq_getbyte_if_available - get a byte if available without blocking
+ *		pq_flush_if_writable	- flush pending output if writable without blocking
  *
  * message-level I/O (and old-style-COPY-OUT cruft):
  *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
+ *		pq_putmessage_noblock - buffer a normal message without blocking (suppressed in COPY OUT mode)
  *		pq_startcopyout - inform libpq that a COPY OUT transfer is beginning
  *		pq_endcopyout	- end a COPY OUT transfer
  *
@@ -92,6 +94,7 @@
 #include "miscadmin.h"
 #include "storage/ipc.h"
 #include "utils/guc.h"
+#include "utils/memutils.h"
 
 /*
  * Configuration options
@@ -108,12 +111,15 @@ static char sock_path[MAXPGPATH];
  * Buffers for low-level I/O
  */
 
-#define PQ_BUFFER_SIZE 8192
+#define PQ_SEND_BUFFER_SIZE 8192
+#define PQ_RECV_BUFFER_SIZE 8192
 
-static char PqSendBuffer[PQ_BUFFER_SIZE];
+static char *PqSendBuffer;
+static int	PqSendBufferSize;
 static int	PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+static int	PqSendStart;		/* Next index to send a byte in PqSendBuffer */
 
-static char PqRecvBuffer[PQ_BUFFER_SIZE];
+static char PqRecvBuffer[PQ_RECV_BUFFER_SIZE];
 static int	PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
 static int	PqRecvLength;		/* End of data available in PqRecvBuffer */
 
@@ -142,7 +148,9 @@ static int	Setup_AF_UNIX(void);
 void
 pq_init(void)
 {
-	PqSendPointer = PqRecvPointer = PqRecvLength = 0;
+	PqSendBufferSize = PQ_SEND_BUFFER_SIZE;
+	PqSendBuffer = MemoryContextAlloc(TopMemoryContext, PqSendBufferSize);
+	PqSendPointer = PqSendStart = PqRecvPointer = PqRecvLength = 0;
 	PqCommBusy = false;
 	DoingCopyOut = false;
 	on_proc_exit(pq_close, 0);
@@ -762,7 +770,7 @@ pq_recvbuf(void)
 		int			r;
 
 		r = secure_read(MyProcPort, PqRecvBuffer + PqRecvLength,
-						PQ_BUFFER_SIZE - PqRecvLength);
+						PQ_RECV_BUFFER_SIZE - PqRecvLength);
 
 		if (r < 0)
 		{
@@ -1138,10 +1146,10 @@ internal_putbytes(const char *s, size_t len)
 	while (len > 0)
 	{
 		/* If buffer is full, then flush it out */
-		if (PqSendPointer >= PQ_BUFFER_SIZE)
+		if (PqSendPointer >= PqSendBufferSize)
 			if (internal_flush())
 				return EOF;
-		amount = PQ_BUFFER_SIZE - PqSendPointer;
+		amount = PqSendBufferSize - PqSendPointer;
 		if (amount > len)
 			amount = len;
 		memcpy(PqSendBuffer + PqSendPointer, s, amount);
@@ -1172,12 +1180,19 @@ pq_flush(void)
 	return res;
 }
 
+/* --------------------------------
+ *		internal_flush - flush pending output
+ *
+ * Returns 0 if OK (meaning everything was sent, or operation would block
+ * and the socket is in non-blocking mode), or EOF if trouble.
+ * --------------------------------
+ */
 static int
 internal_flush(void)
 {
 	static int	last_reported_send_errno = 0;
 
-	char	   *bufptr = PqSendBuffer;
+	char	   *bufptr = PqSendBuffer + PqSendStart;
 	char	   *bufend = PqSendBuffer + PqSendPointer;
 
 	while (bufptr < bufend)
@@ -1192,6 +1207,16 @@ internal_flush(void)
 				continue;		/* Ok if we were interrupted */
 
 			/*
+			 * Ok if no data writable without blocking, and the socket
+			 * is in non-blocking mode.
+			 */
+			if (errno == EAGAIN ||
+				errno == EWOULDBLOCK)
+			{
+				return 0;
+			}
+
+			/*
 			 * Careful: an ereport() that tries to write to the client would
 			 * cause recursion to here, leading to stack overflow and core
 			 * dump!  This message must go *only* to the postmaster log.
@@ -1212,18 +1237,74 @@ internal_flush(void)
 			 * We drop the buffered data anyway so that processing can
 			 * continue, even though we'll probably quit soon.
 			 */
-			PqSendPointer = 0;
+			PqSendStart = PqSendPointer = 0;
 			return EOF;
 		}
 
 		last_reported_send_errno = 0;	/* reset after any successful send */
 		bufptr += r;
+		PqSendStart += r;
 	}
 
-	PqSendPointer = 0;
+	PqSendStart = PqSendPointer = 0;
 	return 0;
 }
 
+/* --------------------------------
+ *		pq_flush_if_writable - flush pending output if writable
+ *
+ * Returns 0 if OK, or EOF if trouble.
+ * --------------------------------
+ */
+int
+pq_flush_if_writable(void)
+{
+	int			res;
+
+	/* Quick exit if nothing to do */
+	if (PqSendPointer == PqSendStart)
+		return 0;
+
+	/* No-op if reentrant call */
+	if (PqCommBusy)
+		return 0;
+
+	PqCommBusy = true;
+
+	/* Temporarily put the socket into non-blocking mode */
+#ifdef WIN32
+	pgwin32_noblock = 1;
+#else
+	if (!pg_set_noblock(MyProcPort->sock))
+		ereport(ERROR,
+				(errmsg("could not set socket to non-blocking mode: %m")));
+#endif
+	MyProcPort->noblock = true;
+
+	res = internal_flush();
+
+#ifdef WIN32
+	pgwin32_noblock = 0;
+#else
+	if (!pg_set_block(MyProcPort->sock))
+		ereport(FATAL,
+				(errmsg("could not set socket to blocking mode: %m")));
+#endif
+	MyProcPort->noblock = false;
+
+	PqCommBusy = false;
+	return res;
+}
+
+/* --------------------------------
+ *		pq_is_send_pending	- is there any pending data in the output buffer?
+ * --------------------------------
+ */
+bool
+pq_is_send_pending(void)
+{
+	return (PqSendStart < PqSendPointer);
+}
 
 /* --------------------------------
  * Message-level I/O routines begin here.
@@ -1286,6 +1367,25 @@ fail:
 }
 
 /* --------------------------------
+ *		pq_putmessage_noblock	- like pq_putmessage, but never blocks
+ *
+ *		If the output buffer is too small to hold the message, the buffer
+ *		is enlarged.
+ */
+int
+pq_putmessage_noblock(char msgtype, const char *s, size_t len)
+{
+	int required = PqSendPointer + len + 5 ;
+	if (required > PqSendBufferSize)
+	{
+		PqSendBuffer = repalloc(PqSendBuffer, required);
+		PqSendBufferSize = required;
+	}
+	return pq_putmessage(msgtype, s, len);
+}
+
+
+/* --------------------------------
  *		pq_startcopyout - inform libpq that an old-style COPY OUT transfer
  *			is beginning
  * --------------------------------
diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index a4f559e..32d0cb5 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -193,19 +193,21 @@ DisownLatch(volatile Latch *latch)
 bool
 WaitLatch(volatile Latch *latch, long timeout)
 {
-	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
+	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
 }
 
 /*
  * Like WaitLatch, but will also return when there's data available in
- * 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
- * was set, or 2 if the scoket became readable.
+ * 'sock' for reading or writing. Returns 0 if timeout was reached,
+ * 1 if the latch was set, 2 if the socket became readable or writable.
  */
 int
-WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
+WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
+				  bool forWrite, long timeout)
 {
 	struct timeval tv, *tvp = NULL;
 	fd_set		input_mask;
+	fd_set		output_mask;
 	int			rc;
 	int			result = 0;
 
@@ -241,14 +243,22 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
 		FD_ZERO(&input_mask);
 		FD_SET(selfpipe_readfd, &input_mask);
 		hifd = selfpipe_readfd;
-		if (sock != PGINVALID_SOCKET)
+		if (sock != PGINVALID_SOCKET && forRead)
 		{
 			FD_SET(sock, &input_mask);
 			if (sock > hifd)
 				hifd = sock;
 		}
 
-		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
+		FD_ZERO(&output_mask);
+		if (sock != PGINVALID_SOCKET && forWrite)
+		{
+			FD_SET(sock, &output_mask);
+			if (sock > hifd)
+				hifd = sock;
+		}
+
+		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
 		if (rc < 0)
 		{
 			if (errno == EINTR)
@@ -263,7 +273,9 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
 			result = 0;
 			break;
 		}
-		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
+		if (sock != PGINVALID_SOCKET &&
+			((forRead && FD_ISSET(sock, &input_mask)) ||
+			 (forWrite && FD_ISSET(sock, &output_mask))))
 		{
 			result = 2;
 			break;		/* data available in socket */
diff --git a/src/backend/port/win32/socket.c b/src/backend/port/win32/socket.c
index 76dd6be..dbbd4a3 100644
--- a/src/backend/port/win32/socket.c
+++ b/src/backend/port/win32/socket.c
@@ -14,7 +14,8 @@
 #include "postgres.h"
 
 /*
- * Indicate if pgwin32_recv() should operate in non-blocking mode.
+ * Indicate if pgwin32_recv() and pgwin32_send() should operate
+ * in non-blocking mode.
  *
  * Since the socket emulation layer always sets the actual socket to
  * non-blocking mode in order to be able to deliver signals, we must
@@ -399,6 +400,16 @@ pgwin32_send(SOCKET s, char *buf, int len, int flags)
 			return -1;
 		}
 
+		if (pgwin32_noblock)
+		{
+			/*
+			 * No data sent, and we are in "emulated non-blocking mode", so
+			 * return indicating that we'd block if we were to continue.
+			 */
+			errno = EWOULDBLOCK;
+			return -1;
+		}
+
 		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
 
 		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
diff --git a/src/backend/port/win32_latch.c b/src/backend/port/win32_latch.c
index ac20c49..f42cfef 100644
--- a/src/backend/port/win32_latch.c
+++ b/src/backend/port/win32_latch.c
@@ -85,11 +85,12 @@ DisownLatch(volatile Latch *latch)
 bool
 WaitLatch(volatile Latch *latch, long timeout)
 {
-	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
+	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
 }
 
 int
-WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
+WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
+				  bool forWrite, long timeout)
 {
 	DWORD		rc;
 	HANDLE		events[3];
@@ -103,10 +104,17 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
 	events[0] = latchevent;
 	events[1] = pgwin32_signal_event;
 	numevents = 2;
-	if (sock != PGINVALID_SOCKET)
+	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
 	{
+		int		flags = 0;
+
+		if (forRead)
+			flags |= FD_READ;
+		if (forWrite)
+			flags |= FD_WRITE;
+
 		sockevent = WSACreateEvent();
-		WSAEventSelect(sock, sockevent, FD_READ);
+		WSAEventSelect(sock, sockevent, flags);
 		events[numevents++] = sockevent;
 	}
 
@@ -139,8 +147,18 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
 			pgwin32_dispatch_queued_signals();
 		else if (rc == WAIT_OBJECT_0 + 2)
 		{
+			WSANETWORKEVENTS resEvents;
+
 			Assert(sock != PGINVALID_SOCKET);
-			result = 2;
+
+			ZeroMemory(&resEvents, sizeof(resEvents));
+			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
+				ereport(FATAL,
+						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
+
+			if ((forRead && resEvents.lNetworkEvents & FD_READ) ||
+				(forWrite && resEvents.lNetworkEvents & FD_WRITE))
+				result = 2;
 			break;
 		}
 		else if (rc != WAIT_OBJECT_0)
@@ -148,7 +166,7 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
 	}
 
 	/* Clean up the handle we created for the socket */
-		if (sock != PGINVALID_SOCKET)
+	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
 	{
 		WSAEventSelect(sock, sockevent, 0);
 		WSACloseEvent(sockevent);
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index f76b5b0..36406d2 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -74,6 +74,7 @@ bool		am_walsender = false;		/* Am I a walsender process ? */
 /* User-settable parameters for walsender */
 int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
 int			WalSndDelay = 1000;	/* max sleep time between some actions */
+int			replication_timeout = 60 * 1000;	/* maximum time to send one WAL data message */
 
 /*
  * These variables are used similarly to openLogFile/Id/Seg/Off,
@@ -95,6 +96,11 @@ static XLogRecPtr sentPtr = {0, 0};
  */
 static StringInfoData reply_message;
 
+/*
+ * Timestamp of the last receipt of the reply from the standby.
+ */
+static TimestampTz last_reply_timestamp;
+
 /* Flags set by signal handlers for later service in main loop */
 static volatile sig_atomic_t got_SIGHUP = false;
 volatile sig_atomic_t walsender_shutdown_requested = false;
@@ -113,7 +119,7 @@ static int	WalSndLoop(void);
 static void InitWalSnd(void);
 static void WalSndHandshake(void);
 static void WalSndKill(int code, Datum arg);
-static bool XLogSend(char *msgbuf, bool *caughtup);
+static void XLogSend(char *msgbuf, bool *caughtup);
 static void IdentifySystem(void);
 static void StartReplication(StartReplicationCmd * cmd);
 static void ProcessStandbyMessage(void);
@@ -469,6 +475,7 @@ ProcessRepliesIfAny(void)
 {
 	unsigned char firstchar;
 	int			r;
+	int		received = false;
 
 	for (;;)
 	{
@@ -484,7 +491,7 @@ ProcessRepliesIfAny(void)
 		if (r == 0)
 		{
 			/* no data available without blocking */
-			return;
+			break;
 		}
 
 		/* Handle the very limited subset of commands expected in this phase */
@@ -495,6 +502,7 @@ ProcessRepliesIfAny(void)
 				 */
 			case 'd':
 				ProcessStandbyMessage();
+				received = true;
 				break;
 
 				/*
@@ -510,6 +518,12 @@ ProcessRepliesIfAny(void)
 								firstchar)));
 		}
 	}
+	/*
+	 * Save the last reply timestamp if we've received at least
+	 * one reply.
+	 */
+	if (received)
+		last_reply_timestamp = GetCurrentTimestamp();
 }
 
 /*
@@ -688,6 +702,9 @@ WalSndLoop(void)
 	 */
 	initStringInfo(&reply_message);
 
+	/* Initialize the last reply timestamp */
+	last_reply_timestamp = GetCurrentTimestamp();
+
 	/* Loop forever, unless we get an error */
 	for (;;)
 	{
@@ -706,19 +723,6 @@ WalSndLoop(void)
 			SyncRepInitConfig();
 		}
 
-		/*
-		 * When SIGUSR2 arrives, we send all outstanding logs up to the
-		 * shutdown checkpoint record (i.e., the latest record) and exit.
-		 */
-		if (walsender_ready_to_stop)
-		{
-			if (!XLogSend(output_message, &caughtup))
-				break;
-			ProcessRepliesIfAny();
-			if (caughtup)
-				walsender_shutdown_requested = true;
-		}
-
 		/* Normal exit from the walsender is here */
 		if (walsender_shutdown_requested)
 		{
@@ -730,11 +734,13 @@ WalSndLoop(void)
 		}
 
 		/*
-		 * If we had sent all accumulated WAL in last round, nap for the
-		 * configured time before retrying.
+		 * If we don't have any pending data in the output buffer, try to
+		 * send some more.
 		 */
-		if (caughtup)
+		if (!pq_is_send_pending())
 		{
+			XLogSend(output_message, &caughtup);
+
 			/*
 			 * Even if we wrote all the WAL that was available when we started
 			 * sending, more might have arrived while we were sending this
@@ -742,28 +748,79 @@ WalSndLoop(void)
 			 * received any signals from that time. Let's arm the latch
 			 * again, and after that check that we're still up-to-date.
 			 */
-			ResetLatch(&MyWalSnd->latch);
-
-			if (!XLogSend(output_message, &caughtup))
-				break;
-			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
+			if (caughtup && !pq_is_send_pending())
 			{
-				/*
-				 * XXX: We don't really need the periodic wakeups anymore,
-				 * WaitLatchOrSocket should reliably wake up as soon as
-				 * something interesting happens.
-				 */
+				ResetLatch(&MyWalSnd->latch);
 
-				/* Sleep */
-				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
-								  WalSndDelay * 1000L);
+				XLogSend(output_message, &caughtup);
 			}
 		}
-		else
+
+		/* Flush pending output to the client */
+		if (pq_flush_if_writable() != 0)
+			break;
+
+		/*
+		 * When SIGUSR2 arrives, we send any outstanding logs up to the
+		 * shutdown checkpoint record (i.e., the latest record) and exit.
+		 */
+		if (walsender_ready_to_stop && !pq_is_send_pending())
 		{
-			/* Attempt to send the log once every loop */
-			if (!XLogSend(output_message, &caughtup))
+			XLogSend(output_message, &caughtup);
+			ProcessRepliesIfAny();
+			if (caughtup && !pq_is_send_pending())
+				walsender_shutdown_requested = true;
+		}
+
+		if ((caughtup || pq_is_send_pending()) &&
+			!got_SIGHUP &&
+			!walsender_shutdown_requested)
+		{
+			TimestampTz	finish_time;
+			long		sleeptime;
+
+			/* Reschedule replication timeout */
+			if (replication_timeout > 0)
+			{
+				long		secs;
+				int		usecs;
+
+				finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
+														  replication_timeout);
+				TimestampDifference(GetCurrentTimestamp(),
+									finish_time, &secs, &usecs);
+				sleeptime = secs * 1000 + usecs / 1000;
+				if (WalSndDelay < sleeptime)
+					sleeptime = WalSndDelay;
+			}
+			else
+			{
+				/*
+				 * XXX: Without timeout, we don't really need the periodic
+				 * wakeups anymore, WaitLatchOrSocket should reliably wake up
+				 * as soon as something interesting happens.
+				 */
+				sleeptime = WalSndDelay;
+			}
+
+			/* Sleep */
+			WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
+							  true, pq_is_send_pending(),
+							  sleeptime * 1000L);
+
+			/* Check for replication timeout */
+			if (replication_timeout > 0 &&
+				GetCurrentTimestamp() >= finish_time)
+			{
+				/*
+				 * Since typically expiration of replication timeout means
+				 * communication problem, we don't send the error message
+				 * to the standby.
+				 */
+				ereport(COMMERROR,
+						(errmsg("terminating walsender process due to replication timeout")));
 				break;
+			}
 		}
 
 		/*
@@ -993,7 +1050,8 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
 
 /*
  * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
- * but not yet sent to the client, and send it.
+ * but not yet sent to the client, and buffer it in the libpq output
+ * buffer.
  *
  * msgbuf is a work area in which the output message is constructed.  It's
  * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
@@ -1001,10 +1059,9 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
  *
  * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
  * *caughtup is set to false.
- *
- * Returns true if OK, false if trouble.
+
  */
-static bool
+static void
 XLogSend(char *msgbuf, bool *caughtup)
 {
 	XLogRecPtr	SendRqstPtr;
@@ -1027,7 +1084,7 @@ XLogSend(char *msgbuf, bool *caughtup)
 	if (XLByteLE(SendRqstPtr, sentPtr))
 	{
 		*caughtup = true;
-		return true;
+		return;
 	}
 
 	/*
@@ -1099,11 +1156,7 @@ XLogSend(char *msgbuf, bool *caughtup)
 
 	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
 
-	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
-
-	/* Flush pending output to the client */
-	if (pq_flush())
-		return false;
+	pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
 
 	sentPtr = endptr;
 
@@ -1127,7 +1180,7 @@ XLogSend(char *msgbuf, bool *caughtup)
 		set_ps_display(activitymsg, false);
 	}
 
-	return true;
+	return;
 }
 
 /* SIGHUP: set flag to re-read config file at next convenient time */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 9ca1329..b49bdae 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1856,6 +1856,16 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
+			gettext_noop("Sets the maximum time to wait for WAL replication."),
+			NULL,
+			GUC_UNIT_MS
+		},
+		&replication_timeout,
+		60 * 1000, 0, INT_MAX, NULL, NULL
+	},
+
+	{
 		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
 			gettext_noop("Sets the delay in microseconds between transaction commit and "
 						 "flushing WAL to disk."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ed70223..4348185 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -200,6 +200,7 @@
 #wal_sender_delay = 1s		# walsender cycle time, 1-10000 milliseconds
 #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
 #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+#replication_timeout = 60s # in milliseconds, 0 is disabled
 
 # - Standby Servers -
 
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index 8ecab6d..b20b0c2 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -60,7 +60,10 @@ extern int	pq_peekbyte(void);
 extern int	pq_getbyte_if_available(unsigned char *c);
 extern int	pq_putbytes(const char *s, size_t len);
 extern int	pq_flush(void);
+extern int	pq_flush_if_writable(void);
+extern bool	pq_is_send_pending(void);
 extern int	pq_putmessage(char msgtype, const char *s, size_t len);
+extern int	pq_putmessage_noblock(char msgtype, const char *s, size_t len);
 extern void pq_startcopyout(void);
 extern void pq_endcopyout(bool errorAbort);
 
diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h
index 150a71f..2670a2e 100644
--- a/src/include/replication/walsender.h
+++ b/src/include/replication/walsender.h
@@ -98,6 +98,7 @@ extern volatile sig_atomic_t walsender_ready_to_stop;
 /* user-settable parameters */
 extern int	WalSndDelay;
 extern int	max_wal_senders;
+extern int	replication_timeout;
 
 extern int	WalSenderMain(void);
 extern void WalSndSignals(void);
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 31744ff..f64e13b 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -40,7 +40,7 @@ extern void OwnLatch(volatile Latch *latch);
 extern void DisownLatch(volatile Latch *latch);
 extern bool WaitLatch(volatile Latch *latch, long timeout);
 extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
-				  long timeout);
+				  bool forRead, bool forWrite, long timeout);
 extern void SetLatch(volatile Latch *latch);
 extern void ResetLatch(volatile Latch *latch);
 #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
#34Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#33)
Re: Replication server timeout patch

On Wed, Mar 23, 2011 at 7:33 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

I don't much like the API for this. Walsender shouldn't need to know about
the details of the FE/BE protocol, pq_putbytes_if_available() seems too low
level to be useful.

I think a better API would be to have a non-blocking version of
pq_putmessage(). We can make the output buffer in pqcomm.c resizeable, so
that when the message doesn't fit in the output buffer in pq_putmessage(),
the buffer is enlarged instead of trying to flush it.

Attached is a patch using that approach. This is a much smaller patch, and
easier to understand.

Agreed. Thanks for improving the patch.

pq_flush_if_writable() calls internal_flush() without using PG_TRY block.
This seems unsafe because for example pgwin32_waitforsinglesocket()
called by secure_write() can throw ERROR.

I'm not totally happy with the walsender main loop, it
seems to work as it is, but the logic has become quite complicated. Ideas
welcome on how to simplify that.

As the patch I proposed before did, how about leaving XLogSend() instead
of WalSndLoop() to call pq_flush_if_writable() when there is pending data
in output buffer?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#35Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#33)
Re: Replication server timeout patch

On Wed, Mar 23, 2011 at 6:33 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 16.03.2011 11:11, Fujii Masao wrote:

On Wed, Mar 16, 2011 at 4:49 PM, Fujii Masao<masao.fujii@gmail.com>
 wrote:

Agreed. I'll change the patch.

Done. I attached the updated patch.

I don't much like the API for this. Walsender shouldn't need to know about
the details of the FE/BE protocol, pq_putbytes_if_available() seems too low
level to be useful.

I think a better API would be to have a non-blocking version of
pq_putmessage(). We can make the output buffer in pqcomm.c resizeable, so
that when the message doesn't fit in the output buffer in pq_putmessage(),
the buffer is enlarged instead of trying to flush it.

Attached is a patch using that approach. This is a much smaller patch, and
easier to understand. I'm not totally happy with the walsender main loop, it
seems to work as it is, but the logic has become quite complicated. Ideas
welcome on how to simplify that.

Heikki, are you planning to commit this, either with or without
further revisions?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#36Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#34)
1 attachment(s)
Re: Replication server timeout patch

On 24.03.2011 15:24, Fujii Masao wrote:

On Wed, Mar 23, 2011 at 7:33 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

I don't much like the API for this. Walsender shouldn't need to know about
the details of the FE/BE protocol, pq_putbytes_if_available() seems too low
level to be useful.

I think a better API would be to have a non-blocking version of
pq_putmessage(). We can make the output buffer in pqcomm.c resizeable, so
that when the message doesn't fit in the output buffer in pq_putmessage(),
the buffer is enlarged instead of trying to flush it.

Attached is a patch using that approach. This is a much smaller patch, and
easier to understand.

Agreed. Thanks for improving the patch.

pq_flush_if_writable() calls internal_flush() without using PG_TRY block.
This seems unsafe because for example pgwin32_waitforsinglesocket()
called by secure_write() can throw ERROR.

Perhaps it's time to give up on the assumption that the socket is in
blocking mode except within those two functions. Attached patch adds the
pq_set_nonblocking() function from your patch, and adds calls to it
before all secure_read/write operations to put the socket in the right
mode. There's only a few of those operations.

Should we use COMMERROR instead of ERROR if we fail to put the socket in
the right mode?

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

Attachments:

replication_timeout_v8.patchtext/x-diff; name=replication_timeout_v8.patchDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e0ebee6..3192ef7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2019,6 +2019,28 @@ SET ENABLE_SEQSCAN TO OFF;
        </para>
       </listitem>
      </varlistentry>
+
+     <varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
+      <term><varname>replication_timeout</varname> (<type>integer</type>)</term>
+      <indexterm>
+       <primary><varname>replication_timeout</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Specifies the maximum time, in milliseconds, to wait for the reply
+        from the standby before terminating replication.  This is useful for
+        the primary server to detect the standby crash or network outage.
+        A value of zero turns this off.  This parameter can only be set in
+        the <filename>postgresql.conf</> file or on the server command line.
+        The default value is 60 seconds.
+       </para>
+       <para>
+        To make the timeout work properly, <xref linkend="guc-wal-receiver-status-interval">
+        must be enabled on the standby, and its value must be less than the
+        value of <varname>replication_timeout</>.
+       </para>
+      </listitem>
+     </varlistentry>
      </variablelist>
     </sect2>
 
@@ -2216,6 +2238,11 @@ SET ENABLE_SEQSCAN TO OFF;
        the <filename>postgresql.conf</> file or on the server command line.
        The default value is 10 seconds.
       </para>
+      <para>
+       When <xref linkend="guc-replication-timeout"> is enabled on the primary,
+       <varname>wal_receiver_status_interval</> must be enabled, and its value
+       must be less than the value of <varname>replication_timeout</>.
+      </para>
       </listitem>
      </varlistentry>
 
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 3c7b05b..db313a8 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -55,10 +55,12 @@
  *		pq_peekbyte		- peek at next byte from connection
  *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
  *		pq_flush		- flush pending output
+ *		pq_flush_if_writable - flush pending output if writable without blocking
  *		pq_getbyte_if_available - get a byte if available without blocking
  *
  * message-level I/O (and old-style-COPY-OUT cruft):
  *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
+ *		pq_putmessage_noblock - buffer a normal message without blocking (suppressed in COPY OUT mode)
  *		pq_startcopyout - inform libpq that a COPY OUT transfer is beginning
  *		pq_endcopyout	- end a COPY OUT transfer
  *
@@ -92,6 +94,7 @@
 #include "miscadmin.h"
 #include "storage/ipc.h"
 #include "utils/guc.h"
+#include "utils/memutils.h"
 
 /*
  * Configuration options
@@ -105,15 +108,21 @@ static char sock_path[MAXPGPATH];
 
 
 /*
- * Buffers for low-level I/O
+ * Buffers for low-level I/O.
+ *
+ * The receive buffer is fixed size. Send buffer is usually 8k, but can be
+ * enlarged by pq_putmessage_noblock() if the message doesn't fit otherwise.
  */
 
-#define PQ_BUFFER_SIZE 8192
+#define PQ_SEND_BUFFER_SIZE 8192
+#define PQ_RECV_BUFFER_SIZE 8192
 
-static char PqSendBuffer[PQ_BUFFER_SIZE];
+static char *PqSendBuffer;
+static int	PqSendBufferSize;	/* Size send buffer */
 static int	PqSendPointer;		/* Next index to store a byte in PqSendBuffer */
+static int	PqSendStart;		/* Next index to send a byte in PqSendBuffer */
 
-static char PqRecvBuffer[PQ_BUFFER_SIZE];
+static char PqRecvBuffer[PQ_RECV_BUFFER_SIZE];
 static int	PqRecvPointer;		/* Next index to read a byte from PqRecvBuffer */
 static int	PqRecvLength;		/* End of data available in PqRecvBuffer */
 
@@ -128,6 +137,7 @@ static bool DoingCopyOut;
 static void pq_close(int code, Datum arg);
 static int	internal_putbytes(const char *s, size_t len);
 static int	internal_flush(void);
+static void pq_set_nonblocking(bool nonblocking);
 
 #ifdef HAVE_UNIX_SOCKETS
 static int	Lock_AF_UNIX(unsigned short portNumber, char *unixSocketName);
@@ -142,7 +152,9 @@ static int	Setup_AF_UNIX(void);
 void
 pq_init(void)
 {
-	PqSendPointer = PqRecvPointer = PqRecvLength = 0;
+	PqSendBufferSize = PQ_SEND_BUFFER_SIZE;
+	PqSendBuffer = MemoryContextAlloc(TopMemoryContext, PqSendBufferSize);
+	PqSendPointer = PqSendStart = PqRecvPointer = PqRecvLength = 0;
 	PqCommBusy = false;
 	DoingCopyOut = false;
 	on_proc_exit(pq_close, 0);
@@ -194,6 +206,7 @@ pq_close(int code, Datum arg)
 #endif   /* ENABLE_GSS || ENABLE_SSPI */
 
 		/* Cleanly shut down SSL layer */
+		pq_set_nonblocking(false); /* XXX: Is this required? */
 		secure_close(MyProcPort);
 
 		/*
@@ -732,6 +745,37 @@ TouchSocketFile(void)
  * --------------------------------
  */
 
+/* --------------------------------
+ *            pq_set_nonblocking - set socket blocking/non-blocking
+ *
+ * Sets the socket non-blocking if nonblocking is TRUE, or sets it
+ * blocking otherwise.
+ * --------------------------------
+ */
+static void
+pq_set_nonblocking(bool nonblocking)
+{
+	if (MyProcPort->noblock == nonblocking)
+		return;
+
+#ifdef WIN32
+	pgwin32_noblock = nonblocking ? 1 : 0;
+#else
+	if (nonblocking)
+	{
+		if (!pg_set_noblock(MyProcPort->sock))
+			ereport(ERROR,
+					(errmsg("could not set socket to non-blocking mode: %m")));
+	}
+	else
+	{
+		if (!pg_set_block(MyProcPort->sock))
+			ereport(ERROR,
+					(errmsg("could not set socket to blocking mode: %m")));
+	}
+#endif
+	MyProcPort->noblock = nonblocking;
+}
 
 /* --------------------------------
  *		pq_recvbuf - load some bytes into the input buffer
@@ -756,13 +800,15 @@ pq_recvbuf(void)
 			PqRecvLength = PqRecvPointer = 0;
 	}
 
+	pq_set_nonblocking(false);
+
 	/* Can fill buffer from PqRecvLength and upwards */
 	for (;;)
 	{
 		int			r;
 
 		r = secure_read(MyProcPort, PqRecvBuffer + PqRecvLength,
-						PQ_BUFFER_SIZE - PqRecvLength);
+						PQ_RECV_BUFFER_SIZE - PqRecvLength);
 
 		if (r < 0)
 		{
@@ -825,7 +871,6 @@ pq_peekbyte(void)
 	return (unsigned char) PqRecvBuffer[PqRecvPointer];
 }
 
-
 /* --------------------------------
  *		pq_getbyte_if_available - get a single byte from connection,
  *			if available
@@ -845,72 +890,38 @@ pq_getbyte_if_available(unsigned char *c)
 		return 1;
 	}
 
-	/* Temporarily put the socket into non-blocking mode */
-#ifdef WIN32
-	pgwin32_noblock = 1;
-#else
-	if (!pg_set_noblock(MyProcPort->sock))
-		ereport(ERROR,
-				(errmsg("could not set socket to non-blocking mode: %m")));
-#endif
-	MyProcPort->noblock = true;
-	PG_TRY();
+	/* Put the socket into non-blocking mode */
+	pq_set_nonblocking(true);
+
+	r = secure_read(MyProcPort, c, 1);
+	if (r < 0)
 	{
-		r = secure_read(MyProcPort, c, 1);
-		if (r < 0)
+		/*
+		 * Ok if no data available without blocking or interrupted (though
+		 * EINTR really shouldn't happen with a non-blocking socket).
+		 * Report other errors.
+		 */
+		if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
+			r = 0;
+		else
 		{
 			/*
-			 * Ok if no data available without blocking or interrupted (though
-			 * EINTR really shouldn't happen with a non-blocking socket).
-			 * Report other errors.
+			 * Careful: an ereport() that tries to write to the client
+			 * would cause recursion to here, leading to stack overflow
+			 * and core dump!  This message must go *only* to the
+			 * postmaster log.
 			 */
-			if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
-				r = 0;
-			else
-			{
-				/*
-				 * Careful: an ereport() that tries to write to the client
-				 * would cause recursion to here, leading to stack overflow
-				 * and core dump!  This message must go *only* to the
-				 * postmaster log.
-				 */
-				ereport(COMMERROR,
-						(errcode_for_socket_access(),
-						 errmsg("could not receive data from client: %m")));
-				r = EOF;
-			}
-		}
-		else if (r == 0)
-		{
-			/* EOF detected */
+			ereport(COMMERROR,
+					(errcode_for_socket_access(),
+					 errmsg("could not receive data from client: %m")));
 			r = EOF;
 		}
 	}
-	PG_CATCH();
+	else if (r == 0)
 	{
-		/*
-		 * The rest of the backend code assumes the socket is in blocking
-		 * mode, so treat failure as FATAL.
-		 */
-#ifdef WIN32
-		pgwin32_noblock = 0;
-#else
-		if (!pg_set_block(MyProcPort->sock))
-			ereport(FATAL,
-					(errmsg("could not set socket to blocking mode: %m")));
-#endif
-		MyProcPort->noblock = false;
-		PG_RE_THROW();
+		/* EOF detected */
+		r = EOF;
 	}
-	PG_END_TRY();
-#ifdef WIN32
-	pgwin32_noblock = 0;
-#else
-	if (!pg_set_block(MyProcPort->sock))
-		ereport(FATAL,
-				(errmsg("could not set socket to blocking mode: %m")));
-#endif
-	MyProcPort->noblock = false;
 
 	return r;
 }
@@ -1138,10 +1149,13 @@ internal_putbytes(const char *s, size_t len)
 	while (len > 0)
 	{
 		/* If buffer is full, then flush it out */
-		if (PqSendPointer >= PQ_BUFFER_SIZE)
+		if (PqSendPointer >= PqSendBufferSize)
+		{
+			pq_set_nonblocking(false);
 			if (internal_flush())
 				return EOF;
-		amount = PQ_BUFFER_SIZE - PqSendPointer;
+		}
+		amount = PqSendBufferSize - PqSendPointer;
 		if (amount > len)
 			amount = len;
 		memcpy(PqSendBuffer + PqSendPointer, s, amount);
@@ -1167,17 +1181,25 @@ pq_flush(void)
 	if (PqCommBusy)
 		return 0;
 	PqCommBusy = true;
+	pq_set_nonblocking(false);
 	res = internal_flush();
 	PqCommBusy = false;
 	return res;
 }
 
+/* --------------------------------
+ *		internal_flush - flush pending output
+ *
+ * Returns 0 if OK (meaning everything was sent, or operation would block
+ * and the socket is in non-blocking mode), or EOF if trouble.
+ * --------------------------------
+ */
 static int
 internal_flush(void)
 {
 	static int	last_reported_send_errno = 0;
 
-	char	   *bufptr = PqSendBuffer;
+	char	   *bufptr = PqSendBuffer + PqSendStart;
 	char	   *bufend = PqSendBuffer + PqSendPointer;
 
 	while (bufptr < bufend)
@@ -1192,6 +1214,16 @@ internal_flush(void)
 				continue;		/* Ok if we were interrupted */
 
 			/*
+			 * Ok if no data writable without blocking, and the socket
+			 * is in non-blocking mode.
+			 */
+			if (errno == EAGAIN ||
+				errno == EWOULDBLOCK)
+			{
+				return 0;
+			}
+
+			/*
 			 * Careful: an ereport() that tries to write to the client would
 			 * cause recursion to here, leading to stack overflow and core
 			 * dump!  This message must go *only* to the postmaster log.
@@ -1212,18 +1244,56 @@ internal_flush(void)
 			 * We drop the buffered data anyway so that processing can
 			 * continue, even though we'll probably quit soon.
 			 */
-			PqSendPointer = 0;
+			PqSendStart = PqSendPointer = 0;
 			return EOF;
 		}
 
 		last_reported_send_errno = 0;	/* reset after any successful send */
 		bufptr += r;
+		PqSendStart += r;
 	}
 
-	PqSendPointer = 0;
+	PqSendStart = PqSendPointer = 0;
 	return 0;
 }
 
+/* --------------------------------
+ *		pq_flush_if_writable - flush pending output if writable without blocking
+ *
+ * Returns 0 if OK, or EOF if trouble.
+ * --------------------------------
+ */
+int
+pq_flush_if_writable(void)
+{
+	int			res;
+
+	/* Quick exit if nothing to do */
+	if (PqSendPointer == PqSendStart)
+		return 0;
+
+	/* No-op if reentrant call */
+	if (PqCommBusy)
+		return 0;
+
+	/* Temporarily put the socket into non-blocking mode */
+	pq_set_nonblocking(true);
+
+	PqCommBusy = true;
+	res = internal_flush();
+	PqCommBusy = false;
+	return res;
+}
+
+/* --------------------------------
+ *		pq_is_send_pending	- is there any pending data in the output buffer?
+ * --------------------------------
+ */
+bool
+pq_is_send_pending(void)
+{
+	return (PqSendStart < PqSendPointer);
+}
 
 /* --------------------------------
  * Message-level I/O routines begin here.
@@ -1286,6 +1356,25 @@ fail:
 }
 
 /* --------------------------------
+ *		pq_putmessage_noblock	- like pq_putmessage, but never blocks
+ *
+ *		If the output buffer is too small to hold the message, the buffer
+ *		is enlarged.
+ */
+int
+pq_putmessage_noblock(char msgtype, const char *s, size_t len)
+{
+	int required = PqSendPointer + len + 5 ;
+	if (required > PqSendBufferSize)
+	{
+		PqSendBuffer = repalloc(PqSendBuffer, required);
+		PqSendBufferSize = required;
+	}
+	return pq_putmessage(msgtype, s, len);
+}
+
+
+/* --------------------------------
  *		pq_startcopyout - inform libpq that an old-style COPY OUT transfer
  *			is beginning
  * --------------------------------
diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index a4f559e..32d0cb5 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -193,19 +193,21 @@ DisownLatch(volatile Latch *latch)
 bool
 WaitLatch(volatile Latch *latch, long timeout)
 {
-	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
+	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
 }
 
 /*
  * Like WaitLatch, but will also return when there's data available in
- * 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
- * was set, or 2 if the scoket became readable.
+ * 'sock' for reading or writing. Returns 0 if timeout was reached,
+ * 1 if the latch was set, 2 if the socket became readable or writable.
  */
 int
-WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
+WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
+				  bool forWrite, long timeout)
 {
 	struct timeval tv, *tvp = NULL;
 	fd_set		input_mask;
+	fd_set		output_mask;
 	int			rc;
 	int			result = 0;
 
@@ -241,14 +243,22 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
 		FD_ZERO(&input_mask);
 		FD_SET(selfpipe_readfd, &input_mask);
 		hifd = selfpipe_readfd;
-		if (sock != PGINVALID_SOCKET)
+		if (sock != PGINVALID_SOCKET && forRead)
 		{
 			FD_SET(sock, &input_mask);
 			if (sock > hifd)
 				hifd = sock;
 		}
 
-		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
+		FD_ZERO(&output_mask);
+		if (sock != PGINVALID_SOCKET && forWrite)
+		{
+			FD_SET(sock, &output_mask);
+			if (sock > hifd)
+				hifd = sock;
+		}
+
+		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
 		if (rc < 0)
 		{
 			if (errno == EINTR)
@@ -263,7 +273,9 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
 			result = 0;
 			break;
 		}
-		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
+		if (sock != PGINVALID_SOCKET &&
+			((forRead && FD_ISSET(sock, &input_mask)) ||
+			 (forWrite && FD_ISSET(sock, &output_mask))))
 		{
 			result = 2;
 			break;		/* data available in socket */
diff --git a/src/backend/port/win32/socket.c b/src/backend/port/win32/socket.c
index 76dd6be..dbbd4a3 100644
--- a/src/backend/port/win32/socket.c
+++ b/src/backend/port/win32/socket.c
@@ -14,7 +14,8 @@
 #include "postgres.h"
 
 /*
- * Indicate if pgwin32_recv() should operate in non-blocking mode.
+ * Indicate if pgwin32_recv() and pgwin32_send() should operate
+ * in non-blocking mode.
  *
  * Since the socket emulation layer always sets the actual socket to
  * non-blocking mode in order to be able to deliver signals, we must
@@ -399,6 +400,16 @@ pgwin32_send(SOCKET s, char *buf, int len, int flags)
 			return -1;
 		}
 
+		if (pgwin32_noblock)
+		{
+			/*
+			 * No data sent, and we are in "emulated non-blocking mode", so
+			 * return indicating that we'd block if we were to continue.
+			 */
+			errno = EWOULDBLOCK;
+			return -1;
+		}
+
 		/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
 
 		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
diff --git a/src/backend/port/win32_latch.c b/src/backend/port/win32_latch.c
index ac20c49..f42cfef 100644
--- a/src/backend/port/win32_latch.c
+++ b/src/backend/port/win32_latch.c
@@ -85,11 +85,12 @@ DisownLatch(volatile Latch *latch)
 bool
 WaitLatch(volatile Latch *latch, long timeout)
 {
-	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
+	return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
 }
 
 int
-WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
+WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
+				  bool forWrite, long timeout)
 {
 	DWORD		rc;
 	HANDLE		events[3];
@@ -103,10 +104,17 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
 	events[0] = latchevent;
 	events[1] = pgwin32_signal_event;
 	numevents = 2;
-	if (sock != PGINVALID_SOCKET)
+	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
 	{
+		int		flags = 0;
+
+		if (forRead)
+			flags |= FD_READ;
+		if (forWrite)
+			flags |= FD_WRITE;
+
 		sockevent = WSACreateEvent();
-		WSAEventSelect(sock, sockevent, FD_READ);
+		WSAEventSelect(sock, sockevent, flags);
 		events[numevents++] = sockevent;
 	}
 
@@ -139,8 +147,18 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
 			pgwin32_dispatch_queued_signals();
 		else if (rc == WAIT_OBJECT_0 + 2)
 		{
+			WSANETWORKEVENTS resEvents;
+
 			Assert(sock != PGINVALID_SOCKET);
-			result = 2;
+
+			ZeroMemory(&resEvents, sizeof(resEvents));
+			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
+				ereport(FATAL,
+						(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
+
+			if ((forRead && resEvents.lNetworkEvents & FD_READ) ||
+				(forWrite && resEvents.lNetworkEvents & FD_WRITE))
+				result = 2;
 			break;
 		}
 		else if (rc != WAIT_OBJECT_0)
@@ -148,7 +166,7 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
 	}
 
 	/* Clean up the handle we created for the socket */
-		if (sock != PGINVALID_SOCKET)
+	if (sock != PGINVALID_SOCKET && (forRead || forWrite))
 	{
 		WSAEventSelect(sock, sockevent, 0);
 		WSACloseEvent(sockevent);
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index f76b5b0..36406d2 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -74,6 +74,7 @@ bool		am_walsender = false;		/* Am I a walsender process ? */
 /* User-settable parameters for walsender */
 int			max_wal_senders = 0;	/* the maximum number of concurrent walsenders */
 int			WalSndDelay = 1000;	/* max sleep time between some actions */
+int			replication_timeout = 60 * 1000;	/* maximum time to send one WAL data message */
 
 /*
  * These variables are used similarly to openLogFile/Id/Seg/Off,
@@ -95,6 +96,11 @@ static XLogRecPtr sentPtr = {0, 0};
  */
 static StringInfoData reply_message;
 
+/*
+ * Timestamp of the last receipt of the reply from the standby.
+ */
+static TimestampTz last_reply_timestamp;
+
 /* Flags set by signal handlers for later service in main loop */
 static volatile sig_atomic_t got_SIGHUP = false;
 volatile sig_atomic_t walsender_shutdown_requested = false;
@@ -113,7 +119,7 @@ static int	WalSndLoop(void);
 static void InitWalSnd(void);
 static void WalSndHandshake(void);
 static void WalSndKill(int code, Datum arg);
-static bool XLogSend(char *msgbuf, bool *caughtup);
+static void XLogSend(char *msgbuf, bool *caughtup);
 static void IdentifySystem(void);
 static void StartReplication(StartReplicationCmd * cmd);
 static void ProcessStandbyMessage(void);
@@ -469,6 +475,7 @@ ProcessRepliesIfAny(void)
 {
 	unsigned char firstchar;
 	int			r;
+	int		received = false;
 
 	for (;;)
 	{
@@ -484,7 +491,7 @@ ProcessRepliesIfAny(void)
 		if (r == 0)
 		{
 			/* no data available without blocking */
-			return;
+			break;
 		}
 
 		/* Handle the very limited subset of commands expected in this phase */
@@ -495,6 +502,7 @@ ProcessRepliesIfAny(void)
 				 */
 			case 'd':
 				ProcessStandbyMessage();
+				received = true;
 				break;
 
 				/*
@@ -510,6 +518,12 @@ ProcessRepliesIfAny(void)
 								firstchar)));
 		}
 	}
+	/*
+	 * Save the last reply timestamp if we've received at least
+	 * one reply.
+	 */
+	if (received)
+		last_reply_timestamp = GetCurrentTimestamp();
 }
 
 /*
@@ -688,6 +702,9 @@ WalSndLoop(void)
 	 */
 	initStringInfo(&reply_message);
 
+	/* Initialize the last reply timestamp */
+	last_reply_timestamp = GetCurrentTimestamp();
+
 	/* Loop forever, unless we get an error */
 	for (;;)
 	{
@@ -706,19 +723,6 @@ WalSndLoop(void)
 			SyncRepInitConfig();
 		}
 
-		/*
-		 * When SIGUSR2 arrives, we send all outstanding logs up to the
-		 * shutdown checkpoint record (i.e., the latest record) and exit.
-		 */
-		if (walsender_ready_to_stop)
-		{
-			if (!XLogSend(output_message, &caughtup))
-				break;
-			ProcessRepliesIfAny();
-			if (caughtup)
-				walsender_shutdown_requested = true;
-		}
-
 		/* Normal exit from the walsender is here */
 		if (walsender_shutdown_requested)
 		{
@@ -730,11 +734,13 @@ WalSndLoop(void)
 		}
 
 		/*
-		 * If we had sent all accumulated WAL in last round, nap for the
-		 * configured time before retrying.
+		 * If we don't have any pending data in the output buffer, try to
+		 * send some more.
 		 */
-		if (caughtup)
+		if (!pq_is_send_pending())
 		{
+			XLogSend(output_message, &caughtup);
+
 			/*
 			 * Even if we wrote all the WAL that was available when we started
 			 * sending, more might have arrived while we were sending this
@@ -742,28 +748,79 @@ WalSndLoop(void)
 			 * received any signals from that time. Let's arm the latch
 			 * again, and after that check that we're still up-to-date.
 			 */
-			ResetLatch(&MyWalSnd->latch);
-
-			if (!XLogSend(output_message, &caughtup))
-				break;
-			if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
+			if (caughtup && !pq_is_send_pending())
 			{
-				/*
-				 * XXX: We don't really need the periodic wakeups anymore,
-				 * WaitLatchOrSocket should reliably wake up as soon as
-				 * something interesting happens.
-				 */
+				ResetLatch(&MyWalSnd->latch);
 
-				/* Sleep */
-				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
-								  WalSndDelay * 1000L);
+				XLogSend(output_message, &caughtup);
 			}
 		}
-		else
+
+		/* Flush pending output to the client */
+		if (pq_flush_if_writable() != 0)
+			break;
+
+		/*
+		 * When SIGUSR2 arrives, we send any outstanding logs up to the
+		 * shutdown checkpoint record (i.e., the latest record) and exit.
+		 */
+		if (walsender_ready_to_stop && !pq_is_send_pending())
 		{
-			/* Attempt to send the log once every loop */
-			if (!XLogSend(output_message, &caughtup))
+			XLogSend(output_message, &caughtup);
+			ProcessRepliesIfAny();
+			if (caughtup && !pq_is_send_pending())
+				walsender_shutdown_requested = true;
+		}
+
+		if ((caughtup || pq_is_send_pending()) &&
+			!got_SIGHUP &&
+			!walsender_shutdown_requested)
+		{
+			TimestampTz	finish_time;
+			long		sleeptime;
+
+			/* Reschedule replication timeout */
+			if (replication_timeout > 0)
+			{
+				long		secs;
+				int		usecs;
+
+				finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
+														  replication_timeout);
+				TimestampDifference(GetCurrentTimestamp(),
+									finish_time, &secs, &usecs);
+				sleeptime = secs * 1000 + usecs / 1000;
+				if (WalSndDelay < sleeptime)
+					sleeptime = WalSndDelay;
+			}
+			else
+			{
+				/*
+				 * XXX: Without timeout, we don't really need the periodic
+				 * wakeups anymore, WaitLatchOrSocket should reliably wake up
+				 * as soon as something interesting happens.
+				 */
+				sleeptime = WalSndDelay;
+			}
+
+			/* Sleep */
+			WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
+							  true, pq_is_send_pending(),
+							  sleeptime * 1000L);
+
+			/* Check for replication timeout */
+			if (replication_timeout > 0 &&
+				GetCurrentTimestamp() >= finish_time)
+			{
+				/*
+				 * Since typically expiration of replication timeout means
+				 * communication problem, we don't send the error message
+				 * to the standby.
+				 */
+				ereport(COMMERROR,
+						(errmsg("terminating walsender process due to replication timeout")));
 				break;
+			}
 		}
 
 		/*
@@ -993,7 +1050,8 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
 
 /*
  * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
- * but not yet sent to the client, and send it.
+ * but not yet sent to the client, and buffer it in the libpq output
+ * buffer.
  *
  * msgbuf is a work area in which the output message is constructed.  It's
  * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
@@ -1001,10 +1059,9 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
  *
  * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
  * *caughtup is set to false.
- *
- * Returns true if OK, false if trouble.
+
  */
-static bool
+static void
 XLogSend(char *msgbuf, bool *caughtup)
 {
 	XLogRecPtr	SendRqstPtr;
@@ -1027,7 +1084,7 @@ XLogSend(char *msgbuf, bool *caughtup)
 	if (XLByteLE(SendRqstPtr, sentPtr))
 	{
 		*caughtup = true;
-		return true;
+		return;
 	}
 
 	/*
@@ -1099,11 +1156,7 @@ XLogSend(char *msgbuf, bool *caughtup)
 
 	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
 
-	pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
-
-	/* Flush pending output to the client */
-	if (pq_flush())
-		return false;
+	pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
 
 	sentPtr = endptr;
 
@@ -1127,7 +1180,7 @@ XLogSend(char *msgbuf, bool *caughtup)
 		set_ps_display(activitymsg, false);
 	}
 
-	return true;
+	return;
 }
 
 /* SIGHUP: set flag to re-read config file at next convenient time */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 9ca1329..b49bdae 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1856,6 +1856,16 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
+		{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
+			gettext_noop("Sets the maximum time to wait for WAL replication."),
+			NULL,
+			GUC_UNIT_MS
+		},
+		&replication_timeout,
+		60 * 1000, 0, INT_MAX, NULL, NULL
+	},
+
+	{
 		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
 			gettext_noop("Sets the delay in microseconds between transaction commit and "
 						 "flushing WAL to disk."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ed70223..4348185 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -200,6 +200,7 @@
 #wal_sender_delay = 1s		# walsender cycle time, 1-10000 milliseconds
 #wal_keep_segments = 0		# in logfile segments, 16MB each; 0 disables
 #vacuum_defer_cleanup_age = 0	# number of xacts by which cleanup is delayed
+#replication_timeout = 60s # in milliseconds, 0 is disabled
 
 # - Standby Servers -
 
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index 8ecab6d..b20b0c2 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -60,7 +60,10 @@ extern int	pq_peekbyte(void);
 extern int	pq_getbyte_if_available(unsigned char *c);
 extern int	pq_putbytes(const char *s, size_t len);
 extern int	pq_flush(void);
+extern int	pq_flush_if_writable(void);
+extern bool	pq_is_send_pending(void);
 extern int	pq_putmessage(char msgtype, const char *s, size_t len);
+extern int	pq_putmessage_noblock(char msgtype, const char *s, size_t len);
 extern void pq_startcopyout(void);
 extern void pq_endcopyout(bool errorAbort);
 
diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h
index 150a71f..2670a2e 100644
--- a/src/include/replication/walsender.h
+++ b/src/include/replication/walsender.h
@@ -98,6 +98,7 @@ extern volatile sig_atomic_t walsender_ready_to_stop;
 /* user-settable parameters */
 extern int	WalSndDelay;
 extern int	max_wal_senders;
+extern int	replication_timeout;
 
 extern int	WalSenderMain(void);
 extern void WalSndSignals(void);
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 31744ff..f64e13b 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -40,7 +40,7 @@ extern void OwnLatch(volatile Latch *latch);
 extern void DisownLatch(volatile Latch *latch);
 extern bool WaitLatch(volatile Latch *latch, long timeout);
 extern int	WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
-				  long timeout);
+				  bool forRead, bool forWrite, long timeout);
 extern void SetLatch(volatile Latch *latch);
 extern void ResetLatch(volatile Latch *latch);
 #define TestLatch(latch) (((volatile Latch *) latch)->is_set)
#37Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#36)
Re: Replication server timeout patch

On Mon, Mar 28, 2011 at 7:49 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

pq_flush_if_writable() calls internal_flush() without using PG_TRY block.
This seems unsafe because for example pgwin32_waitforsinglesocket()
called by secure_write() can throw ERROR.

Perhaps it's time to give up on the assumption that the socket is in
blocking mode except within those two functions. Attached patch adds the
pq_set_nonblocking() function from your patch, and adds calls to it before
all secure_read/write operations to put the socket in the right mode.
There's only a few of those operations.

Sounds good.

+ pq_set_nonblocking(false); /* XXX: Is this required? */

No. Since secure_close and close_SSL don't use MyProcPort->sock and
MyProcPort->noblock which can be changed in pq_set_nonblocking,
I don't think that is required.

+ pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);

Don't we need to check the return value of pq_putmessage_noblock? That
can return EOF when trouble happens (for example the send system call fails).

Should we use COMMERROR instead of ERROR if we fail to put the socket in the
right mode?

Maybe.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#38Tom Lane
tgl@sss.pgh.pa.us
In reply to: Fujii Masao (#37)
Re: Replication server timeout patch

Fujii Masao <masao.fujii@gmail.com> writes:

On Mon, Mar 28, 2011 at 7:49 PM, Heikki Linnakangas

Should we use COMMERROR instead of ERROR if we fail to put the socket in the
right mode?

Maybe.

COMMERROR exists to keep us from trying to send an error report down a
failed socket. I would assume (perhaps wrongly) that
walsender/walreceiver don't try to push error reports across the socket
anyway, only to the postmaster log. If correct, there is no need for
COMMERROR, and using it just muddies the code.

regards, tom lane

#39Robert Haas
robertmhaas@gmail.com
In reply to: Tom Lane (#38)
Re: Replication server timeout patch

On Tue, Mar 29, 2011 at 9:24 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Fujii Masao <masao.fujii@gmail.com> writes:

On Mon, Mar 28, 2011 at 7:49 PM, Heikki Linnakangas

Should we use COMMERROR instead of ERROR if we fail to put the socket in the
right mode?

Maybe.

COMMERROR exists to keep us from trying to send an error report down a
failed socket.  I would assume (perhaps wrongly) that
walsender/walreceiver don't try to push error reports across the socket
anyway, only to the postmaster log.  If correct, there is no need for
COMMERROR, and using it just muddies the code.

I don't think that's how it works. The error the server sends is
copied into some of the messages in the client log, which is really
useful for debugging.

ERROR: can't connect to the server (server said: you're not authorized)

...or something like that.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#40Fujii Masao
masao.fujii@gmail.com
In reply to: Robert Haas (#39)
Re: Replication server timeout patch

On Wed, Mar 30, 2011 at 1:04 AM, Robert Haas <robertmhaas@gmail.com> wrote:

COMMERROR exists to keep us from trying to send an error report down a
failed socket.  I would assume (perhaps wrongly) that
walsender/walreceiver don't try to push error reports across the socket
anyway, only to the postmaster log.  If correct, there is no need for
COMMERROR, and using it just muddies the code.

I don't think that's how it works.  The error the server sends is
copied into some of the messages in the client log, which is really
useful for debugging.

ERROR: can't connect to the server (server said: you're not authorized)

...or something like that.

Yes. Walsender sends its error message to walreceiver, and walreceiver
writes it down to the server log. For example;

FATAL: could not receive data from WAL stream: FATAL: requested WAL
segment 000000010000000000000016 has already been removed

The second FATAL message is sent from walsender.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#41Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#37)
Re: Replication server timeout patch

On 29.03.2011 07:55, Fujii Masao wrote:

On Mon, Mar 28, 2011 at 7:49 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

pq_flush_if_writable() calls internal_flush() without using PG_TRY block.
This seems unsafe because for example pgwin32_waitforsinglesocket()
called by secure_write() can throw ERROR.

Perhaps it's time to give up on the assumption that the socket is in
blocking mode except within those two functions. Attached patch adds the
pq_set_nonblocking() function from your patch, and adds calls to it before
all secure_read/write operations to put the socket in the right mode.
There's only a few of those operations.

Sounds good.

+ pq_set_nonblocking(false); /* XXX: Is this required? */

No. Since secure_close and close_SSL don't use MyProcPort->sock and
MyProcPort->noblock which can be changed in pq_set_nonblocking,
I don't think that is required.

Ok, I took that out.

+ pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);

Don't we need to check the return value of pq_putmessage_noblock? That
can return EOF when trouble happens (for example the send system call fails).

No, pq_putmessage_noblock doesn't call send() because it enlarges the
buffer to make sure the message fits, and it doesn't anything else that
could fail else. I changed its return type to void, and added an
Assert() to check that the pq_putmessage() call it does internally
indeed doesn't fail.

Should we use COMMERROR instead of ERROR if we fail to put the socket in the
right mode?

Maybe.

I made it COMMERROR. ERRORs are sent to the client, and you could get
into infinite recursion if sending the ERROR requires setting the
blocking mode again.

Committed with those changes. I also reworded the docs a bit.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#42Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#41)
Re: Replication server timeout patch

On Wed, Mar 30, 2011 at 4:24 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

+       pq_putmessage_noblock('d', msgbuf, 1 +
sizeof(WalDataMessageHeader) + nbytes);

Don't we need to check the return value of pq_putmessage_noblock? That
can return EOF when trouble happens (for example the send system call
fails).

No, pq_putmessage_noblock doesn't call send() because it enlarges the buffer
to make sure the message fits, and it doesn't anything else that could fail
else. I changed its return type to void, and added an Assert() to check that
the pq_putmessage() call it does internally indeed doesn't fail.

Oh, you're right.

Committed with those changes. I also reworded the docs a bit.

Thanks a lot!

+ A value of zero means wait forever. This parameter can only be set in

The first sentence sounds misleading. Even if you set the parameter to zero,
replication connections can be terminated because of keepalive or socket error.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#43Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#42)
Re: Replication server timeout patch

On 30.03.2011 10:58, Fujii Masao wrote:

On Wed, Mar 30, 2011 at 4:24 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
+ A value of zero means wait forever. This parameter can only be set in

The first sentence sounds misleading. Even if you set the parameter to zero,
replication connections can be terminated because of keepalive or socket error.

Hmm, should I change it back to "A value of zero disables the timeout" ?
Any better suggestions?

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#44Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#43)
Re: Replication server timeout patch

On Wed, Mar 30, 2011 at 5:03 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 30.03.2011 10:58, Fujii Masao wrote:

On Wed, Mar 30, 2011 at 4:24 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com>  wrote:
+        A value of zero means wait forever.  This parameter can only be
set in

The first sentence sounds misleading. Even if you set the parameter to
zero,
replication connections can be terminated because of keepalive or socket
error.

Hmm, should I change it back to "A value of zero disables the timeout" ? Any
better suggestions?

I like that. But I appreciate if anyone suggests the better.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#45Robert Haas
robertmhaas@gmail.com
In reply to: Fujii Masao (#44)
Re: Replication server timeout patch

On Wed, Mar 30, 2011 at 4:08 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Wed, Mar 30, 2011 at 5:03 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 30.03.2011 10:58, Fujii Masao wrote:

On Wed, Mar 30, 2011 at 4:24 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com>  wrote:
+        A value of zero means wait forever.  This parameter can only be
set in

The first sentence sounds misleading. Even if you set the parameter to
zero,
replication connections can be terminated because of keepalive or socket
error.

Hmm, should I change it back to "A value of zero disables the timeout" ? Any
better suggestions?

I like that. But I appreciate if anyone suggests the better.

Maybe sticking the word "mechanism" in there would be a bit better.
"A value of zero disables the timeout mechanism"?

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#46Fujii Masao
masao.fujii@gmail.com
In reply to: Robert Haas (#45)
Re: Replication server timeout patch

On Wed, Mar 30, 2011 at 10:54 PM, Robert Haas <robertmhaas@gmail.com> wrote:

On Wed, Mar 30, 2011 at 4:08 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Wed, Mar 30, 2011 at 5:03 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 30.03.2011 10:58, Fujii Masao wrote:

On Wed, Mar 30, 2011 at 4:24 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com>  wrote:
+        A value of zero means wait forever.  This parameter can only be
set in

The first sentence sounds misleading. Even if you set the parameter to
zero,
replication connections can be terminated because of keepalive or socket
error.

Hmm, should I change it back to "A value of zero disables the timeout" ? Any
better suggestions?

I like that. But I appreciate if anyone suggests the better.

Maybe sticking the word "mechanism" in there would be a bit better.
"A value of zero disables the timeout mechanism"?

I'm OK with that. Or, what about "A value of zero turns this off" which is
used in statement_timeout for the sake of consistency?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#47Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#46)
Re: Replication server timeout patch

On 31.03.2011 05:46, Fujii Masao wrote:

On Wed, Mar 30, 2011 at 10:54 PM, Robert Haas<robertmhaas@gmail.com> wrote:

On Wed, Mar 30, 2011 at 4:08 AM, Fujii Masao<masao.fujii@gmail.com> wrote:

On Wed, Mar 30, 2011 at 5:03 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 30.03.2011 10:58, Fujii Masao wrote:

On Wed, Mar 30, 2011 at 4:24 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
+ A value of zero means wait forever. This parameter can only be
set in

The first sentence sounds misleading. Even if you set the parameter to
zero,
replication connections can be terminated because of keepalive or socket
error.

Hmm, should I change it back to "A value of zero disables the timeout" ? Any
better suggestions?

I like that. But I appreciate if anyone suggests the better.

Maybe sticking the word "mechanism" in there would be a bit better.
"A value of zero disables the timeout mechanism"?

I'm OK with that. Or, what about "A value of zero turns this off" which is
used in statement_timeout for the sake of consistency?

Committed Robert's suggestion.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com