Add client connection check during the execution of the query
This patch adds periodic verification of the client connection during
the execution of an SQL query. The feature is controlled by the GUC
variable ‘client_connection_check_interval’. The default check interval
is 1 second. If ‘client_connection_check_interval’ is set to 0, the
check is not performed.
The feature is useful when the client suddenly terminates the connection
during the execution of a very long query: it allows the backend to
cancel further execution of the query and free server resources.
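For illustration, here is a minimal libpq client (not part of the patch) that
produces exactly this situation: it starts a long query and then exits without
sending a Terminate message, so only a check like this one lets the backend
notice the disconnect before it tries to send the result. The connection
string is a placeholder.

#include <stdio.h>
#include <unistd.h>
#include <libpq-fe.h>

int
main(void)
{
    /* Placeholder connection string: adjust for your environment. */
    PGconn     *conn = PQconnectdb("dbname=postgres");

    if (PQstatus(conn) != CONNECTION_OK)
    {
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
        return 1;
    }

    /* Start a query that will keep the backend busy for a long time. */
    PQsendQuery(conn, "SELECT pg_sleep(600)");
    sleep(1);                   /* give the query time to reach the server */

    /*
     * Exit abruptly: no PQfinish(), no Terminate ('X') message.  The client
     * kernel closes the socket and sends FIN, which the periodic check on
     * the server side can observe.
     */
    _exit(1);
}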
Attachment: Add_client_connection_check.patch (text/x-diff)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index c4effa034c..83c662de0f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7677,6 +7677,27 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ This parameter sets a time interval (in milliseconds) between periodic
+ verification of the connection with the client during the execution
+ of the query. In case when the client aborts the connection,
+ the execution of the query will be terminated.
+ </para>
+ <para>
+ If value is -1, then this option is disabled, and the backend will
+ detect client disconnection only when trying to send him a response
+ to the query. Zero selects a suitable default value (1 second).
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
</sect1>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 0c9593d4cc..dd1917efe1 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -120,6 +120,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -1926,3 +1927,25 @@ pq_setkeepalivescount(int count, Port *port)
return STATUS_OK;
}
+
+bool pq_is_client_connected(void)
+{
+ CheckClientConnectionPending = false;
+ if (IsUnderPostmaster &&
+ MyProcPort != NULL && !PqCommReadingMsg && !PqCommBusy)
+ {
+ char nextbyte;
+ int r;
+
+ r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK | MSG_DONTWAIT);
+
+ if (r == 0 || (r == -1 &&
+ errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR))
+ {
+ ClientConnectionLost = true;
+ InterruptPending = true;
+ return false;
+ }
+ }
+ return true;
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index a3b9757565..2243b672ef 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3017,6 +3017,13 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (client_connection_check_interval > 0 && CheckClientConnectionPending)
+ {
+ if (pq_is_client_connected())
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
+
+ }
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
@@ -4195,6 +4202,9 @@ PostgresMain(int argc, char *argv[],
*/
CHECK_FOR_INTERRUPTS();
DoingCommandRead = false;
+ if (client_connection_check_interval)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
/*
* (5) turn off the idle-in-transaction timeout
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index c6939779b9..96d44a15e8 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t ConfigReloadPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 4f1d2a0d28..1dfdfe8b12 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -33,6 +33,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -72,6 +73,7 @@ static void ShutdownPostgres(int code, Datum arg);
static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -628,6 +630,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(LOCK_TIMEOUT, LockTimeoutHandler);
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
+ RegisterTimeout(SKIP_CLIENT_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1239,6 +1242,13 @@ IdleInTransactionSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 514595699b..11c6d5bf71 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3072,6 +3072,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of -1 disables this feature. Zero selects a suitable default value."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 1000, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ab063dae41..40a7610a77 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -623,6 +623,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 1000 # set time interval between
+ # connection checks, in ms
+ # 0 is disabled
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index c7762f68a6..0bf681500e 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putbytes(const char *s, size_t len);
+extern bool pq_is_client_connected(void);
/*
* prototypes for functions in be-secure.c
@@ -102,6 +103,7 @@ extern WaitEventSet *FeBeWaitSet;
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
/*
* prototypes for functions in be-secure-common.c
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index d6b32c070c..8c828059fc 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -83,7 +83,7 @@ extern PGDLLIMPORT volatile sig_atomic_t QueryCancelPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
-
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index dcc7307c16..e19a3976e3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -31,6 +31,7 @@ typedef enum TimeoutId
STANDBY_TIMEOUT,
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
+ SKIP_CLIENT_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 19d60a506e..526a5149cf 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
SUBDIRS = \
brin \
commit_ts \
+ connection \
dummy_seclabel \
snapshot_too_old \
test_bloomfilter \
diff --git a/src/test/modules/connection/Makefile b/src/test/modules/connection/Makefile
new file mode 100644
index 0000000000..2ec706fd56
--- /dev/null
+++ b/src/test/modules/connection/Makefile
@@ -0,0 +1,14 @@
+# src/test/modules/connection/Makefile
+
+subdir = src/test/modules/connection
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
+
+clean distclean maintainer-clean:
+ rm -rf tmp_check
diff --git a/src/test/modules/connection/t/001_close_connection.pl b/src/test/modules/connection/t/001_close_connection.pl
new file mode 100644
index 0000000000..9d1e7d5990
--- /dev/null
+++ b/src/test/modules/connection/t/001_close_connection.pl
@@ -0,0 +1,61 @@
+# Check if backend stopped after client disconnection
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 2;
+
+my $long_query = q{
+DO
+$$
+DECLARE row_data RECORD;
+BEGIN
+EXECUTE 'CREATE TABLE IF NOT EXISTS keep_alive_test AS SELECT generate_series(0,100000) AS tt';
+FOR row_data IN
+ SELECT tt
+ FROM keep_alive_test
+LOOP
+ EXECUTE 'SELECT count(*) FROM keep_alive_test';
+END LOOP;
+END$$;
+};
+my $set_guc = q{
+ SET client_connection_check_interval = 1000;
+};
+
+my $node = get_new_node('node');
+my ($pid, $timed_out);
+$node->init;
+$node->start;
+
+#########################################################
+# TEST 1: GUC client_connection_check_interval: enabled #
+#########################################################
+
+# Set GUC options, get backend pid and run a long time query
+$node->psql('postgres', "$set_guc SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+
+# Give time to the backend to detect client disconnected
+sleep 3;
+# Check if backend is still alive
+my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '0', 'Test: client_connection_check_interval enable');
+$node->stop;
+
+##########################################################
+# TEST 2: GUC client_connection_check_interval: disabled #
+##########################################################
+
+$node->start;
+$set_guc = q{
+ SET client_connection_check_interval = 0;
+};
+$node->psql('postgres', "$set_guc SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+# Give time to the client to disconnect
+sleep 3;
+# Check if backend is still alive
+$is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '1', 'Test: client_connection_check_interval disable');
+$node->stop;
s.cherkashin@postgrespro.ru writes:
This patch adds periodic verification of the client connection during
the execution of an SQL query. The feature is controlled by the GUC
variable ‘client_connection_check_interval’. The default check interval
is 1 second. If ‘client_connection_check_interval’ is set to 0, the
check is not performed.
I took a quick look through this.
* It won't even compile on non-Linux platforms, because MSG_DONTWAIT
is a Linux-ism. Perhaps that can be replaced by putting the client
socket into nonblock mode, but I'm not very sure that that'll work
(especially when using OpenSSL or another TLS implementation).
* I'm not convinced that this will reliably detect client connection loss.
AFAICS, if there is any unread data pending, it'd report that all is well
even if the client dropped off the net after sending that data. It's hard
to evaluate how likely such a situation is, but one really obvious case
is that the client might've sent an 'X' message to try to close the
connection gracefully. Also, use of TLS would again make things much
harder to reason about, because the TLS layer may send or receive data
that we don't know about.
* The management of the pending timeout interrupt seems like a mess.
Why did you involve ProcessInterrupts in that? It seems likely to queue
extra timeouts at random times due to unrelated interrupts causing that
bit of code to run, and/or cause weird gaps in the timeout intervals due
to not being run promptly. I'd be inclined to set this up so that the
timeout handler itself re-queues the timeout (I think that will work, or
if not, we should probably fix timeout.c so that it can).
* BTW, I am not on board with making this enabled-by-default.
This does seem like possibly a useful option if we can make it
work portably/reliably, but I don't have very high hopes for that.
regards, tom lane
Hi,
On 2019-01-13 18:05:39 -0500, Tom Lane wrote:
s.cherkashin@postgrespro.ru writes:
This patch adds verification of the connection with the client during
the execution of the SQL query. The feature enables using the GUC
variable ‘client_connection_check_interval’. The default check interval
is 1 second. If you set the value of ‘client_connection_check_interval’
to 0, then the check will not be performed.I took a quick look through this.
* It won't even compile on non-Linux platforms, because MSG_DONTWAIT
is a Linux-ism. Perhaps that can be replaced by putting the client
socket into nonblock mode, but I'm not very sure that that'll work
(especially when using OpenSSL or another TLS implementation).* I'm not convinced that this will reliably detect client connection loss.
AFAICS, if there is any unread data pending, it'd report that all is well
even if the client dropped off the net after sending that data. It's hard
to evaluate how likely such a situation is, but one really obvious case
is that the client might've sent an 'X' message to try to close the
connection gracefully. Also, use of TLS would again make things much
harder to reason about, because the TLS layer may send or receive data
that we don't know about.* The management of the pending timeout interrupt seems like a mess.
Why did you involve ProcessInterrupts in that? It seems likely to queue
extra timeouts at random times due to unrelated interrupts causing that
bit of code to run, and/or cause weird gaps in the timeout intervals due
to not being run promptly. I'd be inclined to set this up so that the
timeout handler itself re-queues the timeout (I think that will work, or
if not, we should probably fix timeout.c so that it can).* BTW, I am not on board with making this enabled-by-default.
This does seem like possibly a useful option if we can make it
work portably/reliably, but I don't have very high hopes for that.
Given that nothing happened since this message, and the commitfest is
ending, I'm going to mark this as returned with feedback.
Greetings,
Andres Freund
The purpose of this patch is to stop the execution of long-running
queries when the client disconnects. In most cases, the client must
wait for a response from the server before sending new data, which
means there should be no unread data left on the socket and we will be
able to detect a broken connection.
Exceptions may be possible, but I could not think of such a use case
(except COPY). I would be grateful if someone could suggest such cases
or their solutions.
I added a test for the GUC variable when the client connects via SSL,
but I'm not sure that this test is really necessary.
Best regards,
Sergey Cherkashin.
Attachment: Add_client_connection_check_v2.patch (text/x-diff)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b6f5822b84..36d031df3c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8259,6 +8259,26 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets a time interval, in milliseconds, between periodic
+ verification of client-server connection during query execution.
+ If the client aborts the connection, the query is terminated.
+ </para>
+ <para>
+ Default value is <literal>zero</literal> - it disables connection
+ checks, so the backend will detect client disconnection only when trying
+ to send a response to the query.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
</sect1>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index c39617a430..7e43734845 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -120,6 +120,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -1926,3 +1927,33 @@ pq_setkeepalivescount(int count, Port *port)
return STATUS_OK;
}
+
+/* --------------------------------
+ * pq_check_client_connection - check if client connected to socket or not
+ * --------------------------------
+ */
+void pq_check_client_connection(void)
+{
+ CheckClientConnectionPending = false;
+ if (IsUnderPostmaster &&
+ MyProcPort != NULL && !PqCommReadingMsg && !PqCommBusy)
+ {
+ char nextbyte;
+ int r;
+
+#ifdef WIN32
+ pgwin32_noblock = 1;
+#endif
+ r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK);
+#ifdef WIN32
+ pgwin32_noblock = 0;
+#endif
+
+ if (r == 0 || (r == -1 &&
+ errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR))
+ {
+ ClientConnectionLost = true;
+ InterruptPending = true;
+ }
+ }
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index e773f20d9f..0230a0fdd0 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3030,6 +3030,8 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ pq_check_client_connection();
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
@@ -4208,6 +4210,9 @@ PostgresMain(int argc, char *argv[],
*/
CHECK_FOR_INTERRUPTS();
DoingCommandRead = false;
+ if (client_connection_check_interval)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
/*
* (5) turn off the idle-in-transaction timeout
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index fd51934aaf..c15aef3793 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t ConfigReloadPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index c0b6231458..45bc8babbb 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -34,6 +34,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -72,6 +73,7 @@ static void ShutdownPostgres(int code, Datum arg);
static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -628,6 +630,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(LOCK_TIMEOUT, LockTimeoutHandler);
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
+ RegisterTimeout(SKIP_CLIENT_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1239,6 +1242,16 @@ IdleInTransactionSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ if (client_connection_check_interval > 0)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c216ed0922..4a1431477d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3142,6 +3142,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of -1 disables this feature. Zero selects a suitable default value."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index a21865a77f..c682dcc467 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -674,6 +674,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 1000 # set time interval between
+ # connection checks, in ms
+ # 0 is disabled
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index 755819cc58..0048a9ee79 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putbytes(const char *s, size_t len);
+extern void pq_check_client_connection(void);
/*
* prototypes for functions in be-secure.c
@@ -102,6 +103,7 @@ extern WaitEventSet *FeBeWaitSet;
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index c9e35003a5..8b7f72d0d0 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -83,7 +83,7 @@ extern PGDLLIMPORT volatile sig_atomic_t QueryCancelPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
-
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index 9244a2a7b7..acd26bb39c 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -31,6 +31,7 @@ typedef enum TimeoutId
STANDBY_TIMEOUT,
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
+ SKIP_CLIENT_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 19d60a506e..526a5149cf 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
SUBDIRS = \
brin \
commit_ts \
+ connection \
dummy_seclabel \
snapshot_too_old \
test_bloomfilter \
diff --git a/src/test/modules/connection/Makefile b/src/test/modules/connection/Makefile
new file mode 100644
index 0000000000..5c44d7ad40
--- /dev/null
+++ b/src/test/modules/connection/Makefile
@@ -0,0 +1,16 @@
+# src/test/modules/connection/Makefile
+
+subdir = src/test/modules/connection
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+export with_openssl
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
+
+clean distclean maintainer-clean:
+ rm -rf tmp_check
diff --git a/src/test/modules/connection/t/001_close_connection.pl b/src/test/modules/connection/t/001_close_connection.pl
new file mode 100644
index 0000000000..7f727833ec
--- /dev/null
+++ b/src/test/modules/connection/t/001_close_connection.pl
@@ -0,0 +1,107 @@
+# Check if backend stopped after client disconnection
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More;
+use File::Copy;
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ plan tests => 3;
+}
+else
+{
+ plan tests => 2;
+}
+
+my $long_query = q{
+DO
+$$
+DECLARE row_data RECORD;
+BEGIN
+EXECUTE 'CREATE TABLE IF NOT EXISTS keep_alive_test AS SELECT generate_series(0,100000) AS tt';
+FOR row_data IN
+ SELECT tt
+ FROM keep_alive_test
+LOOP
+ EXECUTE 'SELECT count(*) FROM keep_alive_test';
+END LOOP;
+END$$;
+};
+my $set_guc_on = q{
+ SET client_connection_check_interval = 1000;
+};
+my $set_guc_off = q{
+ SET client_connection_check_interval = 0;
+};
+my ($pid, $timed_out);
+
+my $node = get_new_node('node');
+$node->init;
+$node->start;
+
+#########################################################
+# TEST 1: GUC client_connection_check_interval: enabled #
+#########################################################
+
+# Set GUC options, get backend pid and run a long time query
+$node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+
+# Give time to the backend to detect client disconnected
+sleep 3;
+# Check if backend is still alive
+my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '0', 'Test: client_connection_check_interval enable');
+$node->stop;
+
+##########################################################
+# TEST 2: GUC client_connection_check_interval: disabled #
+##########################################################
+
+$node->start;
+$node->psql('postgres', "$set_guc_off SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+# Give time to the client to disconnect
+sleep 3;
+# Check if backend is still alive
+$is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '1', 'Test: client_connection_check_interval disable');
+$node->stop;
+
+##########################################################
+# TEST 3: Using client_connection_check_interval when #
+# client connected using SSL #
+##########################################################
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ # The client's private key must not be world-readable, so take a copy
+ # of the key stored in the code tree and update its permissions.
+ copy("../../ssl/ssl/client.key", "../../ssl/ssl/client_tmp.key");
+ chmod 0600, "../../ssl/ssl/client_tmp.key";
+ copy("../../ssl/ssl/client-revoked.key", "../../ssl/ssl/client-revoked_tmp.key");
+ chmod 0600, "../../ssl/ssl/client-revoked_tmp.key";
+ $ENV{PGHOST} = $node->host;
+ $ENV{PGPORT} = $node->port;
+
+ open my $sslconf, '>', $node->data_dir . "/sslconfig.conf";
+ print $sslconf "ssl=on\n";
+ print $sslconf "ssl_cert_file='server-cn-only.crt'\n";
+ print $sslconf "ssl_key_file='server-password.key'\n";
+ print $sslconf "ssl_passphrase_command='echo secret1'\n";
+ close $sslconf;
+
+ $node->start;
+ $node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out,
+ sslmode => 'require');
+
+ # Give time to the backend to detect client disconnected
+ sleep 3;
+ # Check if backend is still alive
+ my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+ is($is_alive, '0', 'Test: client_connection_check_interval enabled, SSL');
+ $node->stop;
+}
On Sat, Feb 9, 2019 at 6:16 AM <s.cherkashin@postgrespro.ru> wrote:
The purpose of this patch is to stop the execution of long-running
queries when the client disconnects. In most cases, the client must
wait for a response from the server before sending new data, which
means there should be no unread data left on the socket and we will be
able to detect a broken connection.
Exceptions may be possible, but I could not think of such a use case
(except COPY). I would be grateful if someone could suggest such cases
or their solutions.
I added a test for the GUC variable when the client connects via SSL,
but I'm not sure that this test is really necessary.
Hello Sergey,
This seems like a reasonable idea to me. There is no point in running
a monster 24 hour OLAP query if your client has gone away. It's using
MSG_PEEK which is POSIX, and I can't immediately think of any reason
why it's not safe to try to peek at a byte in that socket at any time.
Could you add a comment to explain why you have !PqCommReadingMsg &&
!PqCommBusy? The tests pass on a couple of different Unixoid OSes I
tried. Is it really necessary to do a bunch of IO and busy CPU work
in $long_query? pg_sleep(60) can do the job, because it includes a
standard CHECK_FOR_INTERRUPTS/latch loop that will loop around on
SIGALRM. I just tested that your patch correctly interrupts
pg_sleep() if I kill -9 my psql process. Why did you call the timeout
SKIP_CLIENT_CHECK_TIMEOUT (I don't understand the "SKIP" part)? Why
not CLIENT_CONNECTION_CHECK_TIMEOUT or something?
I wonder if it's confusing to users that you get "connection to client
lost" if the connection goes away while running a query, but nothing
if the connection goes away without saying goodbye ("X") while idle.
The build fails on Windows. I think it's because
src/tools/msvc/Mkvcbuild.pm is trying to find something to compile
under src/test/modules/connection, and I think the solution is to add
that to the variable @contrib_excludes. (I wonder if that script
should assume there is nothing to build instead of dying with "Could
not determine contrib module type for connection", otherwise every
Unix hacker is bound to get this wrong every time.)
https://ci.appveyor.com/project/postgresql-cfbot/postgresql/build/1.0.45820
Aside from that problem, my Windows CI building thing isn't smart
enough to actually run those extra tests yet, so I don't know if it
actually works on that platform yet (I really need to teach that thing
to use the full buildfarm scripts...)
--
Thomas Munro
https://enterprisedb.com
On Fri, Jul 5, 2019 at 5:36 PM Thomas Munro <thomas.munro@gmail.com> wrote:
On Sat, Feb 9, 2019 at 6:16 AM <s.cherkashin@postgrespro.ru> wrote:
[review]
Email to Sergey is bouncing back. I've set this to "Waiting on
author" in the Commitfest app.
--
Thomas Munro
https://enterprisedb.com
The purpose of this patch is to stop the execution of long-running
queries when the client disconnects.
Pgpool-II already does this by sending a parameter status message to
the client. It is expected that clients are always prepared to receive
the parameter status message. This way I believe we could reliably
detect whether the connection to the client is broken or not.
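For illustration only, a sketch of what such an application-level probe could
look like at the socket level; the function name is made up and this is not
Pgpool-II's actual code. It writes a ParameterStatus ('S') protocol message
and treats a hard send() failure as a lost connection (partial writes, SSL
and protocol state are all ignored here).

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>          /* htonl() */
#include <sys/types.h>
#include <sys/socket.h>

static bool
client_alive_after_probe(int sock)
{
    /* ParameterStatus: 'S', int32 length (self-inclusive), "name\0value\0" */
    static const char name[] = "application_name";
    static const char value[] = "";
    char        msg[1 + 4 + sizeof(name) + sizeof(value)];
    uint32_t    len = htonl(4 + sizeof(name) + sizeof(value));
    ssize_t     sent;

    msg[0] = 'S';
    memcpy(msg + 1, &len, 4);
    memcpy(msg + 5, name, sizeof(name));
    memcpy(msg + 5 + sizeof(name), value, sizeof(value));

    sent = send(sock, msg, sizeof(msg), 0);
    if (sent >= 0)
        return true;            /* accepted locally; says little on its own */
    return (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR);
}

Whether a successful send() proves anything is part of the debate below, since
the first write after the peer disappears may still be accepted locally.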
Best regards,
--
Tatsuo Ishii
SRA OSS, Inc. Japan
English: http://www.sraoss.co.jp/index_en.php
Japanese:http://www.sraoss.co.jp
This seems like a reasonable idea to me. There is no point in running
a monster 24 hour OLAP query if your client has gone away. It's using
MSG_PEEK which is POSIX, and I can't immediately think of any reason
why it's not safe to try to peek at a byte in that socket at any time.
I am not familiar with Windows but I accidentally found this article
written by Microsoft:
https://support.microsoft.com/en-us/help/192599/info-avoid-data-peeking-in-winsock
It seems using MSG_PEEK is not recommended by Microsoft.
Best regards,
--
Tatsuo Ishii
SRA OSS, Inc. Japan
English: http://www.sraoss.co.jp/index_en.php
Japanese:http://www.sraoss.co.jp
On Fri, Jul 5, 2019 at 6:42 PM Tatsuo Ishii <ishii@sraoss.co.jp> wrote:
This seems like a reasonable idea to me. There is no point in running
a monster 24 hour OLAP query if your client has gone away. It's using
MSG_PEEK which is POSIX, and I can't immediately think of any reason
why it's not safe to try to peek at a byte in that socket at any time.

I am not familiar with Windows but I accidentally found this article
written by Microsoft:
https://support.microsoft.com/en-us/help/192599/info-avoid-data-peeking-in-winsock
It seems using MSG_PEEK is not recommended by Microsoft.
Hmm, interesting. Using it very infrequently just as a way to detect
that the other end has gone away doesn't seem too crazy based on
anything in that article though, does it? What they're saying
actually applies to every operating system, not just Windows, AFAICS.
Namely, don't use MSG_PEEK frequently because it's a syscall and takes
locks in the kernel, and don't use it to wait for full messages to
arrive, or you might effectively deadlock if internal buffers are
full. But Sergey's patch only uses it to check if we could read 1
single byte, and does so very infrequently (the GUC should certainly
be set to at least many seconds).
What else could we do? Assuming the kernel actually knows the
connection has gone away, the WaitEventSetWait() interface is no help
on its own, I think, because it'll just tell you the socket is ready
for reading when it's closed; you still have to actually try to read
to distinguish closed from a data byte.
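For concreteness, here is a standalone sketch of that one-byte peek (it
mirrors the patch's pq_check_client_connection()), assuming the socket is
already in non-blocking mode:

#include <errno.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/socket.h>

static bool
client_still_connected(int sock)
{
    char        nextbyte;
    ssize_t     r = recv(sock, &nextbyte, 1, MSG_PEEK);

    if (r > 0)
        return true;            /* data pending; says nothing about whether
                                 * the peer is still there, as Tom pointed out */
    if (r == 0)
        return false;           /* orderly shutdown: the peer closed the socket */

    /* r < 0: "no data yet" errors mean the connection is still open */
    return (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR);
}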
I tried this patch using a real network with two machines. I was able
to get the new "connection to client lost" error by shutting down a
network interface (effectively yanking a cable), but only with TCP
keepalive configured. That's not too surprising; without that and
without trying to write, there is no way for the kernel to know that
the other end has gone.
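For reference, this is what configuring TCP keepalive means at the socket
level; the TCP_* option names are Linux-specific and the numeric values are
arbitrary examples (PostgreSQL normally drives these through its
tcp_keepalives_* settings rather than directly):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void
enable_keepalive(int sock)
{
    int     on = 1;
    int     idle = 60;          /* seconds of idle time before the first probe */
    int     interval = 10;      /* seconds between probes */
    int     count = 5;          /* failed probes before the connection is dropped */

    setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
    setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
    setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, &interval, sizeof(interval));
    setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, &count, sizeof(count));
}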
--
Thomas Munro
https://enterprisedb.com
On Fri, Jul 5, 2019 at 6:28 PM Tatsuo Ishii <ishii@sraoss.co.jp> wrote:
The purpose of this patch is to stop the execution of long-running
queries when the client disconnects.

Pgpool-II already does this by sending a parameter status message to
the client. It is expected that clients are always prepared to receive
the parameter status message. This way I believe we could reliably
detect whether the connection to the client is broken or not.
Hmm. If you send a message, it's basically application-level
keepalive. But it's a lot harder to be sure that the protocol and
socket are in the right state to insert a message at every possible
CHECK_FOR_INTERRUPTS() location. Sergey's proposal of recv(MSG_PEEK)
doesn't require any knowledge of the protocol at all, though it
probably does need TCP keepalive to be configured to be useful for
remote connections.
--
Thomas Munro
https://enterprisedb.com
On 5 Jul 2019, at 11:46, Thomas Munro <thomas.munro@gmail.com> wrote:
On Fri, Jul 5, 2019 at 6:28 PM Tatsuo Ishii <ishii@sraoss.co.jp> wrote:
The purpose of this patch is to stop the execution of long-running
queries when the client disconnects.

Pgpool-II already does this by sending a parameter status message to
the client. It is expected that clients are always prepared to receive
the parameter status message. This way I believe we could reliably
detect whether the connection to the client is broken or not.

Hmm. If you send a message, it's basically application-level
keepalive. But it's a lot harder to be sure that the protocol and
socket are in the right state to insert a message at every possible
CHECK_FOR_INTERRUPTS() location. Sergey's proposal of recv(MSG_PEEK)
doesn't require any knowledge of the protocol at all, though it
probably does need TCP keepalive to be configured to be useful for
remote connections.
Well, indeed, in the case of a cable disconnect the only way to detect
it with the proposed approach is to have TCP keepalive. However, if the
disconnection happens because the client application shuts down, the
client OS should itself properly close that connection, and therefore
this patch will detect such a situation without keepalives configured.
--
Stas Kelvich
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Sat, Jul 6, 2019 at 12:27 AM Stas Kelvich <s.kelvich@postgrespro.ru> wrote:
Well, indeed, in the case of a cable disconnect the only way to detect
it with the proposed approach is to have TCP keepalive. However, if the
disconnection happens because the client application shuts down, the
client OS should itself properly close that connection, and therefore
this patch will detect such a situation without keepalives configured.
Yeah.
+1 for this patch, with a few adjustments including making the test
use pg_sleep() as mentioned. It does something useful, namely
cancelling very long running queries sooner if the client has gone
away instead of discovering that potentially much later when sending a
response. It does so with a portable kernel interface (though we
haven't heard from a Windows tester), and I think it's using it in a
safe way (we're not doing the various bad things you can do with
MSG_PEEK, and the fd is expected to be valid for the process's
lifetime, and the socket is always in non-blocking mode*, so I don't
think there is any bad time for pg_check_client_connection() to run).
It reuses the existing timer infrastructure so there isn't really any
new overhead. One syscall every 10 seconds or whatever at the next
available CFI is basically nothing. On its own, this patch will
reliably detect clients that closed abruptly or exited/crashed (so
the client kernel sends a FIN packet). In combination with TCP
keepalive, it'll also detect clients that went away because the
network or client kernel ceased to exist.
*There are apparently no callers of pg_set_block(), so if you survived
pq_init() you have a non-blocking socket. If you're on Windows, the
code always sets the magic pgwin32_noblock global flag before trying
to peek. I wondered if it's OK that the CFI would effectively clobber
that with 0 on its way out, but that seems to be OK because every
place in the code that sets that flag does so immediately before an IO
operation without a CFI in between. As the comment in pgstat.c says
"This is extremely broken and should be fixed someday.". I wonder if
we even need that flag at all now that all socket IO is non-blocking.
--
Thomas Munro
https://enterprisedb.com
I have looked into the patch and tested it a little.
First of all, I had to grab a February snapshot to test the patch
because it does not apply to the current HEAD. I noticed that there
is some confusion in the doc and code regarding what the new
configuration parameter means. According to the doc:
+ Default value is <literal>zero</literal> - it disables connection
+ checks, so the backend will detect client disconnection only when trying
+ to send a response to the query.
But guc.c comment says:
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of -1 disables this feature. Zero selects a suitable default value."),
Probably the doc is correct since the actual code does so.
Also I found this in the postgresql.conf default:
#client_connection_check_interval = 1000 # set time interval between
So here the default value seems to be 1000. If so, guc.c should be
adjusted and the doc should be changed accordingly. I am not sure.
Next, I tested the patch using standard pgbench.
With the feature enabled with 1000ms check interval:
$ pgbench -c 10 -T 300 -S test
starting vacuum...end.
transaction type: <builtin: select only>
scaling factor: 1
query mode: simple
number of clients: 10
number of threads: 1
duration: 300 s
number of transactions actually processed: 19347995
latency average = 0.155 ms
tps = 64493.278581 (including connections establishing)
tps = 64493.811945 (excluding connections establishing)
Without the feature (client_connection_check_interval = 0)
$ pgbench -c 10 -T 300 -S test
starting vacuum...end.
transaction type: <builtin: select only>
scaling factor: 1
query mode: simple
number of clients: 10
number of threads: 1
duration: 300 s
number of transactions actually processed: 20314812
latency average = 0.148 ms
tps = 67715.993428 (including connections establishing)
tps = 67717.251843 (excluding connections establishing)
So the performance is about 5% down with the feature enabled in this
case. For me, a 5% drop is not negligible. Probably we should warn
about this in the doc.
Best regards,
--
Tatsuo Ishii
SRA OSS, Inc. Japan
English: http://www.sraoss.co.jp/index_en.php
Japanese:http://www.sraoss.co.jp
On Thu, Jul 18, 2019 at 3:19 PM Tatsuo Ishii <ishii@sraoss.co.jp> wrote:
So the performance is about 5% down with the feature enabled in this
case. For me, a 5% drop is not negligible. Probably we should warn
about this in the doc.
Yeah, the timer logic is wrong. I didn't have time to look into it
but with truss/strace for some reason I see 3 setitimer() syscalls for
every query, but I think this doesn't even need to set the timer for
every query.
--
Thomas Munro
https://enterprisedb.com
Yeah, the timer logic is wrong. I didn't have time to look into it
but with truss/strace for some reason I see 3 setitimer() syscalls for
every query, but I think this doesn't even need to set the timer for
every query.
Hum. I see 2 setitimer(), instead of 3.
Best regards,
--
Tatsuo Ishii
SRA OSS, Inc. Japan
English: http://www.sraoss.co.jp/index_en.php
Japanese:http://www.sraoss.co.jp
Tatsuo Ishii <ishii@sraoss.co.jp> writes:
Yeah, the timer logic is wrong. I didn't have time to look into it
but with truss/strace for some reason I see 3 setitimer() syscalls for
every query, but I think this doesn't even need to set the timer for
every query.
Hum. I see 2 setitimer(), instead of 3.
src/backend/utils/misc/timeout.c is not really designed for there
to be timeouts that persist across multiple queries. It can probably
be made better, but this patch doesn't appear to have touched any of
that logic.
To point to just one obvious problem, the error recovery path
(sigsetjmp block) in postgres.c does
disable_all_timeouts(false);
which cancels *all* timeouts. Probably don't want that behavior
anymore.
I think the issue you're complaining about may have to do with
the fact that if there's no statement timeout active, both
enable_statement_timeout and disable_statement_timeout will
call "disable_timeout(STATEMENT_TIMEOUT, false);". That does
nothing, as desired, if there are no other active timeouts ...
but if there is one, ie the client_connection timeout, we'll
end up calling schedule_alarm which will call setitimer even
if the desired time-of-nearest-timeout hasn't changed.
That was OK behavior for the set of timeouts that the code
was designed to handle, but we're going to need to be smarter
now.
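A rough sketch of that "skip the redundant setitimer()" idea, with
hypothetical names (this is not the actual timeout.c code): remember the
deadline most recently programmed into the interval timer and return early
when asked to program the same one again.

#include <stdbool.h>
#include <string.h>
#include <sys/time.h>

static struct timeval last_programmed;  /* absolute time of the pending SIGALRM */
static bool timer_armed = false;

static void
schedule_alarm_at(struct timeval deadline, struct timeval now)
{
    struct itimerval itv;

    /* Skip the syscall if the nearest deadline has not moved. */
    if (timer_armed &&
        deadline.tv_sec == last_programmed.tv_sec &&
        deadline.tv_usec == last_programmed.tv_usec)
        return;

    /*
     * Convert the absolute deadline into a relative interval (sketch only:
     * a deadline already in the past is not handled here).
     */
    memset(&itv, 0, sizeof(itv));
    itv.it_value.tv_sec = deadline.tv_sec - now.tv_sec;
    itv.it_value.tv_usec = deadline.tv_usec - now.tv_usec;
    if (itv.it_value.tv_usec < 0)
    {
        itv.it_value.tv_sec--;
        itv.it_value.tv_usec += 1000000;
    }

    setitimer(ITIMER_REAL, &itv, NULL);
    last_programmed = deadline;
    timer_armed = true;
}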
regards, tom lane
On Thu, Jul 18, 2019 at 5:04 PM Tom Lane <tgl@sss.pgh.pa.us> wrote:
Ok, I think we like this feature proposal and I think we have a pretty
good handle on what needs to be done next, but without a patch author
that's not going to happen. I've therefore marked it "Returned with
feedback". If Sergey or someone else is interested in picking this up
and doing the work in time for CF2, please feel free to change that to
'moved to next'.
I wonder if it'd be possible, along the way, to make it so that
statement_timeout doesn't require calling setitimer() for every
statement, instead letting the existing timer run if there is a
reasonable amount left to run on it, and then resetting it if needed.
I haven't looked into whether that's hard/impossible due to some race
condition I haven't spent the time to think about, but if Ishii-san
sees a 5% slowdown from a couple of setitimer() calls, I guess using
statement_timeout might currently cause a 2.5% slowdown.
--
Thomas Munro
https://enterprisedb.com
On 18.07.2019 6:19, Tatsuo Ishii wrote:
I noticed that there is some confusion in the doc and code regarding what the new
configuration parameter means. According to the doc:

+ Default value is <literal>zero</literal> - it disables connection
+ checks, so the backend will detect client disconnection only when trying
+ to send a response to the query.

But guc.c comment says:

+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of -1 disables this feature. Zero selects a suitable default value."),

Probably the doc is correct since the actual code does so.
Yes, value -1 is not even accepted due to the specified range.
tps = 67715.993428 (including connections establishing)
tps = 67717.251843 (excluding connections establishing)

So the performance is about 5% down with the feature enabled in this
case. For me, a 5% drop is not negligible. Probably we should warn
about this in the doc.
I also see some performance degradation, although it is not so large in
my case (I used pgbench with scale factor 10 and ran the same command as
you). In my case the difference, 103k vs. 105k TPS, is smaller than 2%.
It seems to me that it is not necessary to enable the timeout for each command:
@@ -4208,6 +4210,9 @@ PostgresMain(int argc, char *argv[],
*/
CHECK_FOR_INTERRUPTS();
DoingCommandRead = false;
+ if (client_connection_check_interval)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
/*
* (5) turn off the idle-in-transaction timeout
Unlike statement timeout or idle-in-transaction timeout, the precise
start of the measured interval is not important, so it is possible to
enable it once before the main backend loop:
@@ -3981,6 +3983,10 @@ PostgresMain(int argc, char *argv[],
if (!IsUnderPostmaster)
PgStartTime = GetCurrentTimestamp();
+ if (client_connection_check_interval)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
+
/*
* POSTGRES main processing loop begins here
*
But actually I do not see much difference from moving the timeout-enabling code.
Moreover, the difference in performance almost does not depend on the value of
the timeout. I set it to 100 seconds with a 30-second pgbench run (so the
timeout never fired and recv is never called) and there is still a small
difference in performance.
After some experiments I found out that just the presence of an active timer
results in some small performance penalty.
You can easily check it: set, for example, statement_timeout to the same
large value (100 seconds) and you will get the same small slowdown.
So recv() itself is not the source of the problem.
Actually, any system call (maybe except fsync) performed less often than
once per second cannot have a noticeable impact on performance.
So I do not think that recv(MSG_PEEK) can cause any performance problem
on Windows or any other platform.
But I wonder why we cannot just perform poll() with the POLLOUT flag and a
zero timeout.
If the OS has detected a closed connection, it should return POLLHUP, shouldn't it?
I am not sure whether this is a more portable or more efficient way; it just
seems to be a slightly more natural way (from my point of view) to check whether
the connection is still alive.
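A minimal sketch of that poll()-based check, for illustration; whether
POLLHUP/POLLERR are reported portably in this situation is exactly the open
question, so this is not a recommendation.

#include <poll.h>
#include <stdbool.h>

static bool
connection_looks_dead(int sock)
{
    struct pollfd pfd;

    pfd.fd = sock;
    pfd.events = POLLOUT;
    pfd.revents = 0;

    /* Zero timeout: just query the current state, never block. */
    if (poll(&pfd, 1, 0) < 0)
        return false;           /* treat poll() failure as "unknown" */

    return (pfd.revents & (POLLHUP | POLLERR)) != 0;
}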
--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Sat, Aug 3, 2019 at 4:40 AM Konstantin Knizhnik
<k.knizhnik@postgrespro.ru> wrote:
On 18.07.2019 6:19, Tatsuo Ishii wrote:
So the performance is about 5% down with the feature enabled in this
case. For me, a 5% drop is not negligible. Probably we should warn
about this in the doc.

I also see some performance degradation, although it is not so large in
my case (I used pgbench with scale factor 10 and ran the same command as
you). In my case the difference, 103k vs. 105k TPS, is smaller than 2%.
I didn't test, but hopefully the degradation is fixed by commit 09cf1d52?
If the OS has detected a closed connection, it should return POLLHUP, shouldn't it?
I am not sure whether this is a more portable or more efficient way; it just
seems to be a slightly more natural way (from my point of view) to check whether
the connection is still alive.
That's if you're sleeping in epoll etc. This patch is for CPU-bound
backends, running a long query. We need to do something special to
find out if the kernel knows that the connection has been closed.
I've done a quick rebase of the patch and added it to the
commitfest. No other changes. Several things were mentioned earlier
that still need to be tidied up.
Attachment: v3-0001-Detect-dropped-connections-while-running-queries.patch (text/x-patch)
From 5f7c327c5896369b80529467a3f1f64eab690887 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 1 Mar 2021 18:08:23 +1300
Subject: [PATCH v3] Detect dropped connections while running queries.
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: <your name here>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 20 ++++
src/backend/libpq/pqcomm.c | 31 +++++
src/backend/tcop/postgres.c | 5 +
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 13 +++
src/backend/utils/misc/guc.c | 10 ++
src/backend/utils/misc/postgresql.conf.sample | 3 +
src/include/libpq/libpq.h | 2 +
src/include/miscadmin.h | 3 +-
src/include/utils/timeout.h | 1 +
src/test/modules/Makefile | 1 +
src/test/modules/connection/Makefile | 16 +++
.../connection/t/001_close_connection.pl | 107 ++++++++++++++++++
13 files changed, 212 insertions(+), 1 deletion(-)
create mode 100644 src/test/modules/connection/Makefile
create mode 100644 src/test/modules/connection/t/001_close_connection.pl
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b5718fc136..db8798eb95 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9220,6 +9220,26 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets a time interval, in milliseconds, between periodic
+ verification of client-server connection during query execution.
+ If the client aborts the connection, the query is terminated.
+ </para>
+ <para>
+ Default value is <literal>zero</literal> - it disables connection
+ checks, so the backend will detect client disconnection only when trying
+ to send a response to the query.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
</sect1>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 27a298f110..44fb79c2b5 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -118,6 +118,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -2022,3 +2023,33 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/* --------------------------------
+ * pq_check_client_connection - check if client connected to socket or not
+ * --------------------------------
+ */
+void pq_check_client_connection(void)
+{
+ CheckClientConnectionPending = false;
+ if (IsUnderPostmaster &&
+ MyProcPort != NULL && !PqCommReadingMsg && !PqCommBusy)
+ {
+ char nextbyte;
+ int r;
+
+#ifdef WIN32
+ pgwin32_noblock = 1;
+#endif
+ r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK);
+#ifdef WIN32
+ pgwin32_noblock = 0;
+#endif
+
+ if (r == 0 || (r == -1 &&
+ errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR))
+ {
+ ClientConnectionLost = true;
+ InterruptPending = true;
+ }
+ }
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index bb5ccb4578..4ce004a2d5 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3128,6 +3128,8 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ pq_check_client_connection();
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
@@ -4355,6 +4357,9 @@ PostgresMain(int argc, char *argv[],
*/
CHECK_FOR_INTERRUPTS();
DoingCommandRead = false;
+ if (client_connection_check_interval)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
/*
* (6) check for any other interesting events that happened while we
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index a5976ad5b1..b42a516bc4 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index e5965bc517..002b6c9fd1 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -34,6 +34,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -621,6 +623,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(SKIP_CLIENT_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1243,6 +1246,16 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ if (client_connection_check_interval > 0)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index d626731723..8700703af6 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3445,6 +3445,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of -1 disables this feature. Zero selects a suitable default value."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ee06528bb0..6609c1426f 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -718,6 +718,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 1000 # set time interval between
+ # connection checks, in ms
+ # 0 is disabled
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index e4e5c21565..149af0d0b6 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -76,6 +76,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putbytes(const char *s, size_t len);
+extern void pq_check_client_connection(void);
/*
* prototypes for functions in be-secure.c
@@ -114,6 +115,7 @@ extern ssize_t secure_open_gssapi(Port *port);
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1bdc97e308..39fe759e9e 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -84,7 +84,8 @@ extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
-
+extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..89d94ebe18 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ SKIP_CLIENT_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 5391f461a2..ac0196f711 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
SUBDIRS = \
brin \
commit_ts \
+ connection \
delay_execution \
dummy_index_am \
dummy_seclabel \
diff --git a/src/test/modules/connection/Makefile b/src/test/modules/connection/Makefile
new file mode 100644
index 0000000000..5c44d7ad40
--- /dev/null
+++ b/src/test/modules/connection/Makefile
@@ -0,0 +1,16 @@
+# src/test/modules/connection/Makefile
+
+subdir = src/test/modules/connection
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+export with_openssl
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
+
+clean distclean maintainer-clean:
+ rm -rf tmp_check
diff --git a/src/test/modules/connection/t/001_close_connection.pl b/src/test/modules/connection/t/001_close_connection.pl
new file mode 100644
index 0000000000..7f727833ec
--- /dev/null
+++ b/src/test/modules/connection/t/001_close_connection.pl
@@ -0,0 +1,107 @@
+# Check if backend stopped after client disconnection
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More;
+use File::Copy;
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ plan tests => 3;
+}
+else
+{
+ plan tests => 2;
+}
+
+my $long_query = q{
+DO
+$$
+DECLARE row_data RECORD;
+BEGIN
+EXECUTE 'CREATE TABLE IF NOT EXISTS keep_alive_test AS SELECT generate_series(0,100000) AS tt';
+FOR row_data IN
+ SELECT tt
+ FROM keep_alive_test
+LOOP
+ EXECUTE 'SELECT count(*) FROM keep_alive_test';
+END LOOP;
+END$$;
+};
+my $set_guc_on = q{
+ SET client_connection_check_interval = 1000;
+};
+my $set_guc_off = q{
+ SET client_connection_check_interval = 0;
+};
+my ($pid, $timed_out);
+
+my $node = get_new_node('node');
+$node->init;
+$node->start;
+
+#########################################################
+# TEST 1: GUC client_connection_check_interval: enabled #
+#########################################################
+
+# Set GUC options, get backend pid and run a long time query
+$node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+
+# Give time to the backend to detect client disconnected
+sleep 3;
+# Check if backend is still alive
+my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '0', 'Test: client_connection_check_interval enable');
+$node->stop;
+
+##########################################################
+# TEST 2: GUC client_connection_check_interval: disabled #
+##########################################################
+
+$node->start;
+$node->psql('postgres', "$set_guc_off SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+# Give time to the client to disconnect
+sleep 3;
+# Check if backend is still alive
+$is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '1', 'Test: client_connection_check_interval disable');
+$node->stop;
+
+##########################################################
+# TEST 3: Using client_connection_check_interval when #
+# client connected using SSL #
+##########################################################
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ # The client's private key must not be world-readable, so take a copy
+ # of the key stored in the code tree and update its permissions.
+ copy("../../ssl/ssl/client.key", "../../ssl/ssl/client_tmp.key");
+ chmod 0600, "../../ssl/ssl/client_tmp.key";
+ copy("../../ssl/ssl/client-revoked.key", "../../ssl/ssl/client-revoked_tmp.key");
+ chmod 0600, "../../ssl/ssl/client-revoked_tmp.key";
+ $ENV{PGHOST} = $node->host;
+ $ENV{PGPORT} = $node->port;
+
+ open my $sslconf, '>', $node->data_dir . "/sslconfig.conf";
+ print $sslconf "ssl=on\n";
+ print $sslconf "ssl_cert_file='server-cn-only.crt'\n";
+ print $sslconf "ssl_key_file='server-password.key'\n";
+ print $sslconf "ssl_passphrase_command='echo secret1'\n";
+ close $sslconf;
+
+ $node->start;
+ $node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out,
+ sslmode => 'require');
+
+ # Give time to the backend to detect client disconnected
+ sleep 3;
+ # Check if backend is still alive
+ my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+ is($is_alive, '0', 'Test: client_connection_check_interval enabled, SSL');
+ $node->stop;
+}
--
2.30.0
On Mon, Mar 1, 2021 at 6:18 PM Thomas Munro <thomas.munro@gmail.com> wrote:
I've done a quick rebase of the patch and added it to the
commitfest. No other changes. Several things were mentioned earlier
that still need to be tidied up.
Rebased again due to bitrot. This time I did some actual work:
1. I didn't like the way it was rearming the timer *in the timer
handler*; I think it should be done in the CFI(), and only if it
determines that you're still running a query (otherwise you'll get
periodic wakeups while you're idle between queries, which is bad for
the arctic ice cap; we already handle the client going away efficiently
between queries with WaitEventSet socket readiness).
2. The timer handler surely has to set the latch to close a race (cf.
other similar handlers; between the CFI() and the beginning of the
sleep, you could handle the signal, set the flag, and then go to sleep
for 100 years).
3. The test might as well use pg_sleep() instead of doing a plpgsql
busy loop of SELECT queries.
4. I prefer the name CLIENT_CONNECTION_CHECK_TIMEOUT instead of
SKIP_CLIENT_CHECK_TIMEOUT; let's make up only one new name for a
concept instead of two.
5. Minuscule doc change.
I put these into a separate patch for ease of review. I don't claim
this is ready -- still needs more testing etc -- but it seems to be
generating the right system calls at the right times now.
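For anyone skimming rather than reading the attached diffs, points 1 and 2
boil down to roughly this split between the timeout handler and the
interrupt-processing path (a condensed sketch only, using names from the
patches; v4-0002 below is the real change):

    /* Timeout handler: do as little as possible here, but wake the process up. */
    static void
    ClientCheckTimeoutHandler(void)
    {
        CheckClientConnectionPending = true;
        InterruptPending = true;
        SetLatch(MyLatch);          /* closes the race with an imminent sleep */
    }

    /* In ProcessInterrupts(), reached via CHECK_FOR_INTERRUPTS(): */
    if (CheckClientConnectionPending)
        pq_check_client_connection();   /* peek at the socket; set
                                         * ClientConnectionLost on EOF or error,
                                         * otherwise re-arm the timer */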
Attachments:
v4-0001-Detect-dropped-connections-while-running-queries.patchtext/x-patch; charset=US-ASCII; name=v4-0001-Detect-dropped-connections-while-running-queries.patchDownload
From 8962b1ac40cf63c90af3e65dc15d554d764483dc Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 1 Mar 2021 18:08:23 +1300
Subject: [PATCH v4 1/2] Detect dropped connections while running queries.
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: <your name here>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 20 ++++
src/backend/libpq/pqcomm.c | 31 +++++
src/backend/tcop/postgres.c | 5 +
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 13 +++
src/backend/utils/misc/guc.c | 10 ++
src/backend/utils/misc/postgresql.conf.sample | 3 +
src/include/libpq/libpq.h | 2 +
src/include/miscadmin.h | 3 +-
src/include/utils/timeout.h | 1 +
src/test/modules/Makefile | 1 +
src/test/modules/connection/Makefile | 16 +++
.../connection/t/001_close_connection.pl | 107 ++++++++++++++++++
13 files changed, 212 insertions(+), 1 deletion(-)
create mode 100644 src/test/modules/connection/Makefile
create mode 100644 src/test/modules/connection/t/001_close_connection.pl
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 967de73596..cadf6a9e7d 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9231,6 +9231,26 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets a time interval, in milliseconds, between periodic
+ verification of client-server connection during query execution.
+ If the client aborts the connection, the query is terminated.
+ </para>
+ <para>
+ Default value is <literal>zero</literal> - it disables connection
+ checks, so the backend will detect client disconnection only when trying
+ to send a response to the query.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
</sect1>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..9f33138105 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -104,6 +104,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -1921,3 +1922,33 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/* --------------------------------
+ * pq_check_client_connection - check if client connected to socket or not
+ * --------------------------------
+ */
+void pq_check_client_connection(void)
+{
+ CheckClientConnectionPending = false;
+ if (IsUnderPostmaster &&
+ MyProcPort != NULL && !PqCommReadingMsg && !PqCommBusy)
+ {
+ char nextbyte;
+ int r;
+
+#ifdef WIN32
+ pgwin32_noblock = 1;
+#endif
+ r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK);
+#ifdef WIN32
+ pgwin32_noblock = 0;
+#endif
+
+ if (r == 0 || (r == -1 &&
+ errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR))
+ {
+ ClientConnectionLost = true;
+ InterruptPending = true;
+ }
+ }
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 8a0332dde9..c9be532362 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3061,6 +3061,8 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ pq_check_client_connection();
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
@@ -4288,6 +4290,9 @@ PostgresMain(int argc, char *argv[],
*/
CHECK_FOR_INTERRUPTS();
DoingCommandRead = false;
+ if (client_connection_check_interval)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
/*
* (6) check for any other interesting events that happened while we
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index a5976ad5b1..b42a516bc4 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index e5965bc517..002b6c9fd1 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -34,6 +34,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -621,6 +623,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(SKIP_CLIENT_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1243,6 +1246,16 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ if (client_connection_check_interval > 0)
+ enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
+ client_connection_check_interval);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3fd1a5fbe2..53db7b042e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3446,6 +3446,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of -1 disables this feature. Zero selects a suitable default value."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ee06528bb0..6609c1426f 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -718,6 +718,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 1000 # set time interval between
+ # connection checks, in ms
+ # 0 is disabled
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..55e605a82e 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern void pq_check_client_connection(void);
/*
* prototypes for functions in be-secure.c
@@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port);
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1bdc97e308..39fe759e9e 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -84,7 +84,8 @@ extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
-
+extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..89d94ebe18 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ SKIP_CLIENT_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 5391f461a2..ac0196f711 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
SUBDIRS = \
brin \
commit_ts \
+ connection \
delay_execution \
dummy_index_am \
dummy_seclabel \
diff --git a/src/test/modules/connection/Makefile b/src/test/modules/connection/Makefile
new file mode 100644
index 0000000000..5c44d7ad40
--- /dev/null
+++ b/src/test/modules/connection/Makefile
@@ -0,0 +1,16 @@
+# src/test/modules/connection/Makefile
+
+subdir = src/test/modules/connection
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+export with_openssl
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
+
+clean distclean maintainer-clean:
+ rm -rf tmp_check
diff --git a/src/test/modules/connection/t/001_close_connection.pl b/src/test/modules/connection/t/001_close_connection.pl
new file mode 100644
index 0000000000..7f727833ec
--- /dev/null
+++ b/src/test/modules/connection/t/001_close_connection.pl
@@ -0,0 +1,107 @@
+# Check if backend stopped after client disconnection
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More;
+use File::Copy;
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ plan tests => 3;
+}
+else
+{
+ plan tests => 2;
+}
+
+my $long_query = q{
+DO
+$$
+DECLARE row_data RECORD;
+BEGIN
+EXECUTE 'CREATE TABLE IF NOT EXISTS keep_alive_test AS SELECT generate_series(0,100000) AS tt';
+FOR row_data IN
+ SELECT tt
+ FROM keep_alive_test
+LOOP
+ EXECUTE 'SELECT count(*) FROM keep_alive_test';
+END LOOP;
+END$$;
+};
+my $set_guc_on = q{
+ SET client_connection_check_interval = 1000;
+};
+my $set_guc_off = q{
+ SET client_connection_check_interval = 0;
+};
+my ($pid, $timed_out);
+
+my $node = get_new_node('node');
+$node->init;
+$node->start;
+
+#########################################################
+# TEST 1: GUC client_connection_check_interval: enabled #
+#########################################################
+
+# Set GUC options, get backend pid and run a long time query
+$node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+
+# Give time to the backend to detect client disconnected
+sleep 3;
+# Check if backend is still alive
+my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '0', 'Test: client_connection_check_interval enable');
+$node->stop;
+
+##########################################################
+# TEST 2: GUC client_connection_check_interval: disabled #
+##########################################################
+
+$node->start;
+$node->psql('postgres', "$set_guc_off SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+# Give time to the client to disconnect
+sleep 3;
+# Check if backend is still alive
+$is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '1', 'Test: client_connection_check_interval disable');
+$node->stop;
+
+##########################################################
+# TEST 3: Using client_connection_check_interval when #
+# client connected using SSL #
+##########################################################
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ # The client's private key must not be world-readable, so take a copy
+ # of the key stored in the code tree and update its permissions.
+ copy("../../ssl/ssl/client.key", "../../ssl/ssl/client_tmp.key");
+ chmod 0600, "../../ssl/ssl/client_tmp.key";
+ copy("../../ssl/ssl/client-revoked.key", "../../ssl/ssl/client-revoked_tmp.key");
+ chmod 0600, "../../ssl/ssl/client-revoked_tmp.key";
+ $ENV{PGHOST} = $node->host;
+ $ENV{PGPORT} = $node->port;
+
+ open my $sslconf, '>', $node->data_dir . "/sslconfig.conf";
+ print $sslconf "ssl=on\n";
+ print $sslconf "ssl_cert_file='server-cn-only.crt'\n";
+ print $sslconf "ssl_key_file='server-password.key'\n";
+ print $sslconf "ssl_passphrase_command='echo secret1'\n";
+ close $sslconf;
+
+ $node->start;
+ $node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out,
+ sslmode => 'require');
+
+ # Give time to the backend to detect client disconnected
+ sleep 3;
+ # Check if backend is still alive
+ my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+ is($is_alive, '0', 'Test: client_connection_check_interval enabled, SSL');
+ $node->stop;
+}
--
2.30.1
v4-0002-some-fixups.patchtext/x-patch; charset=US-ASCII; name=v4-0002-some-fixups.patchDownload
From 54e412efbf61fb1fef1dc9444b1bbfc13f7cabe0 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 6 Mar 2021 16:09:39 +1300
Subject: [PATCH v4 2/2] some fixups
- timer handler must set the latch
- rearm the timer from the CFI(), not the signal handler
- only rearm if we're actually still running a query (avoids wakeups
when idle)
- just use pg_sleep() in the test instead of working harder
- SKIP_CLIENT_CHECK_TIMEOUT -> CLIENT_CONNECTION_CHECK_TIMEOUT
- a tiny doc change
---
doc/src/sgml/config.sgml | 2 +-
src/backend/libpq/pqcomm.c | 5 +++++
src/backend/tcop/postgres.c | 8 +++++---
src/backend/utils/init/postinit.c | 6 ++----
src/include/utils/timeout.h | 2 +-
.../modules/connection/t/001_close_connection.pl | 13 +------------
6 files changed, 15 insertions(+), 21 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index cadf6a9e7d..a8db1cc806 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9244,7 +9244,7 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
If the client aborts the connection, the query is terminated.
</para>
<para>
- Default value is <literal>zero</literal> - it disables connection
+ Default value is <literal>zero</literal>. Zero disables connection
checks, so the backend will detect client disconnection only when trying
to send a response to the query.
</para>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 9f33138105..bf26fda1c0 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -79,6 +79,7 @@
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/memutils.h"
+#include "utils/timeout.h"
/*
* Cope with the various platform-specific ways to spell TCP keepalive socket
@@ -1950,5 +1951,9 @@ void pq_check_client_connection(void)
ClientConnectionLost = true;
InterruptPending = true;
}
+
+ if (client_connection_check_interval > 0)
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index c9be532362..85a2dcdd5b 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2583,6 +2583,11 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval)
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -4290,9 +4295,6 @@ PostgresMain(int argc, char *argv[],
*/
CHECK_FOR_INTERRUPTS();
DoingCommandRead = false;
- if (client_connection_check_interval)
- enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
- client_connection_check_interval);
/*
* (6) check for any other interesting events that happened while we
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 002b6c9fd1..2360c38bfd 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -623,7 +623,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
- RegisterTimeout(SKIP_CLIENT_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1251,9 +1251,7 @@ ClientCheckTimeoutHandler(void)
{
CheckClientConnectionPending = true;
InterruptPending = true;
- if (client_connection_check_interval > 0)
- enable_timeout_after(SKIP_CLIENT_CHECK_TIMEOUT,
- client_connection_check_interval);
+ SetLatch(MyLatch);
}
/*
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index 89d94ebe18..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,7 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
- SKIP_CLIENT_CHECK_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
diff --git a/src/test/modules/connection/t/001_close_connection.pl b/src/test/modules/connection/t/001_close_connection.pl
index 7f727833ec..f688fdb666 100644
--- a/src/test/modules/connection/t/001_close_connection.pl
+++ b/src/test/modules/connection/t/001_close_connection.pl
@@ -16,18 +16,7 @@ else
}
my $long_query = q{
-DO
-$$
-DECLARE row_data RECORD;
-BEGIN
-EXECUTE 'CREATE TABLE IF NOT EXISTS keep_alive_test AS SELECT generate_series(0,100000) AS tt';
-FOR row_data IN
- SELECT tt
- FROM keep_alive_test
-LOOP
- EXECUTE 'SELECT count(*) FROM keep_alive_test';
-END LOOP;
-END$$;
+SELECT pg_sleep(60);
};
my $set_guc_on = q{
SET client_connection_check_interval = 1000;
--
2.30.1
For v4-0002-some-fixups.patch :
+ if (client_connection_check_interval > 0)
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,

+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval)
It would be better if the second if condition were written consistently with the
first (> 0).
For v4-0001-Detect-dropped-connections-while-running-queries.patch :
+ Sets a time interval, in milliseconds, between periodic
I wonder if the interval should be expressed in seconds, since the
description says: while running very long queries.
Cheers
On Sat, Mar 6, 2021 at 5:50 PM Zhihong Yu <zyu@yugabyte.com> wrote:
+ if (client_connection_check_interval > 0)
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,

+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval)
Thanks! Fixed.
+ Sets a time interval, in milliseconds, between periodic
I wonder if the interval should be expressed in seconds, since the description says: while running very long queries.
Hmm. Personally I think we should stop emphasising default units
(it's an internal detail) and encourage people to specify the units.
But that was indeed not good wording, so I fixed it. I've now
rewritten the documentation completely, and moved it to the connection
section near the related TCP keepalive settings. I tried to be really
explicit about what this feature really does, and in which cases it
helps, and what behaviour you can expect without this feature
configured.
Other changes:
1. Fixed inconsistencies in .conf.sample and GUC descriptions noted in
Ishii-san's review.
2. Added comments to pg_check_client_connection() to explain the
conditional logic therein, justifying each conditional branch and the
interactions between this logic and the PqCommReadingMsg and
ClientConnectionLost variables.
3. I added an ereport(COMMERROR) message for unexpected errnos in
this path, since otherwise an errno would be discarded making
diagnosis impossible.
4. Avoided calling enable_timeout_after() multiple times, like we do
for statement timeouts.
5. cfbot told me "Could not determine contrib module type for
connection" on Windows. I do not understand this stuff, so I just
added the new test module "connection" to @contrib_excludes, like
everyone else apparently does.
6. pgindented.
That's enough for today, but here are some things I'm still wondering about:
1. Might need to rethink the name of the GUC. By including "client"
in it, it sounds a bit like it affects behaviour of the client, rather
than the server. Also the internal variable
CheckClientConnectionPending looks funny next to ClientConnectionLost
(could be ClientConnectionCheckPending?).
2. The tests need tightening up. The thing with the "sleep 3" will
not survive contact with the build farm, and I'm not sure if the SSL
test is as short as it could be.
3. Needs testing on Windows.
I've now hacked this code around so much that I've added myself as
co-author in the commit message.
Attachments:
v5-0001-Detect-dropped-connections-while-running-queries.patchtext/x-patch; charset=US-ASCII; name=v5-0001-Detect-dropped-connections-while-running-queries.patchDownload
From 9a8af3fe03d9d7673f163e6087dd724acc40161d Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 1 Mar 2021 18:08:23 +1300
Subject: [PATCH v5] Detect dropped connections while running queries.
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@sraoss.co.jp>
Reviewed-by: Konstantin Knizhnik <k.knizhnik@postgrespro.ru>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 36 +++++++
src/backend/libpq/pqcomm.c | 91 ++++++++++++++++++
src/backend/tcop/postgres.c | 8 ++
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 11 +++
src/backend/utils/misc/guc.c | 10 ++
src/backend/utils/misc/postgresql.conf.sample | 3 +
src/include/libpq/libpq.h | 2 +
src/include/miscadmin.h | 3 +-
src/include/utils/timeout.h | 1 +
src/test/modules/Makefile | 1 +
src/test/modules/connection/Makefile | 16 ++++
.../connection/t/001_close_connection.pl | 96 +++++++++++++++++++
src/tools/msvc/Mkvcbuild.pm | 4 +-
14 files changed, 281 insertions(+), 2 deletions(-)
create mode 100644 src/test/modules/connection/Makefile
create mode 100644 src/test/modules/connection/t/001_close_connection.pl
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index ee4925d6d9..fc463f0a65 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -998,6 +998,42 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets the time interval between checks that the client is still
+ connected, while running queries. The check is performed by testing
+ whether one byte could be read from the socket with
+ <symbol>MSG_PEEK</symbol>. If the kernel reports that the connection
+ has been closed or lost, a long running query can abort immediately,
+ rather than discovering the problem when it eventually tries to send
+ the response.
+ </para>
+ <para>
+ If this value is specified without units, it is taken as milliseconds.
+ The default value is <literal>0</literal>, which disables connection
+ checks. Without connection checks, the server will detect the loss of
+ the connection only when it is waiting for a new request, receiving a
+ request or sending a response.
+ </para>
+ <para>
+ For the kernel itself to detect lost TCP connections reliably and
+ within a known timeframe in all scenarios including network failure, it
+ may also be necessary to adjust the default TCP keepalive settings of
+ the operating system, or the
+ <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-idle"/> and
+ <xref linkend="guc-tcp-keepalives-count"/> settings of
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..e5f999b898 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -79,6 +79,7 @@
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/memutils.h"
+#include "utils/timeout.h"
/*
* Cope with the various platform-specific ways to spell TCP keepalive socket
@@ -104,6 +105,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -1921,3 +1923,92 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/* --------------------------------
+ * pq_check_client_connection - check if client is still connected
+ * --------------------------------
+ */
+void
+pq_check_client_connection(void)
+{
+ CheckClientConnectionPending = false;
+
+ /*
+ * We were called from CHECK_FOR_INTERRUPTS(), because
+ * client_connection_check_interval is set and the timer recently fired, so
+ * it's time to check if the kernel thinks the client is still there. This
+ * is a useful thing to do while the executor is doing busy work for a long
+ * time without any other kind of interaction with the socket.
+ *
+ * We'll only perform the check and re-arm the timer if we're possibly
+ * still running a query. We don't need to do any checks when we're
+ * sitting idle between queries, because in that case the FeBeWaitSet will
+ * wake up when the socket becomes ready to read, including lost
+ * connections. If a later query begins, the timer will be enabled afresh.
+ */
+ if (IsUnderPostmaster &&
+ MyProcPort != NULL &&
+ !PqCommReadingMsg &&
+ !PqCommBusy)
+ {
+ bool connection_lost = false;
+ char nextbyte;
+ int r;
+
+retry:
+#ifdef WIN32
+ pgwin32_noblock = 1;
+#endif
+ r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK);
+#ifdef WIN32
+ pgwin32_noblock = 0;
+#endif
+
+ if (r == 0)
+ {
+ /* EOF detected. */
+ connection_lost = true;
+ }
+ else if (r > 0)
+ {
+ /* Data available to read. Connection looks good. */
+ }
+ else if (errno == EINTR)
+ {
+ /* Interrupted by a signal, so retry. */
+ goto retry;
+ }
+ else if (errno == EAGAIN || errno == EWOULDBLOCK)
+ {
+ /* No data available to read. Connection looks good. */
+ }
+ else
+ {
+ /* Got some other error. We'd better log the reason. */
+ ereport(COMMERROR,
+ (errcode(ERRCODE_CONNECTION_EXCEPTION),
+ errmsg("could not check client connection: %m")));
+ connection_lost = true;
+ }
+
+ if (connection_lost)
+ {
+ /*
+ * We're already in ProcessInterrupts(), and its check for
+ * ClientConnectionLost comes after the check for
+ * CheckClientConnectionPending. It seems a little fragile to
+ * rely on that here, so we'll also set InterruptPending to make
+ * sure that the next CHECK_FOR_INTERRUPTS() could handle it too
+ * if the code moves around.
+ */
+ ClientConnectionLost = true;
+ InterruptPending = true;
+ }
+ else if (client_connection_check_interval > 0)
+ {
+ /* Schedule the next check, because the GUC is still enabled. */
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
+ }
+ }
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 2b1b68109f..6d6f942b3f 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2671,6 +2671,12 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval > 0 &&
+ !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -3149,6 +3155,8 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ pq_check_client_connection();
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 73e0a672ae..a9f0fc3017 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7abeccb536..0bac23e75d 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -34,6 +34,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +622,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1242,6 +1245,14 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3b36a31a47..391d9983e0 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3483,6 +3483,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of 0 disables this feature."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 86425965d0..dd850bf272 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -718,6 +718,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 0 # set time interval between
+ # checks for client disconnection while
+ # running long queries; 0 for never
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..55e605a82e 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern void pq_check_client_connection(void);
/*
* prototypes for functions in be-secure.c
@@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port);
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..40fcaff25f 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -84,7 +84,8 @@ extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending;
extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
-
+extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 93e7829c67..db51a1d700 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
SUBDIRS = \
brin \
commit_ts \
+ connection \
delay_execution \
dummy_index_am \
dummy_seclabel \
diff --git a/src/test/modules/connection/Makefile b/src/test/modules/connection/Makefile
new file mode 100644
index 0000000000..5c44d7ad40
--- /dev/null
+++ b/src/test/modules/connection/Makefile
@@ -0,0 +1,16 @@
+# src/test/modules/connection/Makefile
+
+subdir = src/test/modules/connection
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+export with_openssl
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
+
+clean distclean maintainer-clean:
+ rm -rf tmp_check
diff --git a/src/test/modules/connection/t/001_close_connection.pl b/src/test/modules/connection/t/001_close_connection.pl
new file mode 100644
index 0000000000..f688fdb666
--- /dev/null
+++ b/src/test/modules/connection/t/001_close_connection.pl
@@ -0,0 +1,96 @@
+# Check if backend stopped after client disconnection
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More;
+use File::Copy;
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ plan tests => 3;
+}
+else
+{
+ plan tests => 2;
+}
+
+my $long_query = q{
+SELECT pg_sleep(60);
+};
+my $set_guc_on = q{
+ SET client_connection_check_interval = 1000;
+};
+my $set_guc_off = q{
+ SET client_connection_check_interval = 0;
+};
+my ($pid, $timed_out);
+
+my $node = get_new_node('node');
+$node->init;
+$node->start;
+
+#########################################################
+# TEST 1: GUC client_connection_check_interval: enabled #
+#########################################################
+
+# Set GUC options, get backend pid and run a long time query
+$node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+
+# Give time to the backend to detect client disconnected
+sleep 3;
+# Check if backend is still alive
+my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '0', 'Test: client_connection_check_interval enable');
+$node->stop;
+
+##########################################################
+# TEST 2: GUC client_connection_check_interval: disabled #
+##########################################################
+
+$node->start;
+$node->psql('postgres', "$set_guc_off SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out);
+# Give time to the client to disconnect
+sleep 3;
+# Check if backend is still alive
+$is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+is($is_alive, '1', 'Test: client_connection_check_interval disable');
+$node->stop;
+
+##########################################################
+# TEST 3: Using client_connection_check_interval when #
+# client connected using SSL #
+##########################################################
+
+if ($ENV{with_openssl} eq 'yes')
+{
+ # The client's private key must not be world-readable, so take a copy
+ # of the key stored in the code tree and update its permissions.
+ copy("../../ssl/ssl/client.key", "../../ssl/ssl/client_tmp.key");
+ chmod 0600, "../../ssl/ssl/client_tmp.key";
+ copy("../../ssl/ssl/client-revoked.key", "../../ssl/ssl/client-revoked_tmp.key");
+ chmod 0600, "../../ssl/ssl/client-revoked_tmp.key";
+ $ENV{PGHOST} = $node->host;
+ $ENV{PGPORT} = $node->port;
+
+ open my $sslconf, '>', $node->data_dir . "/sslconfig.conf";
+ print $sslconf "ssl=on\n";
+ print $sslconf "ssl_cert_file='server-cn-only.crt'\n";
+ print $sslconf "ssl_key_file='server-password.key'\n";
+ print $sslconf "ssl_passphrase_command='echo secret1'\n";
+ close $sslconf;
+
+ $node->start;
+ $node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query",
+ stdout => \$pid, timeout => 2, timed_out => \$timed_out,
+ sslmode => 'require');
+
+ # Give time to the backend to detect client disconnected
+ sleep 3;
+ # Check if backend is still alive
+ my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;");
+ is($is_alive, '0', 'Test: client_connection_check_interval enabled, SSL');
+ $node->stop;
+}
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index a184404e21..833918e373 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -51,7 +51,9 @@ my @contrib_excludes = (
'pgcrypto', 'sepgsql',
'brin', 'test_extensions',
'test_misc', 'test_pg_dump',
- 'snapshot_too_old', 'unsafe_tests');
+ 'snapshot_too_old', 'unsafe_tests',
+ 'connection'
+);
# Set of variables for frontend modules
my $frontend_defines = { 'initdb' => 'FRONTEND' };
--
2.30.1
On Mon, Mar 22, 2021 at 3:29 PM Thomas Munro <thomas.munro@gmail.com> wrote:
2. The tests need tightening up. The thing with the "sleep 3" will
not survive contact with the build farm, and I'm not sure if the SSL
test is as short as it could be.
I don't think the TAP test can be done in the way Sergey had it,
because of multiple races that would probably fail on
slow/overloaded/valgrind machines. I'll try to think of a better way,
but for now I've removed those tests.
I realised that this should really be testing DoingCommandRead to
decide when it's time to stop checking and re-arming (originally it
was checking PqCommReadingMsg, which isn't quite right), so I moved a
tiny bit more of the logic into postgres.c, keeping only the actual
connection-check in pqcomm.c.
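Concretely, the re-arming condition described above looks roughly like this
in ProcessInterrupts() (a sketch of the idea only, not a quote of the
attached patch):

    if (CheckClientConnectionPending)
    {
        CheckClientConnectionPending = false;

        /*
         * Only check and re-arm while a query may still be running; once
         * we're back to reading a new command, FeBeWaitSet will notice a
         * closed socket for us, so just let the timer lapse.
         */
        if (client_connection_check_interval > 0 && !DoingCommandRead)
        {
            if (!pq_check_client_connection())
            {
                ClientConnectionLost = true;
                InterruptPending = true;
            }
            else
                enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
                                     client_connection_check_interval);
        }
    }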
That leaves the thorny problem Tom mentioned at the top of this
thread[1]: this socket-level approach can be fooled by an 'X' sitting
in the socket buffer, if a client did PQsendQuery() and then
PQfinish(). Or perhaps even by SSL messages invisible to our protocol
level. That can surely only be addressed by moving the 'peeking' one
level up the protocol stack. I've attached a WIP attempt to do that,
on top of the other patch. Lookahead happens in our receive buffer,
not the kernel's socket buffer. It detects the simple 'X' case, but
not deeper pipelines of queries (which would seem to require an
unbounded receive buffer and lookahead that actually decodes messages
instead of just looking at the first byte, which seems way over the
top considering the price of infinite RAM these days). I think it's
probably safe in terms of protocol synchronisation because it consults
PqCommReadingMsg to avoid looking at non-message-initial bytes, but I
could be wrong; it's a first swing at it... Maybe it's a little
unprincipled to bother with detecting 'X' at all if you can't handle
pipelining in general... I don't know.
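In case it helps reviewers picture it, the buffer-level lookahead amounts to
something like the following inside a bool "still connected?" check in
pqcomm.c, next to the existing PqRecvBuffer/PqRecvPointer/PqRecvLength
variables (only a sketch: it assumes any readable bytes have already been
pulled into PqRecvBuffer with a non-blocking read, which is the part the WIP
patch actually has to arrange):

    /*
     * If we're between protocol messages and the next unread byte in our
     * receive buffer begins a Terminate ('X') message, the client is going
     * away politely; report the connection as gone rather than healthy.
     */
    if (!PqCommReadingMsg &&
        PqRecvPointer < PqRecvLength &&
        PqRecvBuffer[PqRecvPointer] == 'X')
        return false;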
Today I learned that there have been other threads[2][3] with people
wanting some variant of this feature over the years.
[1]: /messages/by-id/19003.1547420739@sss.pgh.pa.us
[2]: /messages/by-id/e09785e00907271728k4bf4d17kac0e7f5ec9316069@mail.gmail.com
[3]: /messages/by-id/20130810.113901.1014453099921841746.t-ishii@sraoss.co.jp
Attachments:
v6-0001-Detect-dropped-connections-while-running-queries.patchtext/x-patch; charset=US-ASCII; name=v6-0001-Detect-dropped-connections-while-running-queries.patchDownload
From d57726134ad51b794b6db31d5824f21f6fd40214 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 1 Mar 2021 18:08:23 +1300
Subject: [PATCH v6 1/2] Detect dropped connections while running queries.
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@sraoss.co.jp>
Reviewed-by: Konstantin Knizhnik <k.knizhnik@postgrespro.ru>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (much earlier version)
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 36 +++++++++++++
src/backend/libpq/pqcomm.c | 52 +++++++++++++++++++
src/backend/tcop/postgres.c | 27 ++++++++++
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 11 ++++
src/backend/utils/misc/guc.c | 10 ++++
src/backend/utils/misc/postgresql.conf.sample | 3 ++
src/include/libpq/libpq.h | 2 +
src/include/miscadmin.h | 1 +
src/include/utils/timeout.h | 1 +
10 files changed, 144 insertions(+)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5679b40dd5..5cd0d38dbf 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -998,6 +998,42 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets the time interval between checks that the client is still
+ connected, while running queries. The check is performed by testing
+ whether one byte could be read from the socket with
+ <symbol>MSG_PEEK</symbol>. If the kernel reports that the connection
+ has been closed or lost, a long running query can abort immediately,
+ rather than discovering the problem when it eventually tries to send
+ the response.
+ </para>
+ <para>
+ If this value is specified without units, it is taken as milliseconds.
+ The default value is <literal>0</literal>, which disables connection
+ checks. Without connection checks, the server will detect the loss of
+ the connection only when it is waiting for a new request, receiving a
+ request or sending a response.
+ </para>
+ <para>
+ For the kernel itself to detect lost TCP connections reliably and
+ within a known timeframe in all scenarios including network failure, it
+ may also be necessary to adjust the default TCP keepalive settings of
+ the operating system, or the
+ <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-idle"/> and
+ <xref linkend="guc-tcp-keepalives-count"/> settings of
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..74b309a1ef 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -79,6 +79,7 @@
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/memutils.h"
+#include "utils/timeout.h"
/*
* Cope with the various platform-specific ways to spell TCP keepalive socket
@@ -104,6 +105,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -1921,3 +1923,53 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/* --------------------------------
+ * pq_check_client_connection - is the client still connected?
+ * --------------------------------
+ */
+bool
+pq_check_client_connection(void)
+{
+ bool connected = true;
+ char nextbyte;
+ int r;
+
+retry:
+#ifdef WIN32
+ pgwin32_noblock = 1;
+#endif
+ r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK);
+#ifdef WIN32
+ pgwin32_noblock = 0;
+#endif
+
+ if (r == 0)
+ {
+ /* EOF detected. */
+ connected = false;
+ }
+ else if (r > 0)
+ {
+ /* Data available to read. Connection looks good. */
+ }
+ else if (errno == EINTR)
+ {
+ /* Interrupted by a signal, so retry. */
+ goto retry;
+ }
+ else if (errno == EAGAIN || errno == EWOULDBLOCK)
+ {
+ /* No data available to read. Connection looks good. */
+ }
+ else
+ {
+ /* Got some other error. We'd better log the reason. */
+ ereport(COMMERROR,
+ (errcode(ERRCODE_CONNECTION_EXCEPTION),
+ errmsg("could not check client connection: %m")));
+ connected = false;
+ }
+
+ return connected;
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 2b1b68109f..44fb7b7c30 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2671,6 +2671,14 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval > 0 &&
+ IsUnderPostmaster &&
+ MyProcPort &&
+ !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -3149,6 +3157,25 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ {
+ CheckClientConnectionPending = false;
+
+ /*
+ * If we're idle or reading a command, we can skip the explicit check
+ * for a lost connection (if configured), because that'll be detected
+ * by socket I/O routines, and a new check will be rescheduled if
+ * necessary when the command runs. We don't want needless wakeups of
+ * idle sessions.
+ */
+ if (!DoingCommandRead && client_connection_check_interval > 0)
+ {
+ if (!pq_check_client_connection())
+ ClientConnectionLost = true;
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
+ }
+ }
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 73e0a672ae..a9f0fc3017 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7abeccb536..0bac23e75d 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -34,6 +34,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +622,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1242,6 +1245,14 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3b36a31a47..391d9983e0 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3483,6 +3483,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of 0 disables this feature."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 86425965d0..dd850bf272 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -718,6 +718,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 0 # set time interval between
+ # checks for client disconnection while
+ # running long queries; 0 for never
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..3bd97c4e93 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern bool pq_check_client_connection(void);
/*
* prototypes for functions in be-secure.c
@@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port);
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..6f8251e0b0 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
--
2.30.1
v6-0002-WIP-use-secure_read-to-peek.patchtext/x-patch; charset=US-ASCII; name=v6-0002-WIP-use-secure_read-to-peek.patchDownload
From a0105faf49ade2c695523b88c927c402aae6eb53 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 23 Mar 2021 22:15:23 +1300
Subject: [PATCH v6 2/2] WIP -- use secure_read() to peek
XXX Just a sketch, probably has bugs
---
doc/src/sgml/config.sgml | 4 +-
src/backend/libpq/pqcomm.c | 84 ++++++++++++++++++-------------------
src/backend/tcop/postgres.c | 11 ++++-
src/include/libpq/libpq.h | 2 +-
4 files changed, 53 insertions(+), 48 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5cd0d38dbf..e522be460c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1008,8 +1008,8 @@ include_dir 'conf.d'
<para>
Sets the time interval between checks that the client is still
connected, while running queries. The check is performed by testing
- whether one byte could be read from the socket with
- <symbol>MSG_PEEK</symbol>. If the kernel reports that the connection
+ whether a part of the next message can be read from the client.
+ If the kernel reports that the connection
has been closed or lost, a long running query can abort immediately,
rather than discovering the problem when it eventually tries to send
the response.
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 74b309a1ef..a763a1b535 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -921,13 +921,13 @@ socket_set_nonblocking(bool nonblocking)
}
/* --------------------------------
- * pq_recvbuf - load some bytes into the input buffer
+ * pq_recvbuf_ext - load some bytes into the input buffer
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int
-pq_recvbuf(void)
+pq_recvbuf_ext(bool nonblocking)
{
if (PqRecvPointer > 0)
{
@@ -943,8 +943,7 @@ pq_recvbuf(void)
PqRecvLength = PqRecvPointer = 0;
}
- /* Ensure that we're in blocking mode */
- socket_set_nonblocking(false);
+ socket_set_nonblocking(nonblocking);
/* Can fill buffer from PqRecvLength and upwards */
for (;;)
@@ -956,6 +955,9 @@ pq_recvbuf(void)
if (r < 0)
{
+ if (nonblocking && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return 0;
+
if (errno == EINTR)
continue; /* Ok if interrupted */
@@ -983,6 +985,13 @@ pq_recvbuf(void)
}
}
+static int
+pq_recvbuf(void)
+{
+ return pq_recvbuf_ext(false);
+}
+
+
/* --------------------------------
* pq_getbyte - get a single byte from connection, or return EOF
* --------------------------------
@@ -1924,52 +1933,39 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
-/* --------------------------------
- * pq_check_client_connection - is the client still connected?
- * --------------------------------
+/*
+ * Proactively check if the client connection has gone away. Return 0 if still
+ * connected or currently reading a message already, EOF if disconnected, and 1
+ * if at least one byte is available to read. If 1 is returned, the first
+ * byte of the next message is written to *c without consuming it, but we can't
+ * find out if the client has disconnected until we consume more data.
*/
-bool
-pq_check_client_connection(void)
+int
+pq_check_client_connection(unsigned char *c)
{
- bool connected = true;
- char nextbyte;
- int r;
-
-retry:
-#ifdef WIN32
- pgwin32_noblock = 1;
-#endif
- r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK);
-#ifdef WIN32
- pgwin32_noblock = 0;
-#endif
+ /* We're already in the middle of a message, so assume we are connected. */
+ if (PqCommReadingMsg)
+ return 0;
- if (r == 0)
- {
- /* EOF detected. */
- connected = false;
- }
- else if (r > 0)
- {
- /* Data available to read. Connection looks good. */
- }
- else if (errno == EINTR)
- {
- /* Interrupted by a signal, so retry. */
- goto retry;
- }
- else if (errno == EAGAIN || errno == EWOULDBLOCK)
+ /* Do we have a byte already in our read buffer? */
+ if (PqRecvPointer < PqRecvLength)
{
- /* No data available to read. Connection looks good. */
+ *c = PqRecvBuffer[PqRecvPointer];
+ return 1;
}
- else
+
+ /* Try to read at least one byte from secure_read() without blocking. */
+ if (pq_recvbuf_ext(true))
+ return EOF;
+
+ /* Now do we have a byte in our read buffer? */
+ if (PqRecvPointer < PqRecvLength)
{
- /* Got some other error. We'd better log the reason. */
- ereport(COMMERROR,
- (errcode(ERRCODE_CONNECTION_EXCEPTION),
- errmsg("could not check client connection: %m")));
- connected = false;
+ *c = PqRecvBuffer[PqRecvPointer];
+ return 1;
}
- return connected;
+ /* No data, but we're still connected. */
+
+ return 0;
}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 44fb7b7c30..23d1ee84da 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3170,7 +3170,16 @@ ProcessInterrupts(void)
*/
if (!DoingCommandRead && client_connection_check_interval > 0)
{
- if (!pq_check_client_connection())
+ unsigned char peekbyte;
+ int r;
+
+ /*
+ * Try to peek ahead to see if we've been disconnected. If we see
+ * a pipelined 'X' message, we'll treat that as a disconnection
+ * too, but we don't try to look further ahead than that.
+ */
+ r = pq_check_client_connection(&peekbyte);
+ if (r == EOF || (r == 1 && peekbyte == 'X'))
ClientConnectionLost = true;
enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
client_connection_check_interval);
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index 3bd97c4e93..f6cd3ee067 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,7 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
-extern bool pq_check_client_connection(void);
+extern int pq_check_client_connection(unsigned char *c);
/*
* prototypes for functions in be-secure.c
--
2.30.1
On Tue, Mar 23, 2021 at 11:47 PM Thomas Munro <thomas.munro@gmail.com> wrote:
That leaves the thorny problem Tom mentioned at the top of this
thread[1]: this socket-level approach can be fooled by an 'X' sitting
in the socket buffer, left there by a client that did PQsendQuery() and
then PQfinish(). Or perhaps even by SSL messages invisible to our protocol
level. That can surely only be addressed by moving the 'peeking' one
level up the protocol stack. I've attached a WIP attempt to do that,
on top of the other patch. Lookahead happens in our receive buffer,
not the kernel's socket buffer.
After sleeping on this, I'm still not seeing any problem with this
approach. Sanity checks welcome. Of course that function should be
called something like pq_peekmessage() -- done. I think this patch
addresses all critiques leveled at the earlier versions, and I've
tested this with SSL and non-SSL connections, by killing psql while a
query runs, and using a client that calls PQfinish() after starting a
query, and in an earlier version I did yank-the-cable testing, having
set up TCP keepalive to make that last case work.
Attachments:
v7-0001-Detect-dropped-connections-while-running-queries.patchtext/x-patch; charset=US-ASCII; name=v7-0001-Detect-dropped-connections-while-running-queries.patchDownload
From f7fd8640ebac242f21719574b0e92dc7cfba4041 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 1 Mar 2021 18:08:23 +1300
Subject: [PATCH v7] Detect dropped connections while running queries.
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@sraoss.co.jp>
Reviewed-by: Konstantin Knizhnik <k.knizhnik@postgrespro.ru>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (much earlier version)
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 36 +++++++++++
src/backend/libpq/pqcomm.c | 60 +++++++++++++++++--
src/backend/tcop/postgres.c | 34 +++++++++++
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 11 ++++
src/backend/utils/misc/guc.c | 10 ++++
src/backend/utils/misc/postgresql.conf.sample | 3 +
src/include/libpq/libpq.h | 2 +
src/include/miscadmin.h | 1 +
src/include/utils/timeout.h | 1 +
10 files changed, 155 insertions(+), 4 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5679b40dd5..e522be460c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -998,6 +998,42 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets the time interval between checks that the client is still
+ connected, while running queries. The check is performed by testing
+ whether a part of the next message can be read from the client.
+ If the kernel reports that the connection
+ has been closed or lost, a long running query can abort immediately,
+ rather than discovering the problem when it eventually tries to send
+ the response.
+ </para>
+ <para>
+ If this value is specified without units, it is taken as milliseconds.
+ The default value is <literal>0</literal>, which disables connection
+ checks. Without connection checks, the server will detect the loss of
+ the connection only when it is waiting for a new request, receiving a
+ request or sending a response.
+ </para>
+ <para>
+ For the kernel itself to detect lost TCP connections reliably and
+ within a known timeframe in all scenarios including network failure, it
+ may also be necessary to adjust the default TCP keepalive settings of
+ the operating system, or the
+ <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-idle"/> and
+ <xref linkend="guc-tcp-keepalives-count"/> settings of
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..a12ed3f851 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -79,6 +79,7 @@
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/memutils.h"
+#include "utils/timeout.h"
/*
* Cope with the various platform-specific ways to spell TCP keepalive socket
@@ -104,6 +105,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -919,13 +921,13 @@ socket_set_nonblocking(bool nonblocking)
}
/* --------------------------------
- * pq_recvbuf - load some bytes into the input buffer
+ * pq_recvbuf_ext - load some bytes into the input buffer
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int
-pq_recvbuf(void)
+pq_recvbuf_ext(bool nonblocking)
{
if (PqRecvPointer > 0)
{
@@ -941,8 +943,7 @@ pq_recvbuf(void)
PqRecvLength = PqRecvPointer = 0;
}
- /* Ensure that we're in blocking mode */
- socket_set_nonblocking(false);
+ socket_set_nonblocking(nonblocking);
/* Can fill buffer from PqRecvLength and upwards */
for (;;)
@@ -954,6 +955,9 @@ pq_recvbuf(void)
if (r < 0)
{
+ if (nonblocking && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return 0;
+
if (errno == EINTR)
continue; /* Ok if interrupted */
@@ -981,6 +985,13 @@ pq_recvbuf(void)
}
}
+static int
+pq_recvbuf(void)
+{
+ return pq_recvbuf_ext(false);
+}
+
+
/* --------------------------------
* pq_getbyte - get a single byte from connection, or return EOF
* --------------------------------
@@ -1921,3 +1932,44 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/*
+ * Peek at the first byte of the next message from the client, without
+ * consuming it.
+ *
+ * Return 0 if there isn't at least one byte of a new message in our receive
+ * buffer or the socket yet, or if we're in the middle of reading a message
+ * already so we can't see the next message yet.
+ *
+ * Return EOF if the connection is closed.
+ *
+ * Return 1 if there is at least one byte of data available from the start of
+ * the next message, and write that byte into *c.
+ */
+int
+pq_peekmessage(unsigned char *c)
+{
+ /* We're already in the middle of a message. */
+ if (PqCommReadingMsg)
+ return 0;
+
+ /* Do we have a byte already in our receive buffer? */
+ if (PqRecvPointer < PqRecvLength)
+ {
+ *c = PqRecvBuffer[PqRecvPointer];
+ return 1;
+ }
+
+ /* Try to read at least one byte from secure_read() without blocking. */
+ if (pq_recvbuf_ext(true))
+ return EOF;
+
+ /* Now do we have a byte in our receive buffer? */
+ if (PqRecvPointer < PqRecvLength)
+ {
+ *c = PqRecvBuffer[PqRecvPointer];
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 2b1b68109f..aab90a3a78 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2671,6 +2671,14 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval > 0 &&
+ IsUnderPostmaster &&
+ MyProcPort &&
+ !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -3149,6 +3157,32 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ {
+ CheckClientConnectionPending = false;
+
+ /*
+ * Check for lost connection and re-arm, if still configured, but not
+ * if we've arrived back at DoingCommandRead state. We don't want to
+ * wake up idle sessions, and they already know how to detect lost
+ * connections.
+ */
+ if (!DoingCommandRead && client_connection_check_interval > 0)
+ {
+ unsigned char next_message;
+ int r;
+
+ /*
+ * Does the kernel think we have been disconnected, or is there a
+ * pipelined terminate message from the client?
+ */
+ r = pq_peekmessage(&next_message);
+ if (r == EOF || (r == 1 && next_message == 'X'))
+ ClientConnectionLost = true;
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
+ }
+ }
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 73e0a672ae..a9f0fc3017 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7abeccb536..0bac23e75d 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -34,6 +34,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +622,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1242,6 +1245,14 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3b36a31a47..391d9983e0 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3483,6 +3483,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of 0 disables this feature."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 86425965d0..dd850bf272 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -718,6 +718,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 0 # set time interval between
+ # checks for client disconnection while
+ # running long queries; 0 for never
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..a2255cc0da 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern int pq_peekmessage(unsigned char *c);
/*
* prototypes for functions in be-secure.c
@@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port);
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..6f8251e0b0 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
--
2.30.1
Hi,
In the description:
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
I think moving 'periodically' to the vicinity of 'to check' would make the
sentence more readable.
+ the operating system, or the
+ <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-idle"/> and
The same guc is listed twice. I am not sure if that was intended.
Cheers
Going back a couple of years to something Konstantin said:
On Sat, Aug 3, 2019 at 4:40 AM Konstantin Knizhnik
<k.knizhnik@postgrespro.ru> wrote:
But I wonder why we can not just perform poll with the POLLOUT flag and zero
timeout.
If the OS detected a closed connection, it should return POLLHUP, shouldn't it?
I am not sure if it is a more portable or more efficient way - it just seems
to be a little bit more natural way (from my point of view) to check if
the connection is still alive.
... Andres just asked me the same question, when we were discussing
the pq_peekmessage() patch (v7). I had remembered that POLLHUP didn't
work for this type of thing, from some earlier attempt at something
similar, and indeed on my first attempt to do that here as an
alternative design, it did not work... with TCP sockets (localhost)...
though it did work with Unix sockets. Gah! Then he pointed me at
POLLRDHUP (a Linux only extension) and that did seem to work in all
cases I tried. But without that, this v8 patch doesn't seem to work
on FreeBSD (TCP), and for the rest of the menagerie, who knows?
Here's a sketch patch like that for discussion.
It's frustrating, because this patch is so simple, and doesn't have
v7's problem with pipelined queries. Hmm.
(I tried to make it work on Windows too by reading the manual, no idea
if that part compiles or works).
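As a stand-alone illustration of that difference (not part of the patch, and
Linux-specific because of POLLRDHUP), the following sketch half-closes one end
of a loopback TCP connection and polls the other end; on Linux the half-close
is reported as POLLRDHUP while POLLHUP stays clear, which is exactly the
portability question above. Error checking is omitted for brevity.

#define _GNU_SOURCE				/* for POLLRDHUP */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int
main(void)
{
	int			listener = socket(AF_INET, SOCK_STREAM, 0);
	int			client;
	int			server;
	struct sockaddr_in addr;
	socklen_t	addrlen = sizeof(addr);
	struct pollfd pfd;

	/* Set up a loopback TCP connection on an ephemeral port. */
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = 0;
	bind(listener, (struct sockaddr *) &addr, sizeof(addr));
	listen(listener, 1);
	getsockname(listener, (struct sockaddr *) &addr, &addrlen);
	client = socket(AF_INET, SOCK_STREAM, 0);
	connect(client, (struct sockaddr *) &addr, sizeof(addr));
	server = accept(listener, NULL, NULL);

	/* Simulate the client going away: half-close its sending direction. */
	shutdown(client, SHUT_WR);

	/* Wait briefly for the FIN, then report what poll() sees. */
	pfd.fd = server;
	pfd.events = POLLIN | POLLRDHUP;
	pfd.revents = 0;
	poll(&pfd, 1, 1000);
	printf("POLLIN=%d POLLHUP=%d POLLRDHUP=%d\n",
		   (pfd.revents & POLLIN) != 0,
		   (pfd.revents & POLLHUP) != 0,
		   (pfd.revents & POLLRDHUP) != 0);

	close(client);
	close(server);
	close(listener);
	return 0;
}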
Attachments:
v8-0001-Detect-POLLHUP-while-running-queries.patchtext/x-patch; charset=US-ASCII; name=v8-0001-Detect-POLLHUP-while-running-queries.patchDownload
From 49948a11ebac1d835735ccedb9e0e2484d1cc13f Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 1 Mar 2021 18:08:23 +1300
Subject: [PATCH v8] Detect POLLHUP while running queries.
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@sraoss.co.jp>
Reviewed-by: Konstantin Knizhnik <k.knizhnik@postgrespro.ru>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (much earlier version)
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 32 ++++++++++
src/backend/libpq/pqcomm.c | 64 +++++++++++++++++--
src/backend/tcop/postgres.c | 27 ++++++++
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 11 ++++
src/backend/utils/misc/guc.c | 10 +++
src/backend/utils/misc/postgresql.conf.sample | 3 +
src/include/libpq/libpq.h | 2 +
src/include/miscadmin.h | 1 +
src/include/utils/timeout.h | 1 +
10 files changed, 148 insertions(+), 4 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5679b40dd5..57a174a192 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -998,6 +998,38 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets the time interval between checks that the client is still
+ connected, while running queries. The check is performed by polling
+ the socket, which allows long running queries to be aborted immediately.
+ </para>
+ <para>
+ If this value is specified without units, it is taken as milliseconds.
+ The default value is <literal>0</literal>, which disables connection
+ checks. Without connection checks, the server will detect the loss of
+ the connection only while it is waiting for a new request, receiving a
+ request or sending a response.
+ </para>
+ <para>
+ For the kernel itself to detect lost TCP connections reliably and
+ within a known timeframe in all scenarios including network failure, it
+ may also be necessary to adjust the default TCP keepalive settings of
+ the operating system, or the
+ <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-interval"/> and
+ <xref linkend="guc-tcp-keepalives-count"/> settings of
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..00901ffe9b 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -59,6 +59,9 @@
#include <grp.h>
#include <unistd.h>
#include <sys/file.h>
+#ifdef HAVE_POLL_H
+#include <sys/poll.h>
+#endif
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
@@ -79,6 +82,7 @@
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/memutils.h"
+#include "utils/timeout.h"
/*
* Cope with the various platform-specific ways to spell TCP keepalive socket
@@ -104,6 +108,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -919,13 +924,13 @@ socket_set_nonblocking(bool nonblocking)
}
/* --------------------------------
- * pq_recvbuf - load some bytes into the input buffer
+ * pq_recvbuf_ext - load some bytes into the input buffer
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int
-pq_recvbuf(void)
+pq_recvbuf_ext(bool nonblocking)
{
if (PqRecvPointer > 0)
{
@@ -941,8 +946,7 @@ pq_recvbuf(void)
PqRecvLength = PqRecvPointer = 0;
}
- /* Ensure that we're in blocking mode */
- socket_set_nonblocking(false);
+ socket_set_nonblocking(nonblocking);
/* Can fill buffer from PqRecvLength and upwards */
for (;;)
@@ -954,6 +958,9 @@ pq_recvbuf(void)
if (r < 0)
{
+ if (nonblocking && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return 0;
+
if (errno == EINTR)
continue; /* Ok if interrupted */
@@ -981,6 +988,13 @@ pq_recvbuf(void)
}
}
+static int
+pq_recvbuf(void)
+{
+ return pq_recvbuf_ext(false);
+}
+
+
/* --------------------------------
* pq_getbyte - get a single byte from connection, or return EOF
* --------------------------------
@@ -1921,3 +1935,45 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/*
+ * POLLRDHUP is a Linux extension to poll(2). Unlike POLLHUP, it has to be
+ * requested explicitly. It detects half-shutdown sockets, which are not
+ * always reported as POLLHUP on Linux, depending on the type of socket. We'll
+ * look out for both.
+ */
+#ifdef POLLRDHUP
+#define PG_POLLRDHUP POLLRDHUP
+#else
+#define PG_POLLRDHUP 0
+#endif
+
+/*
+ * Check if the client is still connected.
+ */
+bool
+pq_check_connection(void)
+{
+#if defined(HAVE_POLL) || defined(WIN32)
+ struct pollfd pollfd;
+ int rc;
+
+ pollfd.fd = MyProcPort->sock;
+ pollfd.events = POLLOUT | POLLIN | PG_POLLRDHUP;
+ pollfd.revents = 0;
+#ifdef WIN32
+ rc = WSAPoll(&pollfd, 1, 0);
+#else
+ rc = poll(&pollfd, 1, 0);
+#endif
+ if (rc < 0)
+ {
+ elog(COMMERROR, "could not poll socket: %m");
+ return false;
+ }
+ else if (rc == 1 && (pollfd.revents & (POLLHUP | PG_POLLRDHUP)))
+ return false;
+#endif
+
+ return true;
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 2b1b68109f..b04573cd43 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2671,6 +2671,14 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval > 0 &&
+ IsUnderPostmaster &&
+ MyProcPort &&
+ !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -3149,6 +3157,25 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ {
+ CheckClientConnectionPending = false;
+
+ /*
+ * Check for lost connection and re-arm, if still configured, but not
+ * if we've arrived back at DoingCommandRead state. We don't want to
+ * wake up idle sessions, and they already know how to detect lost
+ * connections.
+ */
+ if (!DoingCommandRead && client_connection_check_interval > 0)
+ {
+ if (!pq_check_connection())
+ ClientConnectionLost = true;
+ else
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
+ }
+ }
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 73e0a672ae..a9f0fc3017 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7abeccb536..0bac23e75d 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -34,6 +34,7 @@
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_tablespace.h"
#include "libpq/auth.h"
+#include "libpq/libpq.h"
#include "libpq/libpq-be.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
@@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +622,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1242,6 +1245,14 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 3b36a31a47..391d9983e0 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3483,6 +3483,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of 0 disables this feature."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 86425965d0..dd850bf272 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -718,6 +718,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 0 # set time interval between
+ # checks for client disconnection while
+ # running long queries; 0 for never
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..fac233c9f4 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern bool pq_check_connection(void);
/*
* prototypes for functions in be-secure.c
@@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port);
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..6f8251e0b0 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
--
2.30.1
Hi,
On 2021-03-24 16:08:13 +1300, Thomas Munro wrote:
... Andres just asked me the same question, when we were discussing
the pq_peekmessage() patch (v7). I had remembered that POLLHUP didn't
work for this type of thing, from some earlier attempt at something
similar, and indeed on my first attempt to do that here as an
alternative design, it did not work... with TCP sockets (localhost)...
though it did work with Unix sockets. Gah! Then he pointed me at
POLLRDHUP (a Linux only extension) and that did seem to work in all
cases I tried. But without that, this v8 patch doesn't seem to work
on FreeBSD (TCP), and for the rest of the menagerie, who knows?
Here's a sketch patch like that for discussion.
It's frustrating, because this patch is so simple, and doesn't have
v7's problem with pipelined queries. Hmm.
It is indeed frustrating. I searched a bit for other OSs and POLLRDHUP
and I'm annoyed by responses from various OS folks of "You don't need
that, just read the data upon POLLIN...".
I don't like the feature not handling pipelining etc., nor does having it
silently work only on Linux seem like a great answer. I guess we could
have the template files tell us whether it works, or test for it in
configure, but brrr.
I'm mostly joking, and I've not read the thread, but I assume just
sending an empty NoticeResponse or error message has been brought up and
laughed out of the room?
(I tried to make it work on Windows too by reading the manual, no idea
if that part compiles or works).
If the patch had tests, cfbot might tell you :)
Greetings,
Andres Freund
Hi Thomas! Thanks for working on this patch.
I have attached a new version with some typo corrections in the doc entry,
removal of redundant `include` entries and trailing whitespace. I also
added to the doc the case where a single-query transaction in autocommit
mode with a disconnected client might eventually be committed upon
completion if it doesn't return any rows (doesn't communicate with the
user) before sending the final commit message. This behavior might be
unexpected for clients and hence IMO it's worth noting.
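To make that concrete, here is a hedged libpq sketch (the table name,
conninfo and interval are made up for illustration) of a client that starts a
data-modifying statement in autocommit mode and then dies without ever
reading the result. With the default client_connection_check_interval = 0 the
row still appears once pg_sleep() returns -- the silent, delayed commit
described above -- while on a patched server with a non-zero interval the
backend should notice the dead client and abort the statement instead.

#include <unistd.h>
#include <libpq-fe.h>

int
main(void)
{
	/*
	 * Hypothetical conninfo: assumes a patched server and a table created
	 * beforehand with "CREATE TABLE t (x int)".
	 */
	PGconn	   *conn = PQconnectdb(
		"dbname=postgres options='-c client_connection_check_interval=1000'");

	if (PQstatus(conn) != CONNECTION_OK)
		return 1;

	/* Autocommit, single statement: the INSERT commits as soon as it completes. */
	PQsendQuery(conn, "INSERT INTO t SELECT 1 FROM pg_sleep(10)");

	/*
	 * Die abruptly, without PQfinish(): the kernel closes the socket, no
	 * Terminate message is sent, and no result is ever read.
	 */
	_exit(0);
}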
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..8cf95d09a4 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -919,13 +923,13 @@ socket_set_nonblocking(bool nonblocking)
}
/* --------------------------------
- * pq_recvbuf - load some bytes into the input buffer
+ * pq_recvbuf_ext - load some bytes into the input buffer
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int
-pq_recvbuf(void)
+pq_recvbuf_ext(bool nonblocking)
{
if (PqRecvPointer > 0)
{
@@ -941,8 +945,7 @@ pq_recvbuf(void)
PqRecvLength = PqRecvPointer = 0;
}
- /* Ensure that we're in blocking mode */
- socket_set_nonblocking(false);
+ socket_set_nonblocking(nonblocking);
/* Can fill buffer from PqRecvLength and upwards */
for (;;)
@@ -954,6 +957,9 @@ pq_recvbuf(void)
if (r < 0)
{
+ if (nonblocking && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return 0;
+
if (errno == EINTR)
continue; /* Ok if interrupted */
@@ -981,6 +987,13 @@ pq_recvbuf(void)
}
}
+static int
+pq_recvbuf(void)
+{
+ return pq_recvbuf_ext(false);
+}
+
+
AFAICS, the above fragment is not directly related to the primary fix.
AFAICS, the following open items prevent the current patch from being
treated as complete:
* Absence of TAP tests emulating some client disconnection scenarios.
IIRC, you wanted to rewrite the test case posted by Sergey.
* Concerns about the portability of the `pq_check_connection()`
implementation. BTW, on Windows, postgres with this patch has not been
built successfully [1].
* Absence of benchmark results showing that there is no noticeable
performance regression when a non-zero client liveness check interval
is applied.
1.
https://ci.appveyor.com/project/postgresql-cfbot/postgresql/build/1.0.131820
--
Regards,
Maksim Milyutin
Attachments:
v9-0001-Detect-POLLHUP-while-running-queries.patchtext/x-patch; charset=UTF-8; name=v9-0001-Detect-POLLHUP-while-running-queries.patchDownload
From 3ec788ac5e7c47fe135a3618849db179942f4b27 Mon Sep 17 00:00:00 2001
From: Maksim Milyutin <milyutinma@gmail.com>
Date: Fri, 26 Mar 2021 10:18:30 +0300
Subject: [PATCH v9] Detect POLLHUP while running queries
Provide a new optional GUC that can be used to check whether the client
connection has gone away periodically while running very long queries.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@sraoss.co.jp>
Reviewed-by: Konstantin Knizhnik <k.knizhnik@postgrespro.ru>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (much earlier version)
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 36 +++++++++++
src/backend/libpq/pqcomm.c | 63 +++++++++++++++++--
src/backend/tcop/postgres.c | 27 ++++++++
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 10 +++
src/backend/utils/misc/guc.c | 10 +++
src/backend/utils/misc/postgresql.conf.sample | 3 +
src/include/libpq/libpq.h | 2 +
src/include/miscadmin.h | 1 +
src/include/utils/timeout.h | 1 +
10 files changed, 150 insertions(+), 4 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index ddc6d789d8..abec47ada7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -998,6 +998,42 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets the time interval between checks that the client is still
+ connected, while running queries. The check is performed by polling the
+ socket, which allows long running queries to be aborted immediately.
+ </para>
+ <para>
+ If this value is specified without units, it is taken as milliseconds.
+ The default value is <literal>0</literal>, which disables connection
+ checks. Without connection checks, the server will detect the loss of
+ the connection only at the next interaction with the client through the
+ connection socket: when it starts to wait for a new request, receives a
+ request or sends a response. As a consequence, a data-modifying single
+ query transaction in autocommit mode that doesn't return any rows may
+ be committed some time later, silently and unexpectedly for the
+ previously disconnected client.
+ </para>
+ <para>
+ For the kernel itself to detect lost TCP connections reliably and within
+ a known timeframe in all scenarios including network failure, it may
+ also be necessary to adjust the default TCP keepalive settings of the
+ operating system, or the
+ <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-interval"/> and
+ <xref linkend="guc-tcp-keepalives-count"/> settings of
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..8cf95d09a4 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -59,6 +59,9 @@
#include <grp.h>
#include <unistd.h>
#include <sys/file.h>
+#ifdef HAVE_POLL_H
+#include <sys/poll.h>
+#endif
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
@@ -104,6 +107,7 @@
*/
int Unix_socket_permissions;
char *Unix_socket_group;
+int client_connection_check_interval;
/* Where the Unix socket files are (list of palloc'd strings) */
static List *sock_paths = NIL;
@@ -919,13 +923,13 @@ socket_set_nonblocking(bool nonblocking)
}
/* --------------------------------
- * pq_recvbuf - load some bytes into the input buffer
+ * pq_recvbuf_ext - load some bytes into the input buffer
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int
-pq_recvbuf(void)
+pq_recvbuf_ext(bool nonblocking)
{
if (PqRecvPointer > 0)
{
@@ -941,8 +945,7 @@ pq_recvbuf(void)
PqRecvLength = PqRecvPointer = 0;
}
- /* Ensure that we're in blocking mode */
- socket_set_nonblocking(false);
+ socket_set_nonblocking(nonblocking);
/* Can fill buffer from PqRecvLength and upwards */
for (;;)
@@ -954,6 +957,9 @@ pq_recvbuf(void)
if (r < 0)
{
+ if (nonblocking && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return 0;
+
if (errno == EINTR)
continue; /* Ok if interrupted */
@@ -981,6 +987,13 @@ pq_recvbuf(void)
}
}
+static int
+pq_recvbuf(void)
+{
+ return pq_recvbuf_ext(false);
+}
+
+
/* --------------------------------
* pq_getbyte - get a single byte from connection, or return EOF
* --------------------------------
@@ -1921,3 +1934,45 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/*
+ * POLLRDHUP is a Linux extension to poll(2). Unlike POLLHUP, it has to be
+ * requested explicitly. It detects half-shutdown sockets, which are not
+ * always reported as POLLHUP on Linux, depending on the type of socket. We'll
+ * look out for both.
+ */
+#ifdef POLLRDHUP
+#define PG_POLLRDHUP POLLRDHUP
+#else
+#define PG_POLLRDHUP 0
+#endif
+
+/*
+ * Check if the client is still connected.
+ */
+bool
+pq_check_connection(void)
+{
+#if defined(HAVE_POLL) || defined(WIN32)
+ struct pollfd pollfd;
+ int rc;
+
+ pollfd.fd = MyProcPort->sock;
+ pollfd.events = POLLOUT | POLLIN | PG_POLLRDHUP;
+ pollfd.revents = 0;
+#ifdef WIN32
+ rc = WSAPoll(&pollfd, 1, 0);
+#else
+ rc = poll(&pollfd, 1, 0);
+#endif
+ if (rc < 0)
+ {
+ elog(COMMERROR, "could not poll socket: %m");
+ return false;
+ }
+ else if (rc == 1 && (pollfd.revents & (POLLHUP | PG_POLLRDHUP)))
+ return false;
+#endif
+
+ return true;
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 2b1b68109f..b04573cd43 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2671,6 +2671,14 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval > 0 &&
+ IsUnderPostmaster &&
+ MyProcPort &&
+ !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -3149,6 +3157,25 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ {
+ CheckClientConnectionPending = false;
+
+ /*
+ * Check for lost connection and re-arm, if still configured, but not
+ * if we've arrived back at DoingCommandRead state. We don't want to
+ * wake up idle sessions, and they already know how to detect lost
+ * connections.
+ */
+ if (!DoingCommandRead && client_connection_check_interval > 0)
+ {
+ if (!pq_check_connection())
+ ClientConnectionLost = true;
+ else
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
+ }
+ }
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 73e0a672ae..a9f0fc3017 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7abeccb536..a3ec358538 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -73,6 +73,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +621,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1242,6 +1244,14 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 0c5dc4d3e8..9f67e8140e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3481,6 +3481,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval for checking connection with the client."),
+ gettext_noop("A value of 0 disables this feature."),
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index b234a6bfe6..9691cb86f1 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -717,6 +717,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 0 # set time interval between
+ # checks for client disconnection while
+ # running long queries; 0 for never
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..fac233c9f4 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern bool pq_check_connection(void);
/*
* prototypes for functions in be-secure.c
@@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port);
extern char *SSLCipherSuites;
extern char *SSLECDHCurve;
extern bool SSLPreferServerCiphers;
+extern int client_connection_check_interval;
extern int ssl_min_protocol_version;
extern int ssl_max_protocol_version;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..6f8251e0b0 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
--
2.25.1
On Tue, Mar 30, 2021 at 6:25 AM Maksim Milyutin <milyutinma@gmail.com> wrote:
Hi Thomas! Thanks for working on this patch.
I have attached a new version with some typo corrections in the doc entry,
removal of redundant `include` entries and trailing whitespace. I also
added to the doc the case where a single-query transaction with a disconnected
client might eventually be committed upon completion in autocommit mode,
if it doesn't return any rows (doesn't communicate with the user) before
sending the final commit message. This behavior might be unexpected for
clients and hence IMO it's worth noting.
Thanks!
+ * pq_recvbuf_ext - load some bytes into the input buffer
AFAICS, the above fragment is not directly related to the primary fix.
Right, that was some stuff left over from the v7 patch that I forgot to remove.
AFAICS, there are the following open items that prevent the current patch
from being treated as complete:
* Absence of TAP tests emulating some scenarios of client disconnection.
IIRC, you wanted to rewrite the test case posted by Sergey.
Yeah, it's a bit tricky to write a test that won't fail randomly on
slow animals, but I think I see how now (I think you need to send
SELECT pg_sleep(60), then in another connection, wait until that shows
as running in pg_stat_activity, and then kill the first process, and
then wait until the log contains a disconnected-for-this-reason
message). If we can answer the big question below, I'll invest the
time to do this.
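(Purely as an illustration, not the eventual TAP test: the client half of that
scenario fits in a few lines of libpq. Connection settings are assumed to come
from the usual PG* environment variables, and checking pg_stat_activity and the
server log for the resulting disconnection message would still be the test
driver's job.)
#include <stdio.h>
#include <unistd.h>
#include <libpq-fe.h>

int
main(void)
{
	PGconn	   *conn = PQconnectdb("");	/* settings from PG* environment variables */

	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
		return 1;
	}

	/* Dispatch the long-running query without waiting for its result. */
	if (!PQsendQuery(conn, "SELECT pg_sleep(60)"))
	{
		fprintf(stderr, "PQsendQuery failed: %s", PQerrorMessage(conn));
		return 1;
	}

	/* Crude stand-in for polling pg_stat_activity: let the query start. */
	sleep(1);

	/*
	 * Exit without PQfinish(), i.e. without sending a Terminate ('X')
	 * message -- much like kill -9.  The kernel sends a FIN on process
	 * exit, and on Linux the backend should then log the lost connection
	 * within about client_connection_check_interval.
	 */
	return 0;
}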
* Concerns about the portability of the `pq_check_connection()` implementation.
Yes. This is the main problem for me. I just did a bit of testing to
refresh my memory of what doesn't work. On FreeBSD, with TCP
keepalives enabled (I tested with tcp_keepalives_idle=1s,
tcp_keepalives_count=5, tcp_keepalives_interval=1s to make it
aggressive), the POLLHUP patch works if you pull out the ethernet
cable. But... it doesn't work if you kill -9 the remote psql process
(the remote kernel sends a FIN). So this is a patch that only works
the way we need it to work on Linux. Other OSes can only tell you
about this condition when you try to read or write.
BTW, on Windows, postgres with this patch does not build [1].
Huh, the manual says they have struct pollfd, and we include
winsock2.h. But I guess it doesn't work reliably on Windows anyway.
https://docs.microsoft.com/en-us/windows/win32/api/winsock2/ns-winsock2-wsapollfd
* Absence of benchmark results to show the lack of a noticeable performance
regression after applying a non-zero timeout for checking client liveness.
True, we should do that, but I'll be very surprised if it's still a
problem: the regression was caused by calling setitimer() for every
query, but we fixed that in commit 09cf1d52.
If we want to ship this in v14 we have to make a decision ASAP:
1. Ship the POLLHUP patch (like v9) that only works reliably on
Linux. Maybe disable the feature completely on other OSes?
2. Ship the patch that tries to read (like v7). It should work on
all systems, but it can be fooled by pipelined commands (though it can
detect a pipelined 'X').
Personally, I lean towards #2.
A third option was mentioned, but we have no patch:
3. We could try to *send* a message. This should work on all
systems. Unfortunately our protocol doesn't seem to have an existing
"no-op" or "heartbeat" message we could use for this, and sending a
NOTICE would be noisy. I suspect this would require some protocol
evolution, and there is no time for that in v14.
On Tue, Mar 30, 2021 at 10:00 AM Thomas Munro <thomas.munro@gmail.com> wrote:
If we want to ship this in v14 we have to make a decision ASAP:
1. Ship the POLLHUP patch (like v9) that only works reliably on
Linux. Maybe disable the feature completely on other OSes?
2. Ship the patch that tries to read (like v7). It should work on
all systems, but it can be fooled by pipelined commands (though it can
detect a pipelined 'X').
Personally, I lean towards #2.
I changed my mind. Let's commit the pleasingly simple Linux-only
feature for now, and extend it to send some kind of no-op message
in a later release. So this is the version I'd like to go with.
Objections?
I moved the GUC into tcop/postgres.c and tcop/tcopprot.h, because it
directly controls postgres.c's behaviour, not pqcomm.c's. The latter
only contains the code to perform the check.
Attachments:
v10-0001-Detect-POLLHUP-POLLRDHUP-while-running-queries.patchtext/x-patch; charset=US-ASCII; name=v10-0001-Detect-POLLHUP-POLLRDHUP-while-running-queries.patchDownload
From 1ad04e414d66ee23ce0612831648a25902661731 Mon Sep 17 00:00:00 2001
From: Maksim Milyutin <milyutinma@gmail.com>
Date: Fri, 26 Mar 2021 10:18:30 +0300
Subject: [PATCH v10] Detect POLLHUP/POLLRDHUP while running queries.
Provide a new optional GUC check_client_connection_interval that can be
used to check whether the client connection has gone away periodically
while running very long queries.
For now this is Linux-only, because POLLRDHUP is not in POSIX and other
OSes don't have a reliable way to know if a connection was shut down
without actually trying to read or write. In future we might extend
this to other OSes by trying to send a no-op/heartbeat message, but
that may require protocol changes.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@sraoss.co.jp>
Reviewed-by: Konstantin Knizhnik <k.knizhnik@postgrespro.ru>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (much earlier version)
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 35 ++++++++++++++++++
src/backend/libpq/pqcomm.c | 36 +++++++++++++++++++
src/backend/tcop/postgres.c | 30 ++++++++++++++++
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 10 ++++++
src/backend/utils/misc/guc.c | 26 ++++++++++++++
src/backend/utils/misc/postgresql.conf.sample | 3 ++
src/include/libpq/libpq.h | 1 +
src/include/miscadmin.h | 1 +
src/include/tcop/tcopprot.h | 1 +
src/include/utils/timeout.h | 1 +
11 files changed, 145 insertions(+)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d1e2e8c4c3..bc1a42fa91 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -998,6 +998,41 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets the time interval between checks that the client is still
+ connected, while running queries. The check is performed by polling
+ the socket to allow long running queries to be aborted immediately.
+ This option is currently available only on Linux, because it uses the
+ <symbol>POLLRDHUP</symbol> extension to the <symbol>poll</symbol>
+ system call.
+ </para>
+ <para>
+ If the value is specified without units, it is taken as milliseconds.
+ The default value is <literal>0</literal>, which disables connection
+ checks. Without connection checks, the server will detect the loss of
+ the connection only at the next interaction with the socket, while
+ waiting for or receiving a new request, or sending a response.
+ </para>
+ <para>
+ For the kernel itself to detect lost TCP connections reliably and within
+ a known timeframe in all scenarios including network failure, it may
+ also be necessary to adjust the default TCP keepalive settings of the
+ operating system, or the
+ <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-interval"/> and
+ <xref linkend="guc-tcp-keepalives-count"/> settings of
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..c4a67a27d2 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -54,6 +54,9 @@
*/
#include "postgres.h"
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
#include <signal.h>
#include <fcntl.h>
#include <grp.h>
@@ -1921,3 +1924,36 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/*
+ * Check if the client is still connected.
+ */
+bool
+pq_check_connection(void)
+{
+#if defined(POLLRDHUP)
+ /*
+ * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
+ * the other end. We don't have a portable way to do that without actually
+ * trying to read or write data on other systems. We don't want to read
+ * because that would be confused by buffered pipelined queries and COPY
+ * data. Perhaps in future we'll try to write a heartbeat message instead.
+ */
+ struct pollfd pollfd;
+ int rc;
+
+ pollfd.fd = MyProcPort->sock;
+ pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
+ pollfd.revents = 0;
+ rc = poll(&pollfd, 1, 0);
+ if (rc < 0)
+ {
+ elog(COMMERROR, "could not poll socket: %m");
+ return false;
+ }
+ else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
+ return false;
+#endif
+
+ return true;
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 2b1b68109f..481c697072 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -102,6 +102,9 @@ int max_stack_depth = 100;
/* wait N seconds to allow attach from a debugger */
int PostAuthDelay = 0;
+/* Time between checks that the client is still connected. */
+int client_connection_check_interval = 0;
+
/* ----------------
* private typedefs etc
* ----------------
@@ -2671,6 +2674,14 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval > 0 &&
+ IsUnderPostmaster &&
+ MyProcPort &&
+ !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -3149,6 +3160,25 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+ if (CheckClientConnectionPending)
+ {
+ CheckClientConnectionPending = false;
+
+ /*
+ * Check for lost connection and re-arm, if still configured, but not
+ * if we've arrived back at DoingCommandRead state. We don't want to
+ * wake up idle sessions, and they already know how to detect lost
+ * connections.
+ */
+ if (!DoingCommandRead && client_connection_check_interval > 0)
+ {
+ if (!pq_check_connection())
+ ClientConnectionLost = true;
+ else
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
+ }
+ }
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 73e0a672ae..a9f0fc3017 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7abeccb536..a3ec358538 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -73,6 +73,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +621,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1242,6 +1244,14 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 03daec9a08..ebebee3915 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -20,6 +20,7 @@
#include <float.h>
#include <math.h>
#include <limits.h>
+#include <poll.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
@@ -204,6 +205,7 @@ static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource sourc
static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source);
static bool check_huge_page_size(int *newval, void **extra, GucSource source);
+static bool check_client_connection_check_interval(int *newval, void **extra, GucSource source);
static void assign_pgstat_temp_directory(const char *newval, void *extra);
static bool check_application_name(char **newval, void **extra, GucSource source);
static void assign_application_name(const char *newval, void *extra);
@@ -3491,6 +3493,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval between checks for disconnection while running queries."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ check_client_connection_check_interval, NULL, NULL
+ },
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
@@ -11970,6 +11982,20 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
return true;
}
+static bool
+check_client_connection_check_interval(int *newval, void **extra, GucSource source)
+{
+#ifndef POLLRDHUP
+ /* Linux only, for now. See pq_check_connection(). */
+ if (*newval != 0)
+ {
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ return false;
+ }
+#endif
+ return true;
+}
+
static void
assign_pgstat_temp_directory(const char *newval, void *extra)
{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 791d39cf07..a799c0faa1 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -718,6 +718,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 0 # time between checks for client
+ # disconnection while running queries;
+ # 0 for never
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..3ebbc8d665 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern bool pq_check_connection(void);
/*
* prototypes for functions in be-secure.c
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..6f8251e0b0 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h
index e5472100a4..241e7c9961 100644
--- a/src/include/tcop/tcopprot.h
+++ b/src/include/tcop/tcopprot.h
@@ -29,6 +29,7 @@ extern CommandDest whereToSendOutput;
extern PGDLLIMPORT const char *debug_query_string;
extern int max_stack_depth;
extern int PostAuthDelay;
+extern int client_connection_check_interval;
/* GUC-configurable parameters */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
--
2.30.2
From: Thomas Munro <thomas.munro@gmail.com>
I changed my mind. Let's commit the pleasingly simple Linux-only feature for
now, and extend it to send some kind of no-op message in a later release.
So this is the version I'd like to go with.
Objections?
+1, as some of our users experienced the problem that the server kept processing (IIRC, a buggy PL/pgSQL procedure that loops indefinitely) after they killed the client.
TBH, Linux and Windows will be sufficient. But I'm for providing a good feature on a specific OS first.
(1)
+ rc = poll(&pollfd, 1, 0);
+ if (rc < 0)
+ {
+ elog(COMMERROR, "could not poll socket: %m");
+ return false;
I think it's customary to use ereport() and errcode_for_socket_access().
(2)
pq_check_connection()
Following PostmasterIsAlive(), maybe it's better to name it pq_connection_is_alive(), pq_client_is_alive(), or pq_frontend_is_alive(), since pqcomm.c's head comment uses the word frontend?
(3)
#include <limits.h>
+#include <poll.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
poll.h should be included between #ifndef WIN32 and #endif?
(4)
I think the new GUC works for walsender as well. If so, how do we explain the relationship between the new GUC and wal_receiver_timeout and recommend settings for them?
Regards
Takayuki Tsunakawa
On Thu, Apr 1, 2021 at 11:29 AM Thomas Munro <thomas.munro@gmail.com> wrote:
On Tue, Mar 30, 2021 at 10:00 AM Thomas Munro <thomas.munro@gmail.com> wrote:
If we want to ship this in v14 we have to make a decision ASAP:
1. Ship the POLLHUP patch (like v9) that only works reliably on
Linux. Maybe disable the feature completely on other OSes?
2. Ship the patch that tries to read (like v7). It should work on
all systems, but it can be fooled by pipelined commands (though it can
detect a pipelined 'X').
Personally, I lean towards #2.
I changed my mind. Let's commit the pleasingly simple Linux-only
feature for now, and extend it to send some kind of no-op message
in a later release. So this is the version I'd like to go with.
Objections?
I moved the GUC into tcop/postgres.c and tcop/tcopprot.h, because it
directly controls postgres.c's behaviour, not pqcomm.c's. The latter
only contains the code to perform the check.
Here's a minor comment: it would be good to have an extra line
after variable assignments, and before and after function calls/if
clauses, something like
+ pollfd.revents = 0;
+
+ rc = poll(&pollfd, 1, 0);
+
+ if (rc < 0)
And also here
}
+
+ if (CheckClientConnectionPending)
+ {
+ CheckClientConnectionPending = false;
And
+ }
+ }
+
if (ClientConnectionLost)
And
+ 0, 0, INT_MAX,
+ check_client_connection_check_interval, NULL, NULL
+ },
+
/* End-of-list marker */
With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com
On Thu, Apr 1, 2021 at 10:16 PM tsunakawa.takay@fujitsu.com
<tsunakawa.takay@fujitsu.com> wrote:
From: Thomas Munro <thomas.munro@gmail.com>
I changed my mind. Let's commit the pleasingly simple Linux-only feature for
now, and extend it to send some kind of no-op message in a later release.
So this is the version I'd like to go with.
Objections?
+1, as some of our users experienced the problem that the server kept processing (IIRC, a buggy PL/pgSQL procedure that loops indefinitely) after they killed the client.
Cool. Yeah, I have seen a few variants of that, and several other
complaints on the lists.
TBH, Linux and Windows will be sufficient. But I'm for providing a good feature on a specific OS first.
I discovered that at least one other OS has adopted POLLRDHUP, so I
changed the language to something slightly more general:
+ <para>
+ This option is currently available only on systems that support the
+ non-standard <symbol>POLLRDHUP</symbol> extension to the
+ <symbol>poll</symbol> system call, including Linux.
+ </para>
It seems like it must be quite easy for an OS to implement, since the
TCP stack surely has the information... it's just an API problem.
Hopefully that means that there aren't OSes that define the macro but
don't work the same way. (I read somewhere that the POSIX compliance
test suite explicitly tests this half-shutdown case and fails any OS
that returns POLLHUP "prematurely". Boo.)
(1)
+ rc = poll(&pollfd, 1, 0);
+ if (rc < 0)
+ {
+ elog(COMMERROR, "could not poll socket: %m");
+ return false;
I think it's customary to use ereport() and errcode_for_socket_access().
Fixed.
(2)
pq_check_connection()
Following PostmasterIsAlive(), maybe it's better to name it pq_connection_is_alive(), pq_client_is_alive(), or pq_frontend_is_alive(), since pqcomm.c's head comment uses the word frontend?
I think it's OK, because it matches the name of the GUC. I'm more
concerned about the name of the GUC. Will we still be happy with this
name if a future release sends a heartbeat message? I think that is
still OK, so I'm happy with these names for now, but if someone has a
better name, please speak up very soon.
(3)
#include <limits.h>
+#include <poll.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
poll.h should be included between #ifndef WIN32 and #endif?
Oops, I forgot to wrap that in #ifdef HAVE_POLL_H while moving stuff
around. Fixed.
(4)
I think the new GUC works for walsender as well. If so, how do we explain the relationship between the new GUC and wal_receiver_timeout and recommend settings for them?
No, it only works while executing a query. (Is there something in
logical decoding, perhaps, that I have failed to consider?)
PS The "from" headers in emails received from Fujitsu seems to have
the names stripped, somewhere in the tubes of the internet. I see the
full version when people from Fujitsu quote other people from Fujitsu.
I copied one of those into the commit message, complete with its
magnificent kanji characters (perhaps these are the cause of the
filtering?), and I hope that's OK with you.
Attachments:
v11-0001-Detect-POLLHUP-POLLRDHUP-while-running-queries.patchtext/x-patch; charset=UTF-8; name=v11-0001-Detect-POLLHUP-POLLRDHUP-while-running-queries.patchDownload
From 7c045e992469c88656655e2b460fb8622d7ac790 Mon Sep 17 00:00:00 2001
From: Maksim Milyutin <milyutinma@gmail.com>
Date: Fri, 26 Mar 2021 10:18:30 +0300
Subject: [PATCH v11] Detect POLLHUP/POLLRDHUP while running queries.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Provide a new GUC check_client_connection_interval that can be used to
check whether the client connection has gone away, while running very
long queries. It is disabled by default.
For now this uses a non-standard Linux extension. POLLRDHUP is not
defined by POSIX, and other OSes don't have a reliable way to know if a
connection was shut down without actually trying to read or write. In
future we might extend this to other OSes by trying to send a
no-op/heartbeat message, but that may require protocol changes.
Author: Sergey Cherkashin <s.cherkashin@postgrespro.ru>
Author: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Tatsuo Ishii <ishii@sraoss.co.jp>
Reviewed-by: Konstantin Knizhnik <k.knizhnik@postgrespro.ru>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Reviewed-by: Tsunakawa, Takayuki/綱川 貴之 <tsunakawa.takay@fujitsu.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (much earlier version)
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 37 +++++++++++++++++
src/backend/libpq/pqcomm.c | 40 +++++++++++++++++++
src/backend/tcop/postgres.c | 32 +++++++++++++++
src/backend/utils/init/globals.c | 1 +
src/backend/utils/init/postinit.c | 10 +++++
src/backend/utils/misc/guc.c | 29 ++++++++++++++
src/backend/utils/misc/postgresql.conf.sample | 3 ++
src/include/libpq/libpq.h | 1 +
src/include/miscadmin.h | 1 +
src/include/tcop/tcopprot.h | 1 +
src/include/utils/timeout.h | 1 +
11 files changed, 156 insertions(+)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 9d87b5097a..0c9128a55d 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -998,6 +998,43 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-client-connection-check-interval" xreflabel="client_connection_check_interval">
+ <term><varname>client_connection_check_interval</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>client_connection_check_interval</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Sets the time interval between optional checks that the client is still
+ connected, while running queries. The check is performed by polling
+ the socket, and allows long running queries to be aborted sooner if
+ the kernel reports that the connection is closed.
+ </para>
+ <para>
+ This option is currently available only on systems that support the
+ non-standard <symbol>POLLRDHUP</symbol> extension to the
+ <symbol>poll</symbol> system call, including Linux.
+ </para>
+ <para>
+ If the value is specified without units, it is taken as milliseconds.
+ The default value is <literal>0</literal>, which disables connection
+ checks. Without connection checks, the server will detect the loss of
+ the connection only at the next interaction with the socket, when it
+ waits for, receives or sends data.
+ </para>
+ <para>
+ For the kernel itself to detect lost TCP connections reliably and within
+ a known timeframe in all scenarios including network failure, it may
+ also be necessary to adjust the TCP keepalive settings of the operating
+ system, or the <xref linkend="guc-tcp-keepalives-idle"/>,
+ <xref linkend="guc-tcp-keepalives-interval"/> and
+ <xref linkend="guc-tcp-keepalives-count"/> settings of
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c7b1e7bfd..697c8c79af 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -54,6 +54,9 @@
*/
#include "postgres.h"
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
#include <signal.h>
#include <fcntl.h>
#include <grp.h>
@@ -1921,3 +1924,40 @@ pq_settcpusertimeout(int timeout, Port *port)
return STATUS_OK;
}
+
+/*
+ * Check if the kernel thinks the client is still connected.
+ */
+bool
+pq_check_connection(void)
+{
+#if defined(POLLRDHUP)
+ /*
+ * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
+ * the other end. We don't have a portable way to do that without
+ * actually trying to read or write data on other systems. We don't want
+ * to read because that would be confused by pipelined queries and COPY
+ * data. Perhaps in future we'll try to write a heartbeat message instead.
+ */
+ struct pollfd pollfd;
+ int rc;
+
+ pollfd.fd = MyProcPort->sock;
+ pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
+ pollfd.revents = 0;
+
+ rc = poll(&pollfd, 1, 0);
+
+ if (rc < 0)
+ {
+ ereport(COMMERROR,
+ (errcode_for_socket_access(),
+ errmsg("could not poll socket: %m")));
+ return false;
+ }
+ else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
+ return false;
+#endif
+
+ return true;
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 2b1b68109f..ad351e2fd1 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -102,6 +102,9 @@ int max_stack_depth = 100;
/* wait N seconds to allow attach from a debugger */
int PostAuthDelay = 0;
+/* Time between checks that the client is still connected. */
+int client_connection_check_interval = 0;
+
/* ----------------
* private typedefs etc
* ----------------
@@ -2671,6 +2674,14 @@ start_xact_command(void)
* not desired, the timeout has to be disabled explicitly.
*/
enable_statement_timeout();
+
+ /* Start timeout for checking if the client has gone away if necessary. */
+ if (client_connection_check_interval > 0 &&
+ IsUnderPostmaster &&
+ MyProcPort &&
+ !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT))
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
}
static void
@@ -3149,6 +3160,27 @@ ProcessInterrupts(void)
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to administrator command")));
}
+
+ if (CheckClientConnectionPending)
+ {
+ CheckClientConnectionPending = false;
+
+ /*
+ * Check for lost connection and re-arm, if still configured, but not
+ * if we've arrived back at DoingCommandRead state. We don't want to
+ * wake up idle sessions, and they already know how to detect lost
+ * connections.
+ */
+ if (!DoingCommandRead && client_connection_check_interval > 0)
+ {
+ if (!pq_check_connection())
+ ClientConnectionLost = true;
+ else
+ enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT,
+ client_connection_check_interval);
+ }
+ }
+
if (ClientConnectionLost)
{
QueryCancelPending = false; /* lost connection trumps QueryCancel */
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 73e0a672ae..a9f0fc3017 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol;
volatile sig_atomic_t InterruptPending = false;
volatile sig_atomic_t QueryCancelPending = false;
volatile sig_atomic_t ProcDiePending = false;
+volatile sig_atomic_t CheckClientConnectionPending = false;
volatile sig_atomic_t ClientConnectionLost = false;
volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false;
volatile sig_atomic_t IdleSessionTimeoutPending = false;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7abeccb536..a3ec358538 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -73,6 +73,7 @@ static void StatementTimeoutHandler(void);
static void LockTimeoutHandler(void);
static void IdleInTransactionSessionTimeoutHandler(void);
static void IdleSessionTimeoutHandler(void);
+static void ClientCheckTimeoutHandler(void);
static bool ThereIsAtLeastOneRole(void);
static void process_startup_options(Port *port, bool am_superuser);
static void process_settings(Oid databaseid, Oid roleid);
@@ -620,6 +621,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IdleInTransactionSessionTimeoutHandler);
RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler);
+ RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler);
}
/*
@@ -1242,6 +1244,14 @@ IdleSessionTimeoutHandler(void)
SetLatch(MyLatch);
}
+static void
+ClientCheckTimeoutHandler(void)
+{
+ CheckClientConnectionPending = true;
+ InterruptPending = true;
+ SetLatch(MyLatch);
+}
+
/*
* Returns true if at least one role is defined in this database cluster.
*/
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 584daffc8a..60a9c7a2a0 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -20,6 +20,9 @@
#include <float.h>
#include <math.h>
#include <limits.h>
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
#ifndef WIN32
#include <sys/mman.h>
#endif
@@ -204,6 +207,7 @@ static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource sourc
static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source);
static bool check_huge_page_size(int *newval, void **extra, GucSource source);
+static bool check_client_connection_check_interval(int *newval, void **extra, GucSource source);
static void assign_pgstat_temp_directory(const char *newval, void *extra);
static bool check_application_name(char **newval, void **extra, GucSource source);
static void assign_application_name(const char *newval, void *extra);
@@ -3501,6 +3505,17 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER,
+ gettext_noop("Sets the time interval between checks for disconnection while running queries."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &client_connection_check_interval,
+ 0, 0, INT_MAX,
+ check_client_connection_check_interval, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
@@ -11980,6 +11995,20 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
return true;
}
+static bool
+check_client_connection_check_interval(int *newval, void **extra, GucSource source)
+{
+#ifndef POLLRDHUP
+ /* Linux only, for now. See pq_check_connection(). */
+ if (*newval != 0)
+ {
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ return false;
+ }
+#endif
+ return true;
+}
+
static void
assign_pgstat_temp_directory(const char *newval, void *extra)
{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 30cfddac1f..39da7cc942 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -719,6 +719,9 @@
#dynamic_library_path = '$libdir'
+#client_connection_check_interval = 0 # time between checks for client
+ # disconnection while running queries;
+ # 0 for never
#------------------------------------------------------------------------------
# LOCK MANAGEMENT
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b20deeb555..3ebbc8d665 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -71,6 +71,7 @@ extern int pq_getbyte(void);
extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putmessage_v2(char msgtype, const char *s, size_t len);
+extern bool pq_check_connection(void);
/*
* prototypes for functions in be-secure.c
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 013850ac28..6f8251e0b0 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending;
extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending;
+extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending;
extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost;
/* these are marked volatile because they are examined by signal handlers: */
diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h
index e5472100a4..241e7c9961 100644
--- a/src/include/tcop/tcopprot.h
+++ b/src/include/tcop/tcopprot.h
@@ -29,6 +29,7 @@ extern CommandDest whereToSendOutput;
extern PGDLLIMPORT const char *debug_query_string;
extern int max_stack_depth;
extern int PostAuthDelay;
+extern int client_connection_check_interval;
/* GUC-configurable parameters */
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ecb2a366a5..93e6a691b3 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
STANDBY_LOCK_TIMEOUT,
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
IDLE_SESSION_TIMEOUT,
+ CLIENT_CONNECTION_CHECK_TIMEOUT,
/* First user-definable timeout reason */
USER_TIMEOUT,
/* Maximum number of timeout reasons */
--
2.30.1
On Fri, Apr 2, 2021 at 1:36 AM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
Here's a minor comment: it would be good to have an extra line
after variable assignments, and before and after function calls/if
clauses, something like
Done in v11. Thanks.
From: Thomas Munro <thomas.munro@gmail.com>
Following PostmasterIsAlive(), maybe it's better to name it
pq_connection_is_alive(), pq_client_is_alive(), or pq_frontend_is_alive(), since
pqcomm.c's head comment uses the word frontend?
I think it's OK, because it matches the name of the GUC. I'm more concerned
about the name of the GUC. Will we still be happy with this name if a future
release sends a heartbeat message? I think that is still OK, so I'm happy with
these names for now, but if someone has a better name, please speak up very
soon.
OK, agreed.
(4)
I think the new GUC works for walsender as well. If so, how do we explain
the relationship between the new GUC and wal_receiver_timeout and
recommend settings for them?
No, it only works while executing a query. (Is there something in logical
decoding, perhaps, that I have failed to consider?)
When I saw the following code, I thought the new GUC took effect at the same time as wal_sender_timeout. But they don't become effective simultaneously: the new GUC is effective before logical replication starts, and wal_sender_timeout takes effect after physical/logical replication starts. So there's no problem.
[PostgresMain]
	if (am_walsender)
	{
		if (!exec_replication_command(query_string))
			exec_simple_query(query_string);
	}
	else
		exec_simple_query(query_string);
The patch looks committable to me.
PS The "from" headers in emails received from Fujitsu seems to have the
names stripped, somewhere in the tubes of the internet. I see the full version
when people from Fujitsu quote other people from Fujitsu.
I copied one of those into the commit message, complete with its magnificent
kanji characters (perhaps these are the cause of the filtering?), and I hope
that's OK with you.
Indeed, it seems that only my email address appears in the sender field on the mailer. I'll check my Outlook settings.
Regards
Takayuki Tsunakawa
On Fri, Apr 2, 2021 at 6:18 PM tsunakawa.takay@fujitsu.com
<tsunakawa.takay@fujitsu.com> wrote:
The patch looks committable to me.
I checked for performance impact compared to master with pgbench -S,
and didn't see any problem. I thought more about how to write a
decent race-free test but struggled with the lack of a good way to
control multiple connections from TAP tests and gave up for now. I
previously tried to write something to help with that, but abandoned
it because it was terrible[1]. It seems a bit strange to me that psql
has to be involved at all, and we don't have a simple way to connect
one or more sockets and speak the protocol from perl.
Pushed! Thanks to all who contributed.
[1]: /messages/by-id/CA+hUKG+FkUuDv-bcBns=Z_O-V9QGW0nWZNHOkEPxHZWjegRXvw@mail.gmail.com, see v2-0006-Add-TAP-test-for-snapshot-too-old.patch
On Sat, Apr 3, 2021 at 9:27 AM Thomas Munro <thomas.munro@gmail.com> wrote:
Pushed! Thanks to all who contributed.
Here's something I wanted to park here to look into for the next
cycle: it turns out that kqueue's EV_EOF flag also has the right
semantics for this. That leads to the idea of exposing the event via
the WaitEventSet API, and would bring the
client_connection_check_interval feature to 6/10 of our OSes, up from
2/10. Maybe Windows' FD_CLOSE event could get us up to 7/10, not
sure.
Attachments:
0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patchtext/x-patch; charset=US-ASCII; name=0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patchDownload
From 957826a4c6bfec2d88c2ea2f004e55ebf12ea473 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:38:40 +1200
Subject: [PATCH 1/2] Add WL_SOCKET_CLOSED for socket shutdown events.
Provide a way for WaitEventSet to report that the remote peer has shut
down its socket, independently of whether there is any buffered data
remaining to be read. This works only on systems where the kernel
exposes that information, namely:
* WAIT_USE_POLL builds, on systems that have the POLLRDHUP extension
* WAIT_USE_EPOLL builds, using EPOLLRDHUP
* WAIT_USE_KQUEUE builds, using EV_EOF
---
src/backend/storage/ipc/latch.c | 64 ++++++++++++++++++++++++++++-----
src/include/storage/latch.h | 5 +--
2 files changed, 58 insertions(+), 11 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index ad781131e2..6c77356019 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -841,6 +841,7 @@ FreeWaitEventSet(WaitEventSet *set)
* - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
* can be combined with other WL_SOCKET_* events (on non-Windows
* platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
* - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
*
* Returns the offset in WaitEventSet->events (starting from 0), which can be
@@ -1043,12 +1044,16 @@ WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
else
{
Assert(event->fd != PGINVALID_SOCKET);
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
if (event->events & WL_SOCKET_READABLE)
epoll_ev.events |= EPOLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
}
/*
@@ -1087,12 +1092,18 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
else
{
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
pollfd->events = 0;
if (event->events & WL_SOCKET_READABLE)
pollfd->events |= POLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
}
Assert(event->fd != PGINVALID_SOCKET);
@@ -1165,7 +1176,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
Assert(event->events != WL_LATCH_SET || set->latch != NULL);
Assert(event->events == WL_LATCH_SET ||
event->events == WL_POSTMASTER_DEATH ||
- (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
if (event->events == WL_POSTMASTER_DEATH)
{
@@ -1188,9 +1201,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
* old event mask to the new event mask, since kevent treats readable
* and writable as separate events.
*/
- if (old_events & WL_SOCKET_READABLE)
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
old_filt_read = true;
- if (event->events & WL_SOCKET_READABLE)
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
new_filt_read = true;
if (old_events & WL_SOCKET_WRITEABLE)
old_filt_write = true;
@@ -1210,7 +1223,10 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
event);
}
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
@@ -1525,7 +1541,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd != PGINVALID_SOCKET);
@@ -1543,6 +1561,13 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -1668,7 +1693,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events++;
returned_events++;
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd >= 0);
@@ -1679,6 +1706,14 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_READABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
(cur_kqueue_event->filter == EVFILT_WRITE))
{
@@ -1789,7 +1824,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
int errflags = POLLHUP | POLLERR | POLLNVAL;
@@ -1809,6 +1846,15 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 44f9368c64..fd3581a99c 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -134,10 +134,11 @@ typedef struct Latch
/* avoid having to deal with case on platforms not requiring it */
#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
#endif
-
+#define WL_SOCKET_CLOSED (1 << 7)
#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
WL_SOCKET_WRITEABLE | \
- WL_SOCKET_CONNECTED)
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
typedef struct WaitEvent
{
--
2.30.2
0002-Use-WL_SOCKET_CLOSED-for-client_connection_check_int.patchtext/x-patch; charset=US-ASCII; name=0002-Use-WL_SOCKET_CLOSED-for-client_connection_check_int.patchDownload
From c29ca0d9262e7eacf900cb83f9fd0e655f4cc261 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:48:32 +1200
Subject: [PATCH 2/2] Use WL_SOCKET_CLOSED for
client_connection_check_interval.
Previously we used poll() directly to check for a POLLRDHUP event.
Instead, use WaitEventSet to poll the socket for WL_SOCKET_CLOSED, which
knows how to detect that condition on more systems.
XXX Need to figure out a way to allow this feature only some builds,
but that information is currently private to latch.c
XXX May need to review the way eg POLLERR/POLLHUP and equivalents are
treated when you're waiting/polling for WL_SOCKET_CLOSED.
XXX Manually tested by killing psql, sending an async query with libpq
and then sending 'X' and hanging up, and by yanking an ethernet cable
(but the last requires TCP keepalives to be enabled). Need to find a
clever way to do at least the first two in a TAP (?) test without races.
---
src/backend/libpq/pqcomm.c | 35 +++++++++++++----------------------
src/backend/utils/misc/guc.c | 17 +----------------
2 files changed, 14 insertions(+), 38 deletions(-)
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index b9ccd4473f..1334dd4bb3 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -1932,33 +1932,24 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
+ WaitEvent events[3];
+ int rc;
+
/*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
+ * Temporarily ignore the latch, while we check if the socket has been
+ * closed by the other end (if that is possible on this OS).
*/
- struct pollfd pollfd;
- int rc;
-
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
-
- rc = poll(&pollfd, 1, 0);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, NULL);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
- if (rc < 0)
+ for (int i = 0; i < rc; ++i)
{
- ereport(COMMERROR,
- (errcode_for_socket_access(),
- errmsg("could not poll socket: %m")));
- return false;
+ if (events[i].pos == FeBeWaitSetSocketPos &&
+ events[i].events & WL_SOCKET_CLOSED)
+ return false;
}
- else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
- return false;
-#endif
return true;
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b130874bdc..60512ec82f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -209,7 +209,6 @@ static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource sourc
static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source);
static bool check_huge_page_size(int *newval, void **extra, GucSource source);
-static bool check_client_connection_check_interval(int *newval, void **extra, GucSource source);
static void assign_maintenance_io_concurrency(int newval, void *extra);
static void assign_pgstat_temp_directory(const char *newval, void *extra);
static bool check_application_name(char **newval, void **extra, GucSource source);
@@ -3580,7 +3579,7 @@ static struct config_int ConfigureNamesInt[] =
},
&client_connection_check_interval,
0, 0, INT_MAX,
- check_client_connection_check_interval, NULL, NULL
+ NULL, NULL, NULL
},
/* End-of-list marker */
@@ -12094,20 +12093,6 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
return true;
}
-static bool
-check_client_connection_check_interval(int *newval, void **extra, GucSource source)
-{
-#ifndef POLLRDHUP
- /* Linux only, for now. See pq_check_connection(). */
- if (*newval != 0)
- {
- GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
- return false;
- }
-#endif
- return true;
-}
-
static void
assign_maintenance_io_concurrency(int newval, void *extra)
{
--
2.30.2
On Fri, Apr 30, 2021 at 2:23 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Here's something I wanted to park here to look into for the next
cycle: it turns out that kqueue's EV_EOF flag also has the right
semantics for this. That leads to the idea of exposing the event via
the WaitEventSet API, and would bring the
client_connection_check_interval feature to 6/10 of our OSes, up from
2/10. Maybe Windows' FD_CLOSE event could get us up to 7/10, not
sure.
Rebased. Added documentation tweak and a check to reject the GUC on
unsupported OSes.
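(Aside for readers who haven't used kqueue: the EV_EOF behaviour being relied
on can be demonstrated with a small standalone program along the following
lines. This is only an illustrative sketch for BSD/macOS machines, not code
from the patches, and error handling is omitted.)
/*
 * Sketch only (BSD/macOS): return true if the peer of a connected socket
 * has shut down its end, detected via EV_EOF on the kqueue read filter,
 * regardless of whether unread data is still buffered.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdbool.h>
#include <unistd.h>

bool
peer_has_shut_down(int sock)
{
    int             kq = kqueue();
    struct kevent   change;
    struct kevent   result;
    struct timespec zero = {0, 0};
    bool            closed = false;

    /* Watch the socket's read filter. */
    EV_SET(&change, sock, EVFILT_READ, EV_ADD, 0, 0, NULL);
    (void) kevent(kq, &change, 1, NULL, 0, NULL);

    /* Poll without blocking; EV_EOF means the peer hung up. */
    if (kevent(kq, NULL, 0, &result, 1, &zero) == 1 &&
        result.filter == EVFILT_READ &&
        (result.flags & EV_EOF) != 0)
        closed = true;

    close(kq);
    return closed;
}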
Attachments:
v2-0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patch (text/x-patch)
From 356b0dcbc15353b9bd349972c80a7f2e5c516a0e Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:38:40 +1200
Subject: [PATCH v2 1/2] Add WL_SOCKET_CLOSED for socket shutdown events.
Provide a way for WaitEventSet to report that the remote peer has shut
down its socket, independently of whether there is any buffered data
remaining to be read. This works only on systems where the kernel
exposes that information, namely:
* WAIT_USE_POLL builds, using (non-standard) POLLRDHUP
* WAIT_USE_EPOLL builds, using EPOLLRDHUP
* WAIT_USE_KQUEUE builds, using EV_EOF
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 79 +++++++++++++++++++++++++++++----
src/include/storage/latch.h | 6 ++-
2 files changed, 74 insertions(+), 11 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 1d893cf863..54e928c564 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -841,6 +841,7 @@ FreeWaitEventSet(WaitEventSet *set)
* - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
* can be combined with other WL_SOCKET_* events (on non-Windows
* platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
* - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
*
* Returns the offset in WaitEventSet->events (starting from 0), which can be
@@ -1043,12 +1044,16 @@ WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
else
{
Assert(event->fd != PGINVALID_SOCKET);
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
if (event->events & WL_SOCKET_READABLE)
epoll_ev.events |= EPOLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
}
/*
@@ -1087,12 +1092,18 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
else
{
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
pollfd->events = 0;
if (event->events & WL_SOCKET_READABLE)
pollfd->events |= POLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
}
Assert(event->fd != PGINVALID_SOCKET);
@@ -1165,7 +1176,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
Assert(event->events != WL_LATCH_SET || set->latch != NULL);
Assert(event->events == WL_LATCH_SET ||
event->events == WL_POSTMASTER_DEATH ||
- (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
if (event->events == WL_POSTMASTER_DEATH)
{
@@ -1188,9 +1201,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
* old event mask to the new event mask, since kevent treats readable
* and writable as separate events.
*/
- if (old_events & WL_SOCKET_READABLE)
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
old_filt_read = true;
- if (event->events & WL_SOCKET_READABLE)
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
new_filt_read = true;
if (old_events & WL_SOCKET_WRITEABLE)
old_filt_write = true;
@@ -1210,7 +1223,10 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
event);
}
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
@@ -1525,7 +1541,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd != PGINVALID_SOCKET);
@@ -1543,6 +1561,13 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -1668,7 +1693,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events++;
returned_events++;
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd >= 0);
@@ -1679,6 +1706,14 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_READABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
(cur_kqueue_event->filter == EVFILT_WRITE))
{
@@ -1789,7 +1824,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
int errflags = POLLHUP | POLLERR | POLLNVAL;
@@ -1809,6 +1846,15 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -2015,6 +2061,21 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
}
#endif
+/*
+ * Return whether the current build options can report WL_SOCKET_CLOSED.
+ */
+bool
+WaitEventSetCanReportClosed(void)
+{
+#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
+ defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_KQUEUE)
+ return true;
+#else
+ return false;
+#endif
+}
+
/*
* Get the number of wait events registered in a given WaitEventSet.
*/
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 44f9368c64..d78ff0bede 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -134,10 +134,11 @@ typedef struct Latch
/* avoid having to deal with case on platforms not requiring it */
#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
#endif
-
+#define WL_SOCKET_CLOSED (1 << 7)
#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
WL_SOCKET_WRITEABLE | \
- WL_SOCKET_CONNECTED)
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
typedef struct WaitEvent
{
@@ -180,5 +181,6 @@ extern int WaitLatchOrSocket(Latch *latch, int wakeEvents,
pgsocket sock, long timeout, uint32 wait_event_info);
extern void InitializeLatchWaitSet(void);
extern int GetNumRegisteredWaitEvents(WaitEventSet *set);
+extern bool WaitEventSetCanReportClosed(void);
#endif /* LATCH_H */
--
2.30.2
v2-0002-Use-WL_SOCKET_CLOSED-for-client_connection_check_.patch (text/x-patch)
From c37dc98cea2983734decc37cadc2f3a19edd1e85 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:48:32 +1200
Subject: [PATCH v2 2/2] Use WL_SOCKET_CLOSED for
client_connection_check_interval.
Previously we used poll() directly to check for a POLLRDHUP event.
Instead, use WaitEventSet to poll the socket for WL_SOCKET_CLOSED, which
knows how to detect equivalent events on many more operating systems.
XXX Manually tested by killing psql, sending an async query with libpq
and then sending 'X' and hanging up, and by yanking an ethernet cable
(but the last requires TCP keepalives to be enabled). Need to find a
clever way to do at least the first two in a TAP (?) test without races.
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 5 ++---
src/backend/libpq/pqcomm.c | 35 +++++++++++++----------------------
src/backend/utils/misc/guc.c | 7 ++-----
3 files changed, 17 insertions(+), 30 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index aa3e178240..c94e0ed9ef 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1012,9 +1012,8 @@ include_dir 'conf.d'
the kernel reports that the connection is closed.
</para>
<para>
- This option is currently available only on systems that support the
- non-standard <symbol>POLLRDHUP</symbol> extension to the
- <symbol>poll</symbol> system call, including Linux.
+ This option relies on kernel events exposed by Linux, BSD-family,
+ macOS and illumos, and is not available on other operating systems.
</para>
<para>
If the value is specified without units, it is taken as milliseconds.
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 89a5f901aa..5660ece4e3 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -1932,33 +1932,24 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
+ WaitEvent events[3];
+ int rc;
+
/*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
+ * Temporarily ignore the latch, while we check if the socket has been
+ * closed by the other end (if that is possible on this OS).
*/
- struct pollfd pollfd;
- int rc;
-
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
-
- rc = poll(&pollfd, 1, 0);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, NULL);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
- if (rc < 0)
+ for (int i = 0; i < rc; ++i)
{
- ereport(COMMERROR,
- (errcode_for_socket_access(),
- errmsg("could not poll socket: %m")));
- return false;
+ if (events[i].pos == FeBeWaitSetSocketPos &&
+ events[i].events & WL_SOCKET_CLOSED)
+ return false;
}
- else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
- return false;
-#endif
return true;
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 68b62d523d..465b5ec1b6 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -12063,14 +12063,11 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
static bool
check_client_connection_check_interval(int *newval, void **extra, GucSource source)
{
-#ifndef POLLRDHUP
- /* Linux only, for now. See pq_check_connection(). */
- if (*newval != 0)
+ if (!WaitEventSetCanReportClosed() && *newval != 0)
{
- GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on this platform");
return false;
}
-#endif
return true;
}
--
2.30.2
On Fri, Jun 11, 2021 at 9:24 PM Thomas Munro <thomas.munro@gmail.com> wrote:
On Fri, Apr 30, 2021 at 2:23 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Here's something I wanted to park here to look into for the next
cycle: it turns out that kqueue's EV_EOF flag also has the right
semantics for this. That leads to the idea of exposing the event via
the WaitEventSet API, and would bring the
client_connection_check_interval feature to 6/10 of our OSes, up from
2/10. Maybe Windows' FD_CLOSE event could get us up to 7/10, not
sure.
Rebased. Added documentation tweak and a check to reject the GUC on
unsupported OSes.
Hi,
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
It seems that the remaining Assert() should say 1 <= count && count <= 2
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
It seems the last condition above should be written as:
((cur_pollfd->revents & POLLRDHUP) | (cur_pollfd->revents & errflags))
Cheers
On Sat, Jun 12, 2021 at 8:31 PM Zhihong Yu <zyu@yugabyte.com> wrote:
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
It seems the last condition above should be written as:
((cur_pollfd->revents & POLLRDHUP) | (cur_pollfd->revents & errflags))
Hi Zhihong,
Why? Isn't (A & B) | (A & C) the same as A & (B | C)?
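(If in doubt, the identity is easy to brute-force for these particular flag
bits with a throwaway program like the sketch below; this is not from the
thread, just an illustration.)
/* Throwaway sketch: verify (A & B) | (A & C) == A & (B | C) for the
 * poll(2) flag bits discussed above, over all 16-bit revents values. */
#define _GNU_SOURCE             /* for POLLRDHUP on Linux */
#include <assert.h>
#include <poll.h>

int
main(void)
{
    int errflags = POLLHUP | POLLERR | POLLNVAL;

    for (int revents = 0; revents <= 0xFFFF; revents++)
        assert(((revents & POLLRDHUP) | (revents & errflags)) ==
               (revents & (POLLRDHUP | errflags)));
    return 0;
}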
On Thu, Oct 7, 2021 at 8:43 PM Thomas Munro <thomas.munro@gmail.com> wrote:
On Sat, Jun 12, 2021 at 8:31 PM Zhihong Yu <zyu@yugabyte.com> wrote:
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
It seems the last condition above should be written as:
((cur_pollfd->revents & POLLRDHUP) | (cur_pollfd->revents & errflags))
Hi Zhihong,
Why? Isn't (A & B) | (A & C) the same as A & (B | C)?
Hi,
My former comment was about 4 months old.
The current way as expressed in the patch should be fine.
Cheers
On 12.06.2021 07:24, Thomas Munro wrote:
On Fri, Apr 30, 2021 at 2:23 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Here's something I wanted to park here to look into for the next
cycle: it turns out that kqueue's EV_EOF flag also has the right
semantics for this. That leads to the idea of exposing the event via
the WaitEventSet API, and would bring the
client_connection_check_interval feature to 6/10 of our OSes, up from
2/10. Maybe Windows' FD_CLOSE event could get us up to 7/10, not
sure.
Rebased. Added documentation tweak and a check to reject the GUC on
unsupported OSes.
Good work. I have tested your patch on Linux and FreeBSD in three basic
cases: killing the client, breaking the connection (via firewall
manipulation), and silently closing the client connection before a
previously started asynchronous query completes. All of them work fine.
Some comments from me:
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
+ WaitEvent events[3];
The 3 looks like a magic constant. We might want to define a constant for
all the event groups in FeBeWaitSet.
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, NULL);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
AFAICS, the side effect on (FeBeWaitSet->events[FeBeWaitSetSocketPos]).events
of setting WL_SOCKET_CLOSED inside pq_check_connection() has no negative
impact later, does it? That is, every WaitEventSetWait() call site has to set
up the socket events it wants from scratch anyway.
--
Regards,
Maksim Milyutin
On Tue, Oct 12, 2021 at 3:10 AM Maksim Milyutin <milyutinma@gmail.com> wrote:
Good work. I have tested your patch on Linux and FreeBSD in three basic
cases: killing the client, breaking the connection (via firewall
manipulation), and silently closing the client connection before a
previously started asynchronous query completes. All of them work fine.
Thanks for the testing and review!
+ WaitEvent events[3];
The 3 looks like a magic constant. We might want to define a constant for
all the event groups in FeBeWaitSet.
Yeah. In fact, we really just need one event. Fixed.
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, NULL);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
AFAICS, the side effect on (FeBeWaitSet->events[FeBeWaitSetSocketPos]).events
of setting WL_SOCKET_CLOSED inside pq_check_connection() has no negative
impact later, does it? That is, every WaitEventSetWait() call site has to set
up the socket events it wants from scratch anyway.
Correct: every site that waits for FeBeWaitSet explicitly modifies the
socket event to say what it's waiting for (and that is often a no-op
internally), so we don't have to worry about restoring the previous
state. I've added a comment about that. We should work harder to
restore the latch than my previous patch did, though. Now I'm using a
PG_FINALLY() block.
I'm hoping to push this soon, after another round of testing, if
there's no more feedback.
There is one more OS that could be added, but I'll leave it out of the
initial commit, pending further investigation. Since I recently had
to set up a Windows dev VM to test some other portability stuff, I
had a chance to test the FD_CLOSE hypothesis. You just have to do
this to enable it:
@@ -2069,6 +2069,7 @@ WaitEventSetCanReportClosed(void)
{
#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_WIN32) || \
defined(WAIT_USE_KQUEUE)
return true;
#else
It seems to work! I'm not sure why it works, or whether we can count
on it, though. These sentences from the documentation[1] seem to
contradict each other:
"FD_CLOSE being posted after all data is read from a socket. An
application should check for remaining data upon receipt of FD_CLOSE
to avoid any possibility of losing data."
My test says that the first sentence is wrong, but the second doesn't
exactly say that it has reliable POLLRDHUP nature, and I haven't found
one that does, yet. Perhaps we can convince ourselves of that in
follow-up work.
For the record, I tested two scenarios. The client was a Unix system,
the server a Windows 10 VM.
1. Connecting with psql and running "SELECT pg_sleep(60)" and then
killing the psql process. I'm not surprised that this one worked; it
would work if we tested for WL_SOCKET_READABLE too, but we already
decided that's not good enough.
2. Connecting from a C program that does PQsendQuery(conn, "SELECT
pg_sleep(60)") and then immediately PQfinish(conn), to test whether
the FD_CLOSE event is reported even though there is an unconsumed 'X'
in the socket. I wouldn't want to ship the feature on an OS where
this case doesn't get reported, or doesn't get reported sometimes,
because it'd be unreliable and unlike the behaviour on other OSes.
But it worked for me.
[1]: https://docs.microsoft.com/en-us/windows/win32/api/winsock2/nf-winsock2-wsaeventselect
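(For what it's worth, scenario 2 can be reproduced with a tiny libpq client
along these lines. This is only a sketch, not the exact program used, and the
connection string is a placeholder.)
/*
 * Sketch of the scenario-2 client: queue a long query asynchronously and
 * hang up immediately, so the server is left with an unconsumed 'X'
 * (Terminate) message and a closed peer while it executes pg_sleep().
 */
#include <stdio.h>
#include <stdlib.h>
#include <libpq-fe.h>

int
main(void)
{
    PGconn *conn = PQconnectdb("host=server-vm dbname=postgres"); /* placeholder */

    if (PQstatus(conn) != CONNECTION_OK)
    {
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
        PQfinish(conn);
        return EXIT_FAILURE;
    }

    /* Send the query without waiting for a result. */
    if (!PQsendQuery(conn, "SELECT pg_sleep(60)"))
        fprintf(stderr, "PQsendQuery failed: %s", PQerrorMessage(conn));

    /* Close immediately: libpq sends 'X' and shuts the socket down. */
    PQfinish(conn);
    return EXIT_SUCCESS;
}
Build it with something like cc -o hangup hangup.c -I$(pg_config --includedir)
-L$(pg_config --libdir) -lpq, and watch the server side with
client_connection_check_interval enabled.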
Attachments:
v3-0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patch (text/x-patch)
From 842d96842487881b05ed3d806caa14e84c8be4f9 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:38:40 +1200
Subject: [PATCH v3 1/2] Add WL_SOCKET_CLOSED for socket shutdown events.
Provide a way for WaitEventSet to report that the remote peer has shut
down its socket, independently of whether there is any buffered data
remaining to be read. This works only on systems where the kernel
exposes that information, namely:
* WAIT_USE_POLL builds, if the POLLRDHUP extension is available
* WAIT_USE_EPOLL builds, using EPOLLRDHUP
* WAIT_USE_KQUEUE builds, using EV_EOF
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 79 +++++++++++++++++++++++++++++----
src/include/storage/latch.h | 6 ++-
2 files changed, 74 insertions(+), 11 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 1d893cf863..54e928c564 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -841,6 +841,7 @@ FreeWaitEventSet(WaitEventSet *set)
* - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
* can be combined with other WL_SOCKET_* events (on non-Windows
* platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
* - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
*
* Returns the offset in WaitEventSet->events (starting from 0), which can be
@@ -1043,12 +1044,16 @@ WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
else
{
Assert(event->fd != PGINVALID_SOCKET);
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
if (event->events & WL_SOCKET_READABLE)
epoll_ev.events |= EPOLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
}
/*
@@ -1087,12 +1092,18 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
else
{
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
pollfd->events = 0;
if (event->events & WL_SOCKET_READABLE)
pollfd->events |= POLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
}
Assert(event->fd != PGINVALID_SOCKET);
@@ -1165,7 +1176,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
Assert(event->events != WL_LATCH_SET || set->latch != NULL);
Assert(event->events == WL_LATCH_SET ||
event->events == WL_POSTMASTER_DEATH ||
- (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
if (event->events == WL_POSTMASTER_DEATH)
{
@@ -1188,9 +1201,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
* old event mask to the new event mask, since kevent treats readable
* and writable as separate events.
*/
- if (old_events & WL_SOCKET_READABLE)
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
old_filt_read = true;
- if (event->events & WL_SOCKET_READABLE)
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
new_filt_read = true;
if (old_events & WL_SOCKET_WRITEABLE)
old_filt_write = true;
@@ -1210,7 +1223,10 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
event);
}
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
@@ -1525,7 +1541,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd != PGINVALID_SOCKET);
@@ -1543,6 +1561,13 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -1668,7 +1693,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events++;
returned_events++;
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd >= 0);
@@ -1679,6 +1706,14 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_READABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
(cur_kqueue_event->filter == EVFILT_WRITE))
{
@@ -1789,7 +1824,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
int errflags = POLLHUP | POLLERR | POLLNVAL;
@@ -1809,6 +1846,15 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -2015,6 +2061,21 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
}
#endif
+/*
+ * Return whether the current build options can report WL_SOCKET_CLOSED.
+ */
+bool
+WaitEventSetCanReportClosed(void)
+{
+#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
+ defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_KQUEUE)
+ return true;
+#else
+ return false;
+#endif
+}
+
/*
* Get the number of wait events registered in a given WaitEventSet.
*/
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 44f9368c64..d78ff0bede 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -134,10 +134,11 @@ typedef struct Latch
/* avoid having to deal with case on platforms not requiring it */
#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
#endif
-
+#define WL_SOCKET_CLOSED (1 << 7)
#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
WL_SOCKET_WRITEABLE | \
- WL_SOCKET_CONNECTED)
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
typedef struct WaitEvent
{
@@ -180,5 +181,6 @@ extern int WaitLatchOrSocket(Latch *latch, int wakeEvents,
pgsocket sock, long timeout, uint32 wait_event_info);
extern void InitializeLatchWaitSet(void);
extern int GetNumRegisteredWaitEvents(WaitEventSet *set);
+extern bool WaitEventSetCanReportClosed(void);
#endif /* LATCH_H */
--
2.33.1
v3-0002-Use-WL_SOCKET_CLOSED-for-client_connection_check_.patch (text/x-patch)
From 2d191c0d3bbc240e125633d51f9f1e003630b866 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:48:32 +1200
Subject: [PATCH v3 2/2] Use WL_SOCKET_CLOSED for
client_connection_check_interval.
Previously we used poll() directly to check for a POLLRDHUP event.
Instead, use WaitEventSet to poll the socket for WL_SOCKET_CLOSED, which
knows how to detect equivalent events on many more operating systems.
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 6 ++---
src/backend/libpq/pqcomm.c | 44 ++++++++++++++++++++++++------------
src/backend/utils/misc/guc.c | 7 ++----
3 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4ac617615c..ab52d4bd2c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1012,9 +1012,9 @@ include_dir 'conf.d'
the kernel reports that the connection is closed.
</para>
<para>
- This option is currently available only on systems that support the
- non-standard <symbol>POLLRDHUP</symbol> extension to the
- <symbol>poll</symbol> system call, including Linux.
+ This option relies on kernel events exposed by Linux, macOS, illumos
+ and the BSD family of operating systems, and is not currently available
+ on other systems.
</para>
<para>
If the value is specified without units, it is taken as milliseconds.
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c37df09cf..c235a682c2 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -1959,22 +1959,36 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
- /*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
- */
- struct pollfd pollfd;
+ WaitEvent event;
int rc;
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
+ /*
+ * Temporarily ignore the latch, so that we can poll for just the one
+ * event we care about.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, NULL);
- rc = poll(&pollfd, 1, 0);
+ PG_TRY();
+ {
+ /*
+ * It's OK to clobber the socket event to report only the event we're
+ * interested in without restoring the previous state afterwards,
+ * because every FeBeWaitSet wait site does the same.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED,
+ NULL);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, &event, 1, 0);
+ }
+ PG_FINALLY();
+ {
+ /*
+ * Restore the latch, so we can't leave FeBeWaitSet in a broken state
+ * that ignores latches.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET,
+ MyLatch);
+ }
+ PG_END_TRY();
if (rc < 0)
{
@@ -1983,9 +1997,9 @@ pq_check_connection(void)
errmsg("could not poll socket: %m")));
return false;
}
- else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
+
+ if (rc == 1 && (event.events & WL_SOCKET_CLOSED))
return false;
-#endif
return true;
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ee6a838b3a..ca45b36754 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -12169,14 +12169,11 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
static bool
check_client_connection_check_interval(int *newval, void **extra, GucSource source)
{
-#ifndef POLLRDHUP
- /* Linux only, for now. See pq_check_connection(). */
- if (*newval != 0)
+ if (!WaitEventSetCanReportClosed() && *newval != 0)
{
- GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on this platform");
return false;
}
-#endif
return true;
}
--
2.33.1
Hi,
On 2021-12-11 17:41:34 +1300, Thomas Munro wrote:
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -1959,22 +1959,36 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
- /*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
- */
- struct pollfd pollfd;
+ WaitEvent event;
 int rc;
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
+ /*
+ * Temporarily ignore the latch, so that we can poll for just the one
+ * event we care about.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, NULL);
- rc = poll(&pollfd, 1, 0);
+ PG_TRY();
+ {
+ /*
+ * It's OK to clobber the socket event to report only the event we're
+ * interested in without restoring the previous state afterwards,
+ * because every FeBeWaitSet wait site does the same.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED,
+ NULL);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, &event, 1, 0);
+ }
+ PG_FINALLY();
+ {
+ /*
+ * Restore the latch, so we can't leave FeBeWaitSet in a broken state
+ * that ignores latches.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET,
+ MyLatch);
+ }
+ PG_END_TRY();
Yuck. Is there really no better way to deal with this? What kind of errors is
this trying to handle transparently? Afaics this still changes when we'd
e.g. detect postmaster death.
Am I misunderstanding code or comment, or is the comment saying that it's ok
to clobber the wes, but then we actually unclobber it?
Greetings,
Andres Freund
On Sat, Dec 11, 2021 at 6:11 PM Andres Freund <andres@anarazel.de> wrote:
Yuck. Is there really no better way to deal with this? What kind of errors is
this trying to handle transparently? Afaics this still changes when we'd
e.g. detect postmaster death.
The problem is that WaitEventSetWait() only reports the latch, if it's
set, so I removed it from the set (setting it to NULL), and then undo
that afterwards. Perhaps we could fix that root problem instead.
That is, we could make it so that latches aren't higher priority in
that way, ie don't hide other events[1]. Then I wouldn't have to
modify the WES here, I could just ignore it in the output event list
(and make sure that's big enough for all possible events, as I had it
in the last version). I'll think about that.
Am I misunderstanding code or comment, or is the comment saying that it's ok
to clobber the wes, but then we actually unclobber it?
It's explaining that it's OK to clobber the *socket*, but that the
*latch* needs to be unclobbered.
[1]: As described in WaitEventSetWait():
* Check if the latch is set already. If so, leave the loop
* immediately, avoid blocking again. We don't attempt to report any
* other events that might also be satisfied.
On Sat, Dec 11, 2021 at 7:09 PM Thomas Munro <thomas.munro@gmail.com> wrote:
On Sat, Dec 11, 2021 at 6:11 PM Andres Freund <andres@anarazel.de> wrote:
Yuck. Is there really no better way to deal with this? What kind of errors is
this trying to handle transparently? Afaics this still changes when we'd
e.g. detect postmaster death.
The problem is that WaitEventSetWait() only reports the latch, if it's
set, so I removed it from the set (setting it to NULL), and then undo
that afterwards. Perhaps we could fix that root problem instead.
That is, we could make it so that latches aren't higher priority in
that way, ie don't hide other events[1]. Then I wouldn't have to
modify the WES here, I could just ignore it in the output event list
(and make sure that's big enough for all possible events, as I had it
in the last version). I'll think about that.
I tried that. It seems OK, and gets rid of the PG_FINALLY(), which is
nice. Latches still have higher priority, and still have the fast
return if already set and you asked for only one event, but now if you
ask for nevents > 1 we'll make the syscall too so we'll see the
WL_SOCKET_CLOSED.
It's possible that someone might want the old behaviour one day (fast
return even for nevents > 1, hiding other events), and then we'd have
to come up with some way to request that, which is the type of tricky
policy question that had put me off "fixing" this before. But I guess
we could cross that bridge if we come to it. Everywhere else calls
with nevents == 1, so that's hypothetical.
Better?
Attachments:
v4-0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patch (text/x-patch)
From 6b994807c29157ac7053ec347a51268d60f2b3b4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:38:40 +1200
Subject: [PATCH v4 1/3] Add WL_SOCKET_CLOSED for socket shutdown events.
Provide a way for WaitEventSet to report that the remote peer has shut
down its socket, independently of whether there is any buffered data
remaining to be read. This works only on systems where the kernel
exposes that information, namely:
* WAIT_USE_POLL builds using POLLRDHUP, if available
* WAIT_USE_EPOLL builds using EPOLLRDHUP
* WAIT_USE_KQUEUE builds using EV_EOF
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 79 +++++++++++++++++++++++++++++----
src/include/storage/latch.h | 6 ++-
2 files changed, 74 insertions(+), 11 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 1d893cf863..54e928c564 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -841,6 +841,7 @@ FreeWaitEventSet(WaitEventSet *set)
* - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
* can be combined with other WL_SOCKET_* events (on non-Windows
* platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
* - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
*
* Returns the offset in WaitEventSet->events (starting from 0), which can be
@@ -1043,12 +1044,16 @@ WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
else
{
Assert(event->fd != PGINVALID_SOCKET);
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
if (event->events & WL_SOCKET_READABLE)
epoll_ev.events |= EPOLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
}
/*
@@ -1087,12 +1092,18 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
else
{
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
pollfd->events = 0;
if (event->events & WL_SOCKET_READABLE)
pollfd->events |= POLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
}
Assert(event->fd != PGINVALID_SOCKET);
@@ -1165,7 +1176,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
Assert(event->events != WL_LATCH_SET || set->latch != NULL);
Assert(event->events == WL_LATCH_SET ||
event->events == WL_POSTMASTER_DEATH ||
- (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
if (event->events == WL_POSTMASTER_DEATH)
{
@@ -1188,9 +1201,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
* old event mask to the new event mask, since kevent treats readable
* and writable as separate events.
*/
- if (old_events & WL_SOCKET_READABLE)
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
old_filt_read = true;
- if (event->events & WL_SOCKET_READABLE)
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
new_filt_read = true;
if (old_events & WL_SOCKET_WRITEABLE)
old_filt_write = true;
@@ -1210,7 +1223,10 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
event);
}
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
@@ -1525,7 +1541,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd != PGINVALID_SOCKET);
@@ -1543,6 +1561,13 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -1668,7 +1693,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events++;
returned_events++;
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd >= 0);
@@ -1679,6 +1706,14 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_READABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
(cur_kqueue_event->filter == EVFILT_WRITE))
{
@@ -1789,7 +1824,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
int errflags = POLLHUP | POLLERR | POLLNVAL;
@@ -1809,6 +1846,15 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -2015,6 +2061,21 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
}
#endif
+/*
+ * Return whether the current build options can report WL_SOCKET_CLOSED.
+ */
+bool
+WaitEventSetCanReportClosed(void)
+{
+#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
+ defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_KQUEUE)
+ return true;
+#else
+ return false;
+#endif
+}
+
/*
* Get the number of wait events registered in a given WaitEventSet.
*/
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 44f9368c64..d78ff0bede 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -134,10 +134,11 @@ typedef struct Latch
/* avoid having to deal with case on platforms not requiring it */
#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
#endif
-
+#define WL_SOCKET_CLOSED (1 << 7)
#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
WL_SOCKET_WRITEABLE | \
- WL_SOCKET_CONNECTED)
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
typedef struct WaitEvent
{
@@ -180,5 +181,6 @@ extern int WaitLatchOrSocket(Latch *latch, int wakeEvents,
pgsocket sock, long timeout, uint32 wait_event_info);
extern void InitializeLatchWaitSet(void);
extern int GetNumRegisteredWaitEvents(WaitEventSet *set);
+extern bool WaitEventSetCanReportClosed(void);
#endif /* LATCH_H */
--
2.30.2
v4-0002-Don-t-let-latches-hide-other-WaitEventSet-events.patch (text/x-patch)
From 4edb07dbe87911de2ef0c30e57b766fcad022dff Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 13 Dec 2021 14:31:18 +1300
Subject: [PATCH v4 2/3] Don't let latches hide other WaitEventSet events.
Previously, if WaitEventSet() saw that a latch had been set in memory,
it would return an event immediately, hiding any other events that might
also be pending in the kernel.
With this change, latches still have higher priority than other events,
but if you ask for 3 events, you'll get up to 3 events that are
currently pending, even if the latch is set. This makes no difference
to any existing caller, because they all wait for just one event, so
they never reach the slow enter-the-kernel case.
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 43 +++++++++++++++++++++++----------
1 file changed, 30 insertions(+), 13 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 54e928c564..30f7160c61 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -1354,9 +1354,9 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
int rc;
/*
- * Check if the latch is set already. If so, leave the loop
- * immediately, avoid blocking again. We don't attempt to report any
- * other events that might also be satisfied.
+ * Check if the latch is set already first. We don't have to enter the
+ * kernel if it's already set, unless the caller wants more than one
+ * event.
*
* If someone sets the latch between this and the
* WaitEventSetWaitBlock() below, the setter will write a byte to the
@@ -1401,7 +1401,26 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
/* could have been set above */
set->latch->maybe_sleeping = false;
- break;
+ if (returned_events == nevents)
+ {
+ /*
+ * Already fully satisfied by this latch event, so there's no
+ * need to enter the kernel.
+ */
+ break;
+ }
+ else
+ {
+ /*
+ * The caller wants to poll other events too. Set timeout to
+ * zero, because we already have an event to report to the
+ * caller. WaitEventSetWaitBlock() won't double-report the
+ * latch, because we cleared latch->maybe_sleeping.
+ */
+ Assert(returned_events == 1);
+ Assert(nevents > 1);
+ cur_timeout = 0;
+ }
}
/*
@@ -1410,18 +1429,16 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
* to retry, everything >= 1 is the number of returned events.
*/
rc = WaitEventSetWaitBlock(set, cur_timeout,
- occurred_events, nevents);
+ occurred_events,
+ nevents - returned_events);
if (set->latch)
- {
- Assert(set->latch->maybe_sleeping);
set->latch->maybe_sleeping = false;
- }
if (rc == -1)
break; /* timeout occurred */
else
- returned_events = rc;
+ returned_events += rc;
/* If we're not done, update cur_timeout for next iteration */
if (returned_events == 0 && timeout >= 0)
@@ -1509,7 +1526,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
/* Drain the signalfd. */
drain();
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
@@ -1667,7 +1684,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
if (cur_event->events == WL_LATCH_SET &&
cur_kqueue_event->filter == EVFILT_SIGNAL)
{
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
@@ -1792,7 +1809,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
/* There's data in the self-pipe, clear it. */
drain();
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
@@ -1974,7 +1991,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
if (!ResetEvent(set->handles[cur_event->pos + 1]))
elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
--
2.30.2
v4-0003-Use-WL_SOCKET_CLOSED-for-client_connection_check_.patch (text/x-patch)
From 6920b741edc3e9bb5f0a573941fbc055b8d7a434 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:48:32 +1200
Subject: [PATCH v4 3/3] Use WL_SOCKET_CLOSED for
client_connection_check_interval.
Previously we used poll() directly to check for a POLLRDHUP event.
Instead, use the WaitEventSet API to poll the socket for
WL_SOCKET_CLOSED, which knows how to detect this condition on many more
operating systems.
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 6 +++---
src/backend/libpq/pqcomm.c | 33 +++++++++++++++------------------
src/backend/utils/misc/guc.c | 7 ++-----
src/include/libpq/libpq.h | 1 +
4 files changed, 21 insertions(+), 26 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4ac617615c..ab52d4bd2c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1012,9 +1012,9 @@ include_dir 'conf.d'
the kernel reports that the connection is closed.
</para>
<para>
- This option is currently available only on systems that support the
- non-standard <symbol>POLLRDHUP</symbol> extension to the
- <symbol>poll</symbol> system call, including Linux.
+ This option relies on kernel events exposed by Linux, macOS, illumos
+ and the BSD family of operating systems, and is not currently available
+ on other systems.
</para>
<para>
If the value is specified without units, it is taken as milliseconds.
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c37df09cf..d99ab09973 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -204,7 +204,7 @@ pq_init(void)
(errmsg("could not set socket to nonblocking mode: %m")));
#endif
- FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
+ FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, FeBeWaitSetNEvents);
socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
MyProcPort->sock, NULL, NULL);
latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
@@ -1959,33 +1959,30 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
- /*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
- */
- struct pollfd pollfd;
+ WaitEvent events[FeBeWaitSetNEvents];
int rc;
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
+ /*
+ * It's OK to modify the socket event filter without restoring, because all
+ * socket wait sites do the same.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
- rc = poll(&pollfd, 1, 0);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
if (rc < 0)
{
ereport(COMMERROR,
(errcode_for_socket_access(),
- errmsg("could not poll socket: %m")));
+ errmsg("could not check for closed socket: %m")));
return false;
}
- else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
- return false;
-#endif
+
+ for (int i = 0; i < rc; ++i)
+ {
+ if (events[i].events & WL_SOCKET_CLOSED)
+ return false;
+ }
return true;
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ee6a838b3a..ca45b36754 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -12169,14 +12169,11 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
static bool
check_client_connection_check_interval(int *newval, void **extra, GucSource source)
{
-#ifndef POLLRDHUP
- /* Linux only, for now. See pq_check_connection(). */
- if (*newval != 0)
+ if (!WaitEventSetCanReportClosed() && *newval != 0)
{
- GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on this platform");
return false;
}
-#endif
return true;
}
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index 6b67a2a318..9f168a284f 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -62,6 +62,7 @@ extern WaitEventSet *FeBeWaitSet;
#define FeBeWaitSetSocketPos 0
#define FeBeWaitSetLatchPos 1
+#define FeBeWaitSetNEvents 3
extern int StreamServerPort(int family, const char *hostName,
unsigned short portNumber, const char *unixSocketDir,
--
2.30.2
On Mon, Dec 13, 2021 at 5:51 PM Thomas Munro <thomas.munro@gmail.com> wrote:
[...] Everywhere else calls
with nevents == 1, so that's hypothetical.
Erm, I forgot about ExecAppendAsyncEventWait(), so I'd have to update
the commit message on that point, but it's hard to worry too much
about that case -- it's creating and destroying a WES every time. I'd
rather worry about fixing that problem.
Hi,
On 2021-12-13 17:51:00 +1300, Thomas Munro wrote:
On Sat, Dec 11, 2021 at 7:09 PM Thomas Munro <thomas.munro@gmail.com> wrote:
On Sat, Dec 11, 2021 at 6:11 PM Andres Freund <andres@anarazel.de> wrote:
Yuck. Is there really no better way to deal with this? What kind of errors is
this trying to handle transparently? Afaics this still changes when we'd
e.g. detect postmaster death.
The problem is that WaitEventSetWait() only reports the latch, if it's
set, so I removed it from the set (setting it to NULL), and then undo
that afterwards. Perhaps we could fix that root problem instead.
That is, we could make it so that latches aren't higher priority in
that way, ie don't hide other events[1]. Then I wouldn't have to
modify the WES here, I could just ignore it in the output event list
(and make sure that's big enough for all possible events, as I had it
in the last version). I'll think about that.
I tried that. It seems OK, and gets rid of the PG_FINALLY(), which is
nice. Latches still have higher priority, and still have the fast
return if already set and you asked for only one event, but now if you
ask for nevents > 1 we'll make the syscall too so we'll see the
WL_SOCKET_CLOSED.
Isn't a certain postgres committer that cares a lot about unnecessary syscalls
going to be upset about this one? Even with the nevents > 1 optimization? Yes,
right now there are no other paths that do so, but I don't like the corner this
paints us in.
From a different angle: Why do we need to perform the client connection check
if the latch is set?
Greetings,
Andres Freund
On Tue, Dec 14, 2021 at 7:53 AM Andres Freund <andres@anarazel.de> wrote:
On 2021-12-13 17:51:00 +1300, Thomas Munro wrote:
I tried that. It seems OK, and gets rid of the PG_FINALLY(), which is
nice. Latches still have higher priority, and still have the fast
return if already set and you asked for only one event, but now if you
ask for nevents > 1 we'll make the syscall too so we'll see the
WL_SOCKET_CLOSED.
Isn't a certain postgres committer that cares a lot about unnecessary syscalls
going to be upset about this one? Even with the nevents > 1 optimization? Yes,
right now there are no other paths that do so, but I don't like the corner this
paints us in.
Well, I was trying to avoid bikeshedding an API change just for a
hypothetical problem we could solve when the time comes (say, after
fixing the more egregious problems with Append's WES usage), but here
goes: we could do something like AddWaitEventToSet(FeBeWaitSet,
WL_LATCH_SET_LOPRIO, ...) that is translated to WL_LATCH_SET
internally but also sets a flag to enable this
no-really-please-poll-all-the-things-if-there-is-space behaviour.
From a different angle: Why do we need to perform the client connection check
if the latch is set?
Imagine a parallel message that arrives just as our connection check
CFI routine runs, and sets the latch. It'd be arbitrary and bizarre
if that caused us to skip the check. So we have to ignore it, and the
question is just how. I presented two different ways. A third way
would be to create an entirely new WES for this use case; nope, that's
either wasteful of an fd or wasteful of system calls for a temporary
WES and likely more PG_TRY() stuff to avoid leaking it. A fourth
could be to modify the WES like the simple code in v2/v3, but make the
WES code no-throw or pretend it's no-throw, which I didn't seriously
consider (I mean, if, say, WaitForMultipleObjects() returns
WAIT_FAILED and ERRORs then your session is pretty much hosed and your
WES is probably never going to work correctly again, but it still
seems to break basic rules of programming decency and exception safety
to leave the WES sans latch on non-local exit, which is why I added
the PG_FINALLY() that offended you to v3).
On Tue, Dec 14, 2021 at 11:18 AM Thomas Munro <thomas.munro@gmail.com> wrote:
On Tue, Dec 14, 2021 at 7:53 AM Andres Freund <andres@anarazel.de> wrote:
On 2021-12-13 17:51:00 +1300, Thomas Munro wrote:
I tried that. It seems OK, and gets rid of the PG_FINALLY(), which is
nice. Latches still have higher priority, and still have the fast
return if already set and you asked for only one event, but now if you
ask for nevents > 1 we'll make the syscall too so we'll see the
WL_SOCKET_CLOSED.
Isn't a certain postgres committer that cares a lot about unnecessary syscalls
going to be upset about this one? Even with the nevents > 1 optimization? Yes,
right now there are no other paths that do so, but I don't like the corner this
paints us in.
Well, I was trying to avoid bikeshedding an API change just for a
hypothetical problem we could solve when the time comes (say, after
fixing the more egregious problems with Append's WES usage), but here
goes: we could do something like AddWaitEventToSet(FeBeWaitSet,
WL_LATCH_SET_LOPRIO, ...) that is translated to WL_LATCH_SET
internally but also sets a flag to enable this
no-really-please-poll-all-the-things-if-there-is-space behaviour.
Here's one like that.
Attachments:
v5-0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patch (text/x-patch)
From 6b994807c29157ac7053ec347a51268d60f2b3b4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:38:40 +1200
Subject: [PATCH v5 1/3] Add WL_SOCKET_CLOSED for socket shutdown events.
Provide a way for WaitEventSet to report that the remote peer has shut
down its socket, independently of whether there is any buffered data
remaining to be read. This works only on systems where the kernel
exposes that information, namely:
* WAIT_USE_POLL builds using POLLRDHUP, if available
* WAIT_USE_EPOLL builds using EPOLLRDHUP
* WAIT_USE_KQUEUE builds using EV_EOF
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 79 +++++++++++++++++++++++++++++----
src/include/storage/latch.h | 6 ++-
2 files changed, 74 insertions(+), 11 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 1d893cf863..54e928c564 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -841,6 +841,7 @@ FreeWaitEventSet(WaitEventSet *set)
* - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
* can be combined with other WL_SOCKET_* events (on non-Windows
* platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
* - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
*
* Returns the offset in WaitEventSet->events (starting from 0), which can be
@@ -1043,12 +1044,16 @@ WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
else
{
Assert(event->fd != PGINVALID_SOCKET);
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
if (event->events & WL_SOCKET_READABLE)
epoll_ev.events |= EPOLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
}
/*
@@ -1087,12 +1092,18 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
else
{
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
pollfd->events = 0;
if (event->events & WL_SOCKET_READABLE)
pollfd->events |= POLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
}
Assert(event->fd != PGINVALID_SOCKET);
@@ -1165,7 +1176,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
Assert(event->events != WL_LATCH_SET || set->latch != NULL);
Assert(event->events == WL_LATCH_SET ||
event->events == WL_POSTMASTER_DEATH ||
- (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
if (event->events == WL_POSTMASTER_DEATH)
{
@@ -1188,9 +1201,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
* old event mask to the new event mask, since kevent treats readable
* and writable as separate events.
*/
- if (old_events & WL_SOCKET_READABLE)
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
old_filt_read = true;
- if (event->events & WL_SOCKET_READABLE)
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
new_filt_read = true;
if (old_events & WL_SOCKET_WRITEABLE)
old_filt_write = true;
@@ -1210,7 +1223,10 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
event);
}
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
@@ -1525,7 +1541,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd != PGINVALID_SOCKET);
@@ -1543,6 +1561,13 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -1668,7 +1693,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events++;
returned_events++;
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd >= 0);
@@ -1679,6 +1706,14 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_READABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
(cur_kqueue_event->filter == EVFILT_WRITE))
{
@@ -1789,7 +1824,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
int errflags = POLLHUP | POLLERR | POLLNVAL;
@@ -1809,6 +1846,15 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -2015,6 +2061,21 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
}
#endif
+/*
+ * Return whether the current build options can report WL_SOCKET_CLOSED.
+ */
+bool
+WaitEventSetCanReportClosed(void)
+{
+#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
+ defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_KQUEUE)
+ return true;
+#else
+ return false;
+#endif
+}
+
/*
* Get the number of wait events registered in a given WaitEventSet.
*/
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 44f9368c64..d78ff0bede 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -134,10 +134,11 @@ typedef struct Latch
/* avoid having to deal with case on platforms not requiring it */
#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
#endif
-
+#define WL_SOCKET_CLOSED (1 << 7)
#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
WL_SOCKET_WRITEABLE | \
- WL_SOCKET_CONNECTED)
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
typedef struct WaitEvent
{
@@ -180,5 +181,6 @@ extern int WaitLatchOrSocket(Latch *latch, int wakeEvents,
pgsocket sock, long timeout, uint32 wait_event_info);
extern void InitializeLatchWaitSet(void);
extern int GetNumRegisteredWaitEvents(WaitEventSet *set);
+extern bool WaitEventSetCanReportClosed(void);
#endif /* LATCH_H */
--
2.33.1
v5-0002-Add-support-for-lower-priority-latches.patch (text/x-patch)
From 05cc76fbbd4fd6bd7b154c4ab732df4e755117d1 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 13 Dec 2021 14:31:18 +1300
Subject: [PATCH v5 2/3] Add support for lower priority latches.
Normally, latch events that can be detected by reading shared memory are
reported immediately, without entering the kernel to poll for any other
conditions that might also be reportable.
Add a new variant of WL_LATCH_SET called WL_LATCH_SET_LOPRIO that
disables this high priority treatment, so that you can ask for all events
that are currently pending, even if a latch would normally hide any
others.
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 55 +++++++++++++++++++++++++--------
src/include/storage/latch.h | 1 +
2 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 54e928c564..8850ef740d 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -102,9 +102,13 @@ struct WaitEventSet
* said latch, and latch_pos the offset in the ->events array. This is
* useful because we check the state of the latch before performing doing
* syscalls related to waiting.
+ *
+ * WL_LATCH_SET_LOPRIO is converted to WL_LATCH_SET and latch_loprio is
+ * set.
*/
Latch *latch;
int latch_pos;
+ bool latch_loprio;
/*
* WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
@@ -833,6 +837,7 @@ FreeWaitEventSet(WaitEventSet *set)
/* ---
* Add an event to the set. Possible events are:
* - WL_LATCH_SET: Wait for the latch to be set
+ * - WL_LATCH_SET_LOPRIO: Wait for the latch to be set, lower priority
* - WL_POSTMASTER_DEATH: Wait for postmaster to die
* - WL_SOCKET_READABLE: Wait for socket to become readable,
* can be combined in one event with other WL_SOCKET_* events
@@ -868,6 +873,12 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
/* not enough space */
Assert(set->nevents < set->nevents_space);
+ if (events == WL_LATCH_SET_LOPRIO)
+ {
+ events = WL_LATCH_SET;
+ set->latch_loprio = true;
+ }
+
if (events == WL_EXIT_ON_PM_DEATH)
{
events = WL_POSTMASTER_DEATH;
@@ -1354,9 +1365,8 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
int rc;
/*
- * Check if the latch is set already. If so, leave the loop
- * immediately, avoid blocking again. We don't attempt to report any
- * other events that might also be satisfied.
+ * Check if the latch is set already first. We may be able to exit the
+ * loop and avoid a system call.
*
* If someone sets the latch between this and the
* WaitEventSetWaitBlock() below, the setter will write a byte to the
@@ -1401,7 +1411,28 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
/* could have been set above */
set->latch->maybe_sleeping = false;
- break;
+ if (!set->latch_loprio || returned_events == nevents)
+ {
+ /*
+ * Latch events normally have higher priority than other
+ * events: we report them immediately, rather than entering the
+ * kernel to look for more events that might be pending.
+ */
+ break;
+ }
+ else
+ {
+ /*
+ * The caller opted for lower priority latches, and there is
+ * space to poll for more events. Set timeout to zero, because
+ * we already have an event to report to the caller.
+ * WaitEventSetWaitBlock() won't double-report the latch,
+ * because we cleared latch->maybe_sleeping.
+ */
+ Assert(returned_events == 1);
+ Assert(nevents > 1);
+ cur_timeout = 0;
+ }
}
/*
@@ -1410,18 +1441,16 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
* to retry, everything >= 1 is the number of returned events.
*/
rc = WaitEventSetWaitBlock(set, cur_timeout,
- occurred_events, nevents);
+ occurred_events,
+ nevents - returned_events);
if (set->latch)
- {
- Assert(set->latch->maybe_sleeping);
set->latch->maybe_sleeping = false;
- }
if (rc == -1)
break; /* timeout occurred */
else
- returned_events = rc;
+ returned_events += rc;
/* If we're not done, update cur_timeout for next iteration */
if (returned_events == 0 && timeout >= 0)
@@ -1509,7 +1538,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
/* Drain the signalfd. */
drain();
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
@@ -1667,7 +1696,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
if (cur_event->events == WL_LATCH_SET &&
cur_kqueue_event->filter == EVFILT_SIGNAL)
{
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
@@ -1792,7 +1821,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
/* There's data in the self-pipe, clear it. */
drain();
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
@@ -1974,7 +2003,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
if (!ResetEvent(set->handles[cur_event->pos + 1]))
elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
- if (set->latch && set->latch->is_set)
+ if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
{
occurred_events->fd = PGINVALID_SOCKET;
occurred_events->events = WL_LATCH_SET;
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index d78ff0bede..4da655d052 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -139,6 +139,7 @@ typedef struct Latch
WL_SOCKET_WRITEABLE | \
WL_SOCKET_CONNECTED | \
WL_SOCKET_CLOSED)
+#define WL_LATCH_SET_LOPRIO (1 << 8)
typedef struct WaitEvent
{
--
2.33.1
v5-0003-Use-WL_SOCKET_CLOSED-for-client_connection_check_.patch (text/x-patch)
From 1aff9925294bdc50c4e87f9e38489dc1a9b8df97 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:48:32 +1200
Subject: [PATCH v5 3/3] Use WL_SOCKET_CLOSED for
client_connection_check_interval.
Previously we used poll() directly to check for a POLLRDHUP event.
Instead, use the WaitEventSet API to poll the socket for
WL_SOCKET_CLOSED, which knows how to detect this condition on many more
operating systems.
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 6 +++---
src/backend/libpq/pqcomm.c | 42 +++++++++++++++++++-----------------
src/backend/utils/misc/guc.c | 7 ++----
src/include/libpq/libpq.h | 1 +
4 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4ac617615c..ab52d4bd2c 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1012,9 +1012,9 @@ include_dir 'conf.d'
the kernel reports that the connection is closed.
</para>
<para>
- This option is currently available only on systems that support the
- non-standard <symbol>POLLRDHUP</symbol> extension to the
- <symbol>poll</symbol> system call, including Linux.
+ This option relies on kernel events exposed by Linux, macOS, illumos
+ and the BSD family of operating systems, and is not currently available
+ on other systems.
</para>
<para>
If the value is specified without units, it is taken as milliseconds.
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c37df09cf..4bbc8b89ff 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -204,11 +204,16 @@ pq_init(void)
(errmsg("could not set socket to nonblocking mode: %m")));
#endif
- FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
+ /*
+ * We'll use a low priority latch event. This allows pq_check_connection()
+ * to poll for other events without worrying about incidental latch events
+ * hiding them.
+ */
+ FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, FeBeWaitSetNEvents);
socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
MyProcPort->sock, NULL, NULL);
- latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
- MyLatch, NULL);
+ latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET_LOPRIO,
+ PGINVALID_SOCKET, MyLatch, NULL);
AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
NULL, NULL);
@@ -1959,33 +1964,30 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
- /*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
- */
- struct pollfd pollfd;
+ WaitEvent events[FeBeWaitSetNEvents];
int rc;
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
+ /*
+ * It's OK to modify the socket event filter without restoring, because all
+ * socket wait sites do the same.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
- rc = poll(&pollfd, 1, 0);
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
if (rc < 0)
{
ereport(COMMERROR,
(errcode_for_socket_access(),
- errmsg("could not poll socket: %m")));
+ errmsg("could not check for closed socket: %m")));
return false;
}
- else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
- return false;
-#endif
+
+ for (int i = 0; i < rc; ++i)
+ {
+ if (events[i].events & WL_SOCKET_CLOSED)
+ return false;
+ }
return true;
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ee6a838b3a..ca45b36754 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -12169,14 +12169,11 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
static bool
check_client_connection_check_interval(int *newval, void **extra, GucSource source)
{
-#ifndef POLLRDHUP
- /* Linux only, for now. See pq_check_connection(). */
- if (*newval != 0)
+ if (!WaitEventSetCanReportClosed() && *newval != 0)
{
- GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on this platform");
return false;
}
-#endif
return true;
}
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index 6b67a2a318..9f168a284f 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -62,6 +62,7 @@ extern WaitEventSet *FeBeWaitSet;
#define FeBeWaitSetSocketPos 0
#define FeBeWaitSetLatchPos 1
+#define FeBeWaitSetNEvents 3
extern int StreamServerPort(int family, const char *hostName,
unsigned short portNumber, const char *unixSocketDir,
--
2.33.1
On Tue, Dec 14, 2021 at 11:50 PM Thomas Munro <thomas.munro@gmail.com> wrote:
On Tue, Dec 14, 2021 at 11:18 AM Thomas Munro <thomas.munro@gmail.com> wrote:
Well, I was trying to avoid bikeshedding an API change just for a
hypothetical problem we could solve when the time comes (say, after
fixing the more egregious problems with Append's WES usage), but here
goes: we could do something like AddWaitEventToSet(FeBeWaitSet,
WL_LATCH_SET_LOPRIO, ...) that is translated to WL_LATCH_SET
internally but also sets a flag to enable this
no-really-please-poll-all-the-things-if-there-is-space behaviour.
That API is probably useless for anything else and is just too
complicated for what it's doing here.
I considered another idea we discussed: if we see a latch event, clear
it and try again so that other events can be revealed (rinse and
repeat), but remember if that happens and set the latch at the end. I
think that still requires PG_FINALLY() if you want to guarantee not to
eat a latch event if WaitEventSetWait() throws. This may be a
theoretical point because things must be pretty broken if
WaitEventSetWait() is throwing, but I don't like an egregious lack of
exception safety on principle.
So I think I had it better in the beginning: just mute the latch, and
then unmute it at the end in a PG_FINALLY() block. I'm back to
proposing that short and sweet version, this time with some minor
cleanup.
Attachments:
v6-0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patch (text/x-patch)
From 00d5c758588c6d0c23736898e5dc6aedc79c9315 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:38:40 +1200
Subject: [PATCH v6 1/2] Add WL_SOCKET_CLOSED for socket shutdown events.
Provide a way for WaitEventSet to report that the remote peer has shut
down its socket, independently of whether there is any buffered data
remaining to be read. This works only on systems where the kernel
exposes that information, namely:
* WAIT_USE_POLL builds using POLLRDHUP, if available
* WAIT_USE_EPOLL builds using EPOLLRDHUP
* WAIT_USE_KQUEUE builds using EV_EOF
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 79 +++++++++++++++++++++++++++++----
src/include/storage/latch.h | 6 ++-
2 files changed, 74 insertions(+), 11 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 1d893cf863..54e928c564 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -841,6 +841,7 @@ FreeWaitEventSet(WaitEventSet *set)
* - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
* can be combined with other WL_SOCKET_* events (on non-Windows
* platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
* - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
*
* Returns the offset in WaitEventSet->events (starting from 0), which can be
@@ -1043,12 +1044,16 @@ WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
else
{
Assert(event->fd != PGINVALID_SOCKET);
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
if (event->events & WL_SOCKET_READABLE)
epoll_ev.events |= EPOLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
}
/*
@@ -1087,12 +1092,18 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
else
{
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
pollfd->events = 0;
if (event->events & WL_SOCKET_READABLE)
pollfd->events |= POLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
}
Assert(event->fd != PGINVALID_SOCKET);
@@ -1165,7 +1176,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
Assert(event->events != WL_LATCH_SET || set->latch != NULL);
Assert(event->events == WL_LATCH_SET ||
event->events == WL_POSTMASTER_DEATH ||
- (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
if (event->events == WL_POSTMASTER_DEATH)
{
@@ -1188,9 +1201,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
* old event mask to the new event mask, since kevent treats readable
* and writable as separate events.
*/
- if (old_events & WL_SOCKET_READABLE)
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
old_filt_read = true;
- if (event->events & WL_SOCKET_READABLE)
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
new_filt_read = true;
if (old_events & WL_SOCKET_WRITEABLE)
old_filt_write = true;
@@ -1210,7 +1223,10 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
event);
}
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
@@ -1525,7 +1541,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd != PGINVALID_SOCKET);
@@ -1543,6 +1561,13 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -1668,7 +1693,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events++;
returned_events++;
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd >= 0);
@@ -1679,6 +1706,14 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_READABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
(cur_kqueue_event->filter == EVFILT_WRITE))
{
@@ -1789,7 +1824,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
int errflags = POLLHUP | POLLERR | POLLNVAL;
@@ -1809,6 +1846,15 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -2015,6 +2061,21 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
}
#endif
+/*
+ * Return whether the current build options can report WL_SOCKET_CLOSED.
+ */
+bool
+WaitEventSetCanReportClosed(void)
+{
+#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
+ defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_KQUEUE)
+ return true;
+#else
+ return false;
+#endif
+}
+
/*
* Get the number of wait events registered in a given WaitEventSet.
*/
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 44f9368c64..d78ff0bede 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -134,10 +134,11 @@ typedef struct Latch
/* avoid having to deal with case on platforms not requiring it */
#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
#endif
-
+#define WL_SOCKET_CLOSED (1 << 7)
#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
WL_SOCKET_WRITEABLE | \
- WL_SOCKET_CONNECTED)
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
typedef struct WaitEvent
{
@@ -180,5 +181,6 @@ extern int WaitLatchOrSocket(Latch *latch, int wakeEvents,
pgsocket sock, long timeout, uint32 wait_event_info);
extern void InitializeLatchWaitSet(void);
extern int GetNumRegisteredWaitEvents(WaitEventSet *set);
+extern bool WaitEventSetCanReportClosed(void);
#endif /* LATCH_H */
--
2.33.1
v6-0002-Use-WL_SOCKET_CLOSED-for-client_connection_check_.patch (text/x-patch)
From 494c949b3a9fe9704bb99689a3dc375fa5156f16 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:48:32 +1200
Subject: [PATCH v6 2/2] Use WL_SOCKET_CLOSED for
client_connection_check_interval.
Previously we used poll() directly to check for a POLLRDHUP event.
Instead, use the WaitEventSet API to poll the socket for
WL_SOCKET_CLOSED, which knows how to detect this condition on many more
operating systems.
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 6 ++--
src/backend/libpq/pqcomm.c | 54 +++++++++++++++++++++---------------
src/backend/utils/misc/guc.c | 7 ++---
src/include/libpq/libpq.h | 1 +
4 files changed, 37 insertions(+), 31 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index afbb6c35e3..4a6a869460 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1012,9 +1012,9 @@ include_dir 'conf.d'
the kernel reports that the connection is closed.
</para>
<para>
- This option is currently available only on systems that support the
- non-standard <symbol>POLLRDHUP</symbol> extension to the
- <symbol>poll</symbol> system call, including Linux.
+ This option relies on kernel events exposed by Linux, macOS, illumos
+ and the BSD family of operating systems, and is not currently available
+ on other systems.
</para>
<para>
If the value is specified without units, it is taken as milliseconds.
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 4c37df09cf..3b701998b7 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -204,7 +204,7 @@ pq_init(void)
(errmsg("could not set socket to nonblocking mode: %m")));
#endif
- FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
+ FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, FeBeWaitSetNEvents);
socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
MyProcPort->sock, NULL, NULL);
latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
@@ -1959,33 +1959,41 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
- /*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
- */
- struct pollfd pollfd;
+ WaitEvent events[FeBeWaitSetNEvents];
+ bool result = true;
int rc;
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
+ /*
+ * It's OK to modify the socket event filter without restoring, because
+ * all FeBeWaitSet socket wait sites do the same.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
- rc = poll(&pollfd, 1, 0);
+ /*
+ * Temporarily silence the latch, because its higher priority event might
+ * hide the socket event we want to poll for.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, NULL);
- if (rc < 0)
+ PG_TRY();
{
- ereport(COMMERROR,
- (errcode_for_socket_access(),
- errmsg("could not poll socket: %m")));
- return false;
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
+ for (int i = 0; i < rc; ++i)
+ {
+ if (events[i].events & WL_SOCKET_CLOSED)
+ result = false;
+ }
}
- else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
- return false;
-#endif
+ PG_FINALLY();
+ {
+ /*
+ * If WaitEventSetWait() reports an error, something must be pretty
+ * seriously wrong, but we should restore the latch on principle.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET,
+ MyLatch);
+ }
+ PG_END_TRY();
- return true;
+ return result;
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f9504d3aec..57292b2364 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -12129,14 +12129,11 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
static bool
check_client_connection_check_interval(int *newval, void **extra, GucSource source)
{
-#ifndef POLLRDHUP
- /* Linux only, for now. See pq_check_connection(). */
- if (*newval != 0)
+ if (!WaitEventSetCanReportClosed() && *newval != 0)
{
- GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on this platform");
return false;
}
-#endif
return true;
}
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index 6b67a2a318..9f168a284f 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -62,6 +62,7 @@ extern WaitEventSet *FeBeWaitSet;
#define FeBeWaitSetSocketPos 0
#define FeBeWaitSetLatchPos 1
+#define FeBeWaitSetNEvents 3
extern int StreamServerPort(int family, const char *hostName,
unsigned short portNumber, const char *unixSocketDir,
--
2.33.1
Hi,
On 2022-01-11 22:59:13 +1300, Thomas Munro wrote:
I considered another idea we discussed: if we see a latch event, clear
it and try again so that other events can be revealed (rinse and
repeat), but remember if that happens and set the latch at the end. I
think that still requires PG_FINALLY() if you want to guarantee not to
eat a latch event if WaitEventSetWait() throws. This may be a
theoretical point because things must be pretty broken if
WaitEventSetWait() is throwing, but I don't like an egregious lack of
exception safety on principle.
I don't think this is a problem. Not because of WaitEventSetWait() never
throwing, but because it's "just fine" to have reset the latch in that case.
The error will cause control flow to transfer to the next PG_CATCH site. The
point of latches is to avoid racy checks for events (or sleeps with short
timeouts to handle the races). The documented pattern is:
* for (;;)
* {
*     ResetLatch();
*     if (work to do)
*         Do Stuff();
*     WaitLatch();
* }
Latches only work if there's a very limited amount of things happening between
the if (work_to_do) and the WaitLatch().
It definitely is *NOT* ok to do something like:
for (;;)
{
    ResetLatch()
    if (work_to_do)
        DoStuff();
    PG_TRY():
        something_that_may_throw();
    PG_CATCH():
        something_not_throwing();
    WaitLatch();
}
For one, elog.c related code might have actually done network IO! During which
the latch very well might be reset. So there's just no way any remotely
reasonable code can rely on preserving latch state across errors.
I considered another idea we discussed: if we see a latch event, clear
it and try again so that other events can be revealed (rinse and
repeat), but remember if that happens and set the latch at the end.
The more I think about it, the less I see why we *ever* need to re-arm the
latch in pq_check_connection() in this approach. pq_check_connection() is only
used from ProcessInterrupts(), and there are plenty of things inside
ProcessInterrupts() that can cause latches to be reset (e.g. parallel message
processing causing log messages to be sent to the client, causing network IO,
which obviously can do a latch reset).
It makes sense to use CFI() in a latch loop, but it would have to be at the
bottom or top, not between if (work_to_do) and WaitLatch().
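For illustration, a minimal sketch of a placement that keeps that property (work_to_do, DoStuff() and PG_WAIT_EXTENSION are just placeholders for whatever the real loop uses):

for (;;)
{
    CHECK_FOR_INTERRUPTS();     /* at the top (or the bottom) of the loop */
    ResetLatch(MyLatch);
    if (work_to_do)
        DoStuff();
    WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1,
              PG_WAIT_EXTENSION);
}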
Greetings,
Andres Freund
On Fri, Jan 14, 2022 at 4:35 PM Andres Freund <andres@anarazel.de> wrote:
The more I think about it, the less I see why we *ever* need to re-arm the
latch in pq_check_connection() in this approach. pq_check_connection() is only
used from ProcessInterrupts(), and there are plenty of things inside
ProcessInterrupts() that can cause latches to be reset (e.g. parallel message
processing causing log messages to be sent to the client, causing network IO,
which obviously can do a latch reset).
Thanks for the detailed explanation. I guess I was being overly
cautious and a little myopic, "leave things exactly the way you found
them", so I didn't have to think about any of that. I see now that
the scenario I was worrying about would be a bug in whatever
latch-wait loop happens to reach this code. Alright then, here is
just... one... more... patch, this time consuming any latch that gets
in the way and retrying, with no restore.
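Condensed from the attached v7-0002 patch, pq_check_connection() ends up looking like this:

bool
pq_check_connection(void)
{
    WaitEvent   events[FeBeWaitSetNEvents];
    int         rc;

    /*
     * Ask for closure events on the socket.  No need to restore the filter,
     * because all FeBeWaitSet socket wait sites modify it the same way.
     */
    ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);

retry:
    rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
    for (int i = 0; i < rc; ++i)
    {
        if (events[i].events & WL_SOCKET_CLOSED)
            return false;
        if (events[i].events & WL_LATCH_SET)
        {
            /*
             * A latch event might hide the socket event, so consume it and
             * poll again.  No restore: latches aren't expected to survive
             * CHECK_FOR_INTERRUPTS().
             */
            ResetLatch(MyLatch);
            goto retry;
        }
    }
    return true;
}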
Attachments:
v7-0001-Add-WL_SOCKET_CLOSED-for-socket-shutdown-events.patch (text/x-patch)
From d0399282e73ebd47dbead19b56586fff8ba3e9d2 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:38:40 +1200
Subject: [PATCH v7 1/2] Add WL_SOCKET_CLOSED for socket shutdown events.
Provide a way for WaitEventSet to report that the remote peer has shut
down its socket, independently of whether there is any buffered data
remaining to be read. This works only on systems where the kernel
exposes that information, namely:
* WAIT_USE_POLL builds using POLLRDHUP, if available
* WAIT_USE_EPOLL builds using EPOLLRDHUP
* WAIT_USE_KQUEUE builds using EV_EOF
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
src/backend/storage/ipc/latch.c | 79 +++++++++++++++++++++++++++++----
src/include/storage/latch.h | 6 ++-
2 files changed, 74 insertions(+), 11 deletions(-)
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 61c876beff..9a498b0f12 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -841,6 +841,7 @@ FreeWaitEventSet(WaitEventSet *set)
* - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
* can be combined with other WL_SOCKET_* events (on non-Windows
* platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
* - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
*
* Returns the offset in WaitEventSet->events (starting from 0), which can be
@@ -1043,12 +1044,16 @@ WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
else
{
Assert(event->fd != PGINVALID_SOCKET);
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
if (event->events & WL_SOCKET_READABLE)
epoll_ev.events |= EPOLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
}
/*
@@ -1087,12 +1092,18 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
else
{
- Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
pollfd->events = 0;
if (event->events & WL_SOCKET_READABLE)
pollfd->events |= POLLIN;
if (event->events & WL_SOCKET_WRITEABLE)
pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
}
Assert(event->fd != PGINVALID_SOCKET);
@@ -1165,7 +1176,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
Assert(event->events != WL_LATCH_SET || set->latch != NULL);
Assert(event->events == WL_LATCH_SET ||
event->events == WL_POSTMASTER_DEATH ||
- (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
if (event->events == WL_POSTMASTER_DEATH)
{
@@ -1188,9 +1201,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
* old event mask to the new event mask, since kevent treats readable
* and writable as separate events.
*/
- if (old_events & WL_SOCKET_READABLE)
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
old_filt_read = true;
- if (event->events & WL_SOCKET_READABLE)
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
new_filt_read = true;
if (old_events & WL_SOCKET_WRITEABLE)
old_filt_write = true;
@@ -1210,7 +1223,10 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
event);
}
- Assert(count > 0);
+ /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
Assert(count <= 2);
rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
@@ -1525,7 +1541,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd != PGINVALID_SOCKET);
@@ -1543,6 +1561,13 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -1668,7 +1693,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events++;
returned_events++;
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
Assert(cur_event->fd >= 0);
@@ -1679,6 +1706,14 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_READABLE;
}
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
(cur_kqueue_event->filter == EVFILT_WRITE))
{
@@ -1789,7 +1824,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
returned_events++;
}
}
- else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
{
int errflags = POLLHUP | POLLERR | POLLNVAL;
@@ -1809,6 +1846,15 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
occurred_events->events |= WL_SOCKET_WRITEABLE;
}
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
if (occurred_events->events != 0)
{
occurred_events->fd = cur_event->fd;
@@ -2015,6 +2061,21 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
}
#endif
+/*
+ * Return whether the current build options can report WL_SOCKET_CLOSED.
+ */
+bool
+WaitEventSetCanReportClosed(void)
+{
+#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
+ defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_KQUEUE)
+ return true;
+#else
+ return false;
+#endif
+}
+
/*
* Get the number of wait events registered in a given WaitEventSet.
*/
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 3aa7b33834..0dd79d73fa 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -134,10 +134,11 @@ typedef struct Latch
/* avoid having to deal with case on platforms not requiring it */
#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
#endif
-
+#define WL_SOCKET_CLOSED (1 << 7)
#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
WL_SOCKET_WRITEABLE | \
- WL_SOCKET_CONNECTED)
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
typedef struct WaitEvent
{
@@ -180,5 +181,6 @@ extern int WaitLatchOrSocket(Latch *latch, int wakeEvents,
pgsocket sock, long timeout, uint32 wait_event_info);
extern void InitializeLatchWaitSet(void);
extern int GetNumRegisteredWaitEvents(WaitEventSet *set);
+extern bool WaitEventSetCanReportClosed(void);
#endif /* LATCH_H */
--
2.33.1
v7-0002-Use-WL_SOCKET_CLOSED-for-client_connection_check_.patch (text/x-patch)
From d0d472018c1c42a1c4a5233ca18718bc90a82448 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Apr 2021 10:48:32 +1200
Subject: [PATCH v7 2/2] Use WL_SOCKET_CLOSED for
client_connection_check_interval.
Previously we used poll() directly to check for a POLLRDHUP event.
Instead, use the WaitEventSet API to poll the socket for
WL_SOCKET_CLOSED, which knows how to detect this condition on many more
operating systems.
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Reviewed-by: Maksim Milyutin <milyutinma@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru
---
doc/src/sgml/config.sgml | 6 ++---
src/backend/libpq/pqcomm.c | 46 ++++++++++++++++++------------------
src/backend/utils/misc/guc.c | 7 ++----
src/include/libpq/libpq.h | 1 +
4 files changed, 29 insertions(+), 31 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index afbb6c35e3..4a6a869460 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1012,9 +1012,9 @@ include_dir 'conf.d'
the kernel reports that the connection is closed.
</para>
<para>
- This option is currently available only on systems that support the
- non-standard <symbol>POLLRDHUP</symbol> extension to the
- <symbol>poll</symbol> system call, including Linux.
+ This option relies on kernel events exposed by Linux, macOS, illumos
+ and the BSD family of operating systems, and is not currently available
+ on other systems.
</para>
<para>
If the value is specified without units, it is taken as milliseconds.
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index f05723dc92..1c6e096500 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -204,7 +204,7 @@ pq_init(void)
(errmsg("could not set socket to nonblocking mode: %m")));
#endif
- FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
+ FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, FeBeWaitSetNEvents);
socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
MyProcPort->sock, NULL, NULL);
latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
@@ -1959,33 +1959,33 @@ pq_settcpusertimeout(int timeout, Port *port)
bool
pq_check_connection(void)
{
-#if defined(POLLRDHUP)
- /*
- * POLLRDHUP is a Linux extension to poll(2) to detect sockets closed by
- * the other end. We don't have a portable way to do that without
- * actually trying to read or write data on other systems. We don't want
- * to read because that would be confused by pipelined queries and COPY
- * data. Perhaps in future we'll try to write a heartbeat message instead.
- */
- struct pollfd pollfd;
+ WaitEvent events[FeBeWaitSetNEvents];
int rc;
- pollfd.fd = MyProcPort->sock;
- pollfd.events = POLLOUT | POLLIN | POLLRDHUP;
- pollfd.revents = 0;
-
- rc = poll(&pollfd, 1, 0);
+ /*
+ * It's OK to modify the socket event filter without restoring, because
+ * all FeBeWaitSet socket wait sites do the same.
+ */
+ ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, WL_SOCKET_CLOSED, NULL);
- if (rc < 0)
+retry:
+ rc = WaitEventSetWait(FeBeWaitSet, 0, events, lengthof(events), 0);
+ for (int i = 0; i < rc; ++i)
{
- ereport(COMMERROR,
- (errcode_for_socket_access(),
- errmsg("could not poll socket: %m")));
- return false;
+ if (events[i].events & WL_SOCKET_CLOSED)
+ return false;
+ if (events[i].events & WL_LATCH_SET)
+ {
+ /*
+ * A latch event might be preventing other events from being
+ * reported. Reset it and poll again. No need to restore it
+ * because no code should expect latches to survive across
+ * CHECK_FOR_INTERRUPTS().
+ */
+ ResetLatch(MyLatch);
+ goto retry;
+ }
}
- else if (rc == 1 && (pollfd.revents & (POLLHUP | POLLRDHUP)))
- return false;
-#endif
return true;
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6fc5cbc09a..21c9c04594 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -12129,14 +12129,11 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
static bool
check_client_connection_check_interval(int *newval, void **extra, GucSource source)
{
-#ifndef POLLRDHUP
- /* Linux only, for now. See pq_check_connection(). */
- if (*newval != 0)
+ if (!WaitEventSetCanReportClosed() && *newval != 0)
{
- GUC_check_errdetail("client_connection_check_interval must be set to 0 on platforms that lack POLLRDHUP.");
+ GUC_check_errdetail("client_connection_check_interval must be set to 0 on this platform");
return false;
}
-#endif
return true;
}
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index f0786e08b4..d348a55812 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -62,6 +62,7 @@ extern WaitEventSet *FeBeWaitSet;
#define FeBeWaitSetSocketPos 0
#define FeBeWaitSetLatchPos 1
+#define FeBeWaitSetNEvents 3
extern int StreamServerPort(int family, const char *hostName,
unsigned short portNumber, const char *unixSocketDir,
--
2.33.1
On Fri, Jan 14, 2022 at 7:30 PM Thomas Munro <thomas.munro@gmail.com> wrote:
On Fri, Jan 14, 2022 at 4:35 PM Andres Freund <andres@anarazel.de> wrote:
The more I think about it, the less I see why we *ever* need to re-arm the
latch in pq_check_connection() in this approach. pq_check_connection() is only
used from ProcessInterrupts(), and there are plenty of things inside
ProcessInterrupts() that can cause latches to be reset (e.g. parallel message
processing causing log messages to be sent to the client, causing network IO,
which obviously can do a latch reset).
Thanks for the detailed explanation. I guess I was being overly
cautious and a little myopic, "leave things exactly the way you found
them", so I didn't have to think about any of that. I see now that
the scenario I was worrying about would be a bug in whatever
latch-wait loop happens to reach this code. Alright then, here is
just... one... more... patch, this time consuming any latch that gets
in the way and retrying, with no restore.
And pushed.
My excuse for taking so long to get this into the tree is that it was
tedious to retest this thing across so many OSes and determine that it
really does behave reliably for killed processes AND lost
processes/yanked cables/keepalive timeout, even with buffered data.
In the process I learned a bit more about TCP and got POLLRDHUP added
to FreeBSD (not that it matters for PostgreSQL 15, now that we can use
EV_EOF). As for the FD_CLOSE behaviour I thought I saw on Windows
upthread: it was a mirage, caused by the RST thing. There may be some
other way to implement this feature on that TCP implementation, but I
don't know what it is.