From 630c7d5e769dda6676eeef4ad56f8ed8ab9878e0 Mon Sep 17 00:00:00 2001 From: "suyu.cmj" Date: Thu, 9 Sep 2021 07:37:01 +0000 Subject: [PATCH] Handle cancel query interrupts during client read/write --- src/backend/tcop/postgres.c | 37 ++++++++ src/test/perl/PostgresNode.pm | 30 ++++++ .../t/026_recovery_stuck_on_client_write.pl | 102 +++++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 src/test/recovery/t/026_recovery_stuck_on_client_write.pl diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 58b5960..a7570f4 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -206,6 +206,7 @@ static void drop_unnamed_stmt(void); static void log_disconnections(int code, Datum arg); static void enable_statement_timeout(void); static void disable_statement_timeout(void); +static void ProcessClientReadWriteCancelInterrupt(void); /* ---------------------------------------------------------------- @@ -493,6 +494,9 @@ ProcessClientReadInterrupt(bool blocked) { int save_errno = errno; + /* Process cancel query interrupts */ + ProcessClientReadWriteCancelInterrupt(); + if (DoingCommandRead) { /* Check for general interrupts that arrived before/while reading */ @@ -539,6 +543,9 @@ ProcessClientWriteInterrupt(bool blocked) { int save_errno = errno; + /* Process cancel query interrupts */ + ProcessClientReadWriteCancelInterrupt(); + if (ProcDiePending) { /* @@ -4958,3 +4965,33 @@ disable_statement_timeout(void) if (get_timeout_active(STATEMENT_TIMEOUT)) disable_timeout(STATEMENT_TIMEOUT, false); } + +/* + * process cancel query request during client read/write + * in original way, cancel query requests are all ignored during client read/write, + * which is not appropriate. for example, if a query which generated recovery conflict + * can't be canceled when it is reading from/writing to client, the recovery process of + * startup will be delay infinitely by a stuck client. Improve it in this way: + * 1) if cancel query request is from db itself, then we handle it right now, and cancel request + * is treated as a terminate request, connection will be terminated; + * 2) if cancel query request is from client, then we ignore it as original way, client needs + * to send a termiante request to cancel it truly. + */ +static void +ProcessClientReadWriteCancelInterrupt(void) +{ + bool handle_cancel_request = false; + bool stmt_timeout_occurred; + bool lock_timeout_occurred; + + lock_timeout_occurred = get_timeout_indicator(LOCK_TIMEOUT, true); + stmt_timeout_occurred = get_timeout_indicator(STATEMENT_TIMEOUT, true); + + /* cancel request need to be handled */ + if (lock_timeout_occurred || stmt_timeout_occurred || RecoveryConflictPending) + handle_cancel_request = true; + + /* treat cancel request as terminate request */ + if (!ProcDiePending && QueryCancelPending && handle_cancel_request) + ProcDiePending = true; +} \ No newline at end of file diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index c59da75..82da7f6 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -844,6 +844,36 @@ sub start =pod +=item $node->find_child(process_name) +Find child process base on process name + +=cut + +sub find_child +{ + my ($self, $process_name) = @_; + my $pid=0; + my @childs=`ps -o pid,cmd --ppid $self->{_pid}` or die "can't run ps! $! \n"; + + foreach my $child (@childs) + { + $child =~ s/^\s+|\s+$//g; + my $pos = index($child, $process_name); + if ($pos > 0) + { + $pos = index($child, ' '); + $pid = substr($child, 0, $pos); + $pid =~ s/^\s+|\s+$//g; + print "### Killing child process \"$pid\", \"$child\" using signal 9\n"; + last; + } + } + + return $pid; +} + +=pod + =item $node->kill9() Send SIGKILL (signal 9) to the postmaster. diff --git a/src/test/recovery/t/026_recovery_stuck_on_client_write.pl b/src/test/recovery/t/026_recovery_stuck_on_client_write.pl new file mode 100644 index 0000000..fb227c5 --- /dev/null +++ b/src/test/recovery/t/026_recovery_stuck_on_client_write.pl @@ -0,0 +1,102 @@ +# Test for backend process gets stuck on client_write +use strict; +use warnings; + +use PostgresNode; +use TestLib; +use IPC::Run; +use Test::More tests=>5; + +# primary node +my $node_primary = PostgresNode->new('primary'); +$node_primary->init( + allows_streaming => 1, + auth_extra => [ '--create-role', 'repl_role' ]); +$node_primary->start; + +# standby node +my $backup_name = 'my_backup'; +# Take backup +$node_primary->backup($backup_name); +# Create streaming standby linking to primary +my $node_standby = PostgresNode->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +# start standby +$node_standby->append_conf('postgresql.conf', "max_standby_streaming_delay = 5000"); +$node_standby->append_conf('postgresql.conf', "max_standby_archive_delay = 5000"); +$node_standby->append_conf('postgresql.conf', "hot_standby_feedback = off"); +$node_standby->append_conf('postgresql.conf', "logging_collector = on"); +$node_standby->append_conf('postgresql.conf', "log_directory = 'log'"); +$node_standby->start; + +# Wait for standbys to catch up +$node_primary->safe_psql('postgres', 'CREATE TABLE test_table(val integer);'); +# insert more data so that the output buffer can be filled up when select from the table +$node_primary->safe_psql('postgres', "INSERT INTO test_table(val) SELECT generate_series(1,10000000) as newwal"); +$node_primary->safe_psql('postgres', "checkpoint;"); +$node_primary->wait_for_catchup($node_standby, 'replay', $node_primary->lsn('insert')); +my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM test_table"); +print "standby result: $result\n"; +ok($result == 10000000, 'check streamed content on standby'); + +my $connstr = $node_primary->connstr; +my @res = `nohup psql -d "$connstr" -c "begin; select pg_sleep(2); select * from test_table;" > tmp_check/myout.file 2>&1 &`; +my $host = $node_primary->host; +my $port = $node_primary->port; +my $client = readpipe("ps -ef | grep \"psql\" | grep \"$host\" | grep \"$port\" | grep -v grep | awk '{print \$2}'"); +# stop client and backend will get stuck on secure_write() +print "ready to stop primary client pid: $client\n"; +sleep 1; +@res = `kill -stop $client`; +my $backend = $node_primary->find_child("SELECT"); +print "backend pid:$backend\n"; + +# cancel the query +sleep 2; +$node_primary->safe_psql('postgres', "select pg_cancel_backend($backend)"); +# check the query +my $pid_new = $node_primary->find_child("SELECT"); +print "backend pid:$pid_new\n"; +ok($pid_new == $backend, "backend can't be canceled"); +@res = `kill -cont $client`; +@res = `kill -s 9 $client`; + +# create recovery conflict +$connstr = $node_standby->connstr; +@res = `nohup psql -d "$connstr" -c "begin; select pg_sleep(2); select * from test_table;" > tmp_check/myout.file 2>&1 &`; +$host = $node_standby->host; +$port = $node_standby->port; +$client = readpipe("ps -ef | grep \"psql\" | grep \"$host\" | grep \"$port\" | grep -v grep | awk '{print \$2}'"); +print "ready to stop standby client pid: $client\n"; +sleep 1; +@res = `kill -stop $client`; + +# delete data and do vacuum in primary node +$node_primary->safe_psql("postgres", "delete from test_table"); +$node_primary->safe_psql("postgres", "vacuum test_table"); +# cancel the query +$backend = $node_standby->find_child("SELECT"); +$node_standby->safe_psql('postgres', "select pg_cancel_backend($backend)"); +# check the query +sleep 2; +$pid_new = $node_standby->find_child("SELECT"); +print "backend pid:$pid_new\n"; +ok($pid_new == $backend, "backend can't be canceled by user"); + +# wait until recovery conflict generate +sleep 3; +$pid_new = $node_standby->find_child("SELECT"); +print "backend pid:$pid_new\n"; +ok($pid_new == 0, "backend is canceled"); +@res = `kill -cont $client`; +@res = `kill -s 9 $client`; + +my $basedir = $node_standby->basedir(); +my $logdir = "$basedir/pgdata/log"; +my @exits = `grep -rn "terminating connection due to conflict with recovery" $logdir`; +my $found = @exits; +ok($found == 1, "backend is canceled due to recovery conflict"); + +$node_standby->stop; +$node_primary->stop; -- 1.8.3.1