PROBLEM Service Alert: hostname/check_postgres_old_transaction is CRITICAL **
Hi all,
From the past few days i am getting this old transation alert from some of
my DB slaves. I am using EDB9.2.7.18 on on CentOS6.5 ( Final ). From my
app boxes i got some of the transactions remains stuck for 2-3 hours which
took ~ < 60 seconds in database. Below are the details :-
-- DB execution time
explain analyze select * from appusage_func('2014-09-19 15:26:15.662');
-[ RECORD 1
]--------------------------------------------------------------------------------------------------------------------------------------
QUERY PLAN | Function Scan on appusage_func (cost=0.25..10.25 rows=1000
width=12) (actual time=34060.657..34813.542 rows=7362310 loops=1)
-[ RECORD 2
]--------------------------------------------------------------------------------------------------------------------------------------
QUERY PLAN | Total runtime: 35174.296 ms
It returns approx *311 MB* data to the client servers.
root > netstat -p | grep 45355
tcp 0 1531648 localhost:5499
localhost:48848 ESTABLISHED 45355/postgres
root > strace -p 45355
-- Remain stuck for *2-3 *hours
read(306,
"\30\0kB\3\0\0\0\0\0\377\5\0\0\26\0\0\0\2\0\0\0\30\0+\264\3\0\0\0\0\0"...,
8192) = 8192
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,
8192, 0, NULL, 0
Then after 2-3 hours it got completed automatically with below stacktrace :
# strace -p 45355
Process 45355 attached - interrupt to quit
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,
8192, 0, NULL, 0
) = -1 ECONNRESET (Connection reset by peer)
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No
such file or directory)
write(2, "\0\0J\1+\261\0\0t\35\230\2\1\0\0\0002014-09-19 18:04"..., 346) =
346
write(2, "\0\0.\1+\261\0\0t18:\1\0\0\0002014-09-19 18:04"..., 318) = 318
close(306) = 0
stat("base/pgsql_tmp/pgsql_tmp45355.36", {st_mode=S_IFREG|0600,
st_size=161970820, ...}) = 0
unlink("base/pgsql_tmp/pgsql_tmp45355.36") = 0
sendto(9, "\17\0\0\0\30\0\0\0\230\275\22\0\0\0\0\0\204z\247\t\0\0\0\0", 24,
0, NULL, 0) = 24
write(2, "\0\0\256\0+\261\0\0tFf\0\1\0\0\0002014-09-19 18:04"..., 190) = 190
close(305) = 0
close(264) = 0
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\1\0\0\0\1\0\0\0b\366\7\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0x\1\0\0\230\275\22\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
376, 0, NULL, 0) = 376
sendto(9,
"\f\0\0\0000\0\0\0\230\275\22\0\1\0\0\0e\346_\0002047\1\0\0\0\0\0\0\0"...,
48, 0, NULL, 0) = 48
write(2, "\0\0\325\0+\261\0\0t\35\230\2\1\0\0\0002014-09-19 18:04"..., 229)
= 229
exit_group(1) = ?
Process 45355 detached
I am sure there will be network issue but what might be the reason for
that. It comes 2-3 times daily.
Please let me know if any other information is required.
Thanks
On 2014-9-19 20:33 , Adarsh Sharma wrote:
It returns approx *311 MB* data to the client servers.
root > netstat -p | grep 45355
tcp 0 1531648 localhost:5499 localhost:48848 ESTABLISHED 45355/postgresroot > strace -p 45355
-- Remain stuck for *2-3 *hours
read(306, "\30\0kB\3\0\0\0\0\0\377\5\0\0\26\0\0\0\2\0\0\0\30\0+\264\3\0\0\0\0\0"..., 8192) = 8192
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"..., 8192, 0, NULL, 0Then after 2-3 hours it got completed automatically with below stacktrace :
# strace -p 45355
Process 45355 attached - interrupt to quit
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"..., 8192, 0, NULL, 0) = -1 ECONNRESET (Connection reset by peer)
I'd guess that the receiving program crashes. It might not be able to stomach the 311 MB that's being sent to it, maybe it's expanding, and swapping, until it's killed by the OOM killer?
Doesn't look like a postgres problem to me. postgres is stuck trying to send data... try stracing the client to see what it does? (use "lsof -i :48848" or whatever the port number of the remote is to find the pid).
--
Jan-Pieter Cornet <johnpc@xs4all.net>
"Any sufficiently advanced incompetence is indistinguishable from malice."
- Grey's Law
Thanks Jan.!! Will check and update you all the findings.
Cheers
On Sat, Sep 20, 2014 at 4:17 AM, Jan-Pieter Cornet <johnpc@xs4all.net>
wrote:
Show quoted text
On 2014-9-19 20:33 , Adarsh Sharma wrote:
It returns approx *311 MB* data to the client servers.
root > netstat -p | grep 45355
tcp 0 1531648 localhost:5499 localhost:48848ESTABLISHED 45355/postgres
root > strace -p 45355
-- Remain stuck for *2-3 *hours
read(306,"\30\0kB\3\0\0\0\0\0\377\5\0\0\26\0\0\0\2\0\0\0\30\0+\264\3\0\0\0\0\0"...,
8192) = 8192sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,
8192, 0, NULL, 0
Then after 2-3 hours it got completed automatically with below
stacktrace :
# strace -p 45355
Process 45355 attached - interrupt to quit
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,8192, 0, NULL, 0
) = -1 ECONNRESET (Connection reset by peer)
I'd guess that the receiving program crashes. It might not be able to
stomach the 311 MB that's being sent to it, maybe it's expanding, and
swapping, until it's killed by the OOM killer?Doesn't look like a postgres problem to me. postgres is stuck trying to
send data... try stracing the client to see what it does? (use "lsof -i
:48848" or whatever the port number of the remote is to find the pid).--
Jan-Pieter Cornet <johnpc@xs4all.net>
"Any sufficiently advanced incompetence is indistinguishable from malice."
- Grey's Law