several problems in pg_receivexlog
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.
ISTM there are still some other problems in pg_receivexlog, so I'll
read it deeply later.
Regards,
--
Fujii Masao
Attachments:
fix_pgreceivexlog_problems_v1.patchapplication/octet-stream; name=fix_pgreceivexlog_problems_v1.patchDownload
*** a/src/bin/pg_basebackup/receivelog.c
--- b/src/bin/pg_basebackup/receivelog.c
***************
*** 38,43 ****
--- 38,47 ----
#define STREAMING_HEADER_SIZE (1+sizeof(WalDataMessageHeader))
#define STREAMING_KEEPALIVE_SIZE (1+sizeof(PrimaryKeepaliveMessage))
+ /* Kernel FD for an open WAL file */
+ static int walfile = -1;
+
+
/*
* Open a new WAL file in the specified directory. Store the name
* (not including the full directory) in namebuf. Assumes there is
***************
*** 96,101 **** open_walfile(XLogRecPtr startpoint, uint32 timeline, char *basedir, char *namebu
--- 100,106 ----
{
fprintf(stderr, _("%s: could not pad WAL segment %s: %s\n"),
progname, fn, strerror(errno));
+ free(zerobuf);
close(f);
unlink(fn);
return -1;
***************
*** 120,126 **** open_walfile(XLogRecPtr startpoint, uint32 timeline, char *basedir, char *namebu
* completed writing the whole segment.
*/
static bool
! close_walfile(int walfile, char *basedir, char *walname, bool segment_complete)
{
off_t currpos = lseek(walfile, 0, SEEK_CUR);
--- 125,131 ----
* completed writing the whole segment.
*/
static bool
! close_walfile(char *basedir, char *walname, bool segment_complete)
{
off_t currpos = lseek(walfile, 0, SEEK_CUR);
***************
*** 142,149 **** close_walfile(int walfile, char *basedir, char *walname, bool segment_complete)
--- 147,156 ----
{
fprintf(stderr, _("%s: could not close file %s: %s\n"),
progname, walname, strerror(errno));
+ walfile = -1;
return false;
}
+ walfile = -1;
/*
* Rename the .partial file only if we've completed writing the whole
***************
*** 270,276 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
char current_walfile_name[MAXPGPATH];
PGresult *res;
char *copybuf = NULL;
- int walfile = -1;
int64 last_status = -1;
XLogRecPtr blockpos = InvalidXLogRecPtr;
--- 277,282 ----
***************
*** 315,320 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
--- 321,327 ----
{
fprintf(stderr, _("%s: could not start replication: %s\n"),
progname, PQresultErrorMessage(res));
+ PQclear(res);
return false;
}
PQclear(res);
***************
*** 341,349 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
*/
if (stream_stop && stream_stop(blockpos, timeline, false))
{
! if (walfile != -1)
/* Potential error message is written by close_walfile */
! return close_walfile(walfile, basedir, current_walfile_name, rename_partial);
return true;
}
--- 348,356 ----
*/
if (stream_stop && stream_stop(blockpos, timeline, false))
{
! if (walfile != -1 && !close_walfile(basedir, current_walfile_name, rename_partial))
/* Potential error message is written by close_walfile */
! goto error;
return true;
}
***************
*** 370,376 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: could not send feedback packet: %s"),
progname, PQerrorMessage(conn));
! return false;
}
last_status = now;
--- 377,383 ----
{
fprintf(stderr, _("%s: could not send feedback packet: %s"),
progname, PQerrorMessage(conn));
! goto error;
}
last_status = now;
***************
*** 421,434 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: select() failed: %s\n"),
progname, strerror(errno));
! return false;
}
/* Else there is actually data on the socket */
if (PQconsumeInput(conn) == 0)
{
fprintf(stderr, _("%s: could not receive data from WAL stream: %s\n"),
progname, PQerrorMessage(conn));
! return false;
}
continue;
}
--- 428,441 ----
{
fprintf(stderr, _("%s: select() failed: %s\n"),
progname, strerror(errno));
! goto error;
}
/* Else there is actually data on the socket */
if (PQconsumeInput(conn) == 0)
{
fprintf(stderr, _("%s: could not receive data from WAL stream: %s\n"),
progname, PQerrorMessage(conn));
! goto error;
}
continue;
}
***************
*** 439,445 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: could not read copy data: %s\n"),
progname, PQerrorMessage(conn));
! return false;
}
if (copybuf[0] == 'k')
{
--- 446,452 ----
{
fprintf(stderr, _("%s: could not read copy data: %s\n"),
progname, PQerrorMessage(conn));
! goto error;
}
if (copybuf[0] == 'k')
{
***************
*** 451,457 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: keepalive message is incorrect size: %d\n"),
progname, r);
! return false;
}
continue;
}
--- 458,464 ----
{
fprintf(stderr, _("%s: keepalive message is incorrect size: %d\n"),
progname, r);
! goto error;
}
continue;
}
***************
*** 459,471 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"),
progname, copybuf[0]);
! return false;
}
if (r < STREAMING_HEADER_SIZE + 1)
{
fprintf(stderr, _("%s: streaming header too small: %d\n"),
progname, r);
! return false;
}
/* Extract WAL location for this block */
--- 466,478 ----
{
fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"),
progname, copybuf[0]);
! goto error;
}
if (r < STREAMING_HEADER_SIZE + 1)
{
fprintf(stderr, _("%s: streaming header too small: %d\n"),
progname, r);
! goto error;
}
/* Extract WAL location for this block */
***************
*** 483,489 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: received xlog record for offset %u with no file open\n"),
progname, xlogoff);
! return false;
}
}
else
--- 490,496 ----
{
fprintf(stderr, _("%s: received xlog record for offset %u with no file open\n"),
progname, xlogoff);
! goto error;
}
}
else
***************
*** 494,500 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: got WAL data offset %08x, expected %08x\n"),
progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR));
! return false;
}
}
--- 501,507 ----
{
fprintf(stderr, _("%s: got WAL data offset %08x, expected %08x\n"),
progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR));
! goto error;
}
}
***************
*** 520,526 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
basedir, current_walfile_name);
if (walfile == -1)
/* Error logged by open_walfile */
! return false;
}
if (write(walfile,
--- 527,533 ----
basedir, current_walfile_name);
if (walfile == -1)
/* Error logged by open_walfile */
! goto error;
}
if (write(walfile,
***************
*** 532,538 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
bytes_to_write,
current_walfile_name,
strerror(errno));
! return false;
}
/* Write was successful, advance our position */
--- 539,545 ----
bytes_to_write,
current_walfile_name,
strerror(errno));
! goto error;
}
/* Write was successful, advance our position */
***************
*** 544,554 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
/* Did we reach the end of a WAL segment? */
if (blockpos % XLOG_SEG_SIZE == 0)
{
! if (!close_walfile(walfile, basedir, current_walfile_name, false))
/* Error message written in close_walfile() */
! return false;
- walfile = -1;
xlogoff = 0;
if (stream_stop != NULL)
--- 551,560 ----
/* Did we reach the end of a WAL segment? */
if (blockpos % XLOG_SEG_SIZE == 0)
{
! if (walfile != -1 && !close_walfile(basedir, current_walfile_name, false))
/* Error message written in close_walfile() */
! goto error;
xlogoff = 0;
if (stream_stop != NULL)
***************
*** 577,584 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
{
fprintf(stderr, _("%s: unexpected termination of replication stream: %s\n"),
progname, PQresultErrorMessage(res));
! return false;
}
PQclear(res);
return true;
}
--- 583,604 ----
{
fprintf(stderr, _("%s: unexpected termination of replication stream: %s\n"),
progname, PQresultErrorMessage(res));
! goto error;
}
PQclear(res);
+
+ if (copybuf != NULL)
+ PQfreemem(copybuf);
+ if (walfile != -1 && close(walfile) != 0)
+ fprintf(stderr, _("%s: could not close file %s: %s\n"),
+ progname, current_walfile_name, strerror(errno));
return true;
+
+ error:
+ if (copybuf != NULL)
+ PQfreemem(copybuf);
+ if (walfile != -1 && close(walfile) != 0)
+ fprintf(stderr, _("%s: could not close file %s: %s\n"),
+ progname, current_walfile_name, strerror(errno));
+ return false;
}
*** a/src/bin/pg_basebackup/streamutil.c
--- b/src/bin/pg_basebackup/streamutil.c
***************
*** 143,148 **** GetConnection(void)
--- 143,160 ----
tmpconn = PQconnectdbParams(keywords, values, true);
+ /*
+ * If there is too little memory even to allocate the PGconn object
+ * and PQconnectdbParams returns NULL, we call exit(1) directly.
+ */
+ if (!tmpconn)
+ {
+ fprintf(stderr, _("%s: could not connect to server\n"),
+ progname);
+ PQfinish(tmpconn);
+ exit(1);
+ }
+
if (PQstatus(tmpconn) == CONNECTION_BAD &&
PQconnectionNeedsPassword(tmpconn) &&
dbgetpassword != -1)
***************
*** 154,161 **** GetConnection(void)
if (PQstatus(tmpconn) != CONNECTION_OK)
{
! fprintf(stderr, _("%s: could not connect to server: %s"),
progname, PQerrorMessage(tmpconn));
return NULL;
}
--- 166,176 ----
if (PQstatus(tmpconn) != CONNECTION_OK)
{
! fprintf(stderr, _("%s: could not connect to server: %s\n"),
progname, PQerrorMessage(tmpconn));
+ PQfinish(tmpconn);
+ free(values);
+ free(keywords);
return NULL;
}
On Mon, Jul 9, 2012 at 8:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.
While I don't doubt that what you've found are real problems, would
you mind explaining exactly what they are, so we don't have to
reverse-engineer the patch to figure that out?
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Tue, Jul 10, 2012 at 6:27 AM, Magnus Hagander <magnus@hagander.net> wrote:
On Mon, Jul 9, 2012 at 8:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.While I don't doubt that what you've found are real problems, would
you mind explaining exactly what they are, so we don't have to
reverse-engineer the patch to figure that out?
Yep.
When an error happens after replication connection has been established,
pg_receivexlog doesn't close an open file descriptor and release an allocated
memory area. This was harmless before 16282ae688de2b320cf176e9be8a89e4dfc60698
because pg_receivexlog exits immediately when an error happens. But
currently in an error case, pg_receivexlog tries reconnecting to the server
infinitely, so file descriptors and memory would leak. I think this is problem
and should be fixed. The patch which I submitted yesterday changes
pg_receivexlog so that it closes the open file and frees the memory area
before reconnecting to the server.
Regards,
--
Fujii Masao
On Tue, Jul 10, 2012 at 3:23 AM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.ISTM there are still some other problems in pg_receivexlog, so I'll
read it deeply later.
While pg_basebackup background process is streaming WAL records,
if its replication connection is terminated (e.g., walsender in the server
is accidentally terminated by SIGTERM signal), pg_basebackup ends
up failing to include all required WAL files in the backup. The problem
is that, in this case, pg_basebackup doesn't emit any error message at all.
So an user might misunderstand that a base backup has been successfully
taken even though it doesn't include all required WAL files.
To fix this problem, I think that, when the replication connection is
terminated, ReceiveXlogStream() should check whether we've already
reached the stop point by calling stream_stop() before returning TRUE.
If we've not yet (this means that we've not received all required WAL
files yet), ReceiveXlogStream() should return FALSE and
pg_basebackup should emit an error message. Comments?
Regards,
--
Fujii Masao
On Tue, Jul 10, 2012 at 6:45 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Tue, Jul 10, 2012 at 6:27 AM, Magnus Hagander <magnus@hagander.net> wrote:
On Mon, Jul 9, 2012 at 8:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.While I don't doubt that what you've found are real problems, would
you mind explaining exactly what they are, so we don't have to
reverse-engineer the patch to figure that out?Yep.
When an error happens after replication connection has been established,
pg_receivexlog doesn't close an open file descriptor and release an allocated
memory area. This was harmless before 16282ae688de2b320cf176e9be8a89e4dfc60698
because pg_receivexlog exits immediately when an error happens. But
currently in an error case, pg_receivexlog tries reconnecting to the server
infinitely, so file descriptors and memory would leak. I think this is problem
and should be fixed. The patch which I submitted yesterday changes
pg_receivexlog so that it closes the open file and frees the memory area
before reconnecting to the server.
Thanks. I get it now, and this explains why I didn't see it before - I
didn't check properly after we added the loop mode. Patch applied with
minor changes (e.g. there's no point in doing PQfinish(tmpconn) right
after you've verified tmpconn is NULL)
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Tue, Jul 10, 2012 at 7:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Tue, Jul 10, 2012 at 3:23 AM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.ISTM there are still some other problems in pg_receivexlog, so I'll
read it deeply later.While pg_basebackup background process is streaming WAL records,
if its replication connection is terminated (e.g., walsender in the server
is accidentally terminated by SIGTERM signal), pg_basebackup ends
up failing to include all required WAL files in the backup. The problem
is that, in this case, pg_basebackup doesn't emit any error message at all.
So an user might misunderstand that a base backup has been successfully
taken even though it doesn't include all required WAL files.
Ouch. That is definitely a bug if it behaves that way.
To fix this problem, I think that, when the replication connection is
terminated, ReceiveXlogStream() should check whether we've already
reached the stop point by calling stream_stop() before returning TRUE.
If we've not yet (this means that we've not received all required WAL
files yet), ReceiveXlogStream() should return FALSE and
pg_basebackup should emit an error message. Comments?
Doesn't it already return false because it detects the error of the
connection? What's the codepath where we end up returning true even
though we had a connection failure? Shouldn't that end up under the
"could not read copy data" branch, which already returns false?
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Thu, Jul 12, 2012 at 8:39 PM, Magnus Hagander <magnus@hagander.net> wrote:
On Tue, Jul 10, 2012 at 7:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Tue, Jul 10, 2012 at 3:23 AM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.ISTM there are still some other problems in pg_receivexlog, so I'll
read it deeply later.While pg_basebackup background process is streaming WAL records,
if its replication connection is terminated (e.g., walsender in the server
is accidentally terminated by SIGTERM signal), pg_basebackup ends
up failing to include all required WAL files in the backup. The problem
is that, in this case, pg_basebackup doesn't emit any error message at all.
So an user might misunderstand that a base backup has been successfully
taken even though it doesn't include all required WAL files.Ouch. That is definitely a bug if it behaves that way.
To fix this problem, I think that, when the replication connection is
terminated, ReceiveXlogStream() should check whether we've already
reached the stop point by calling stream_stop() before returning TRUE.
If we've not yet (this means that we've not received all required WAL
files yet), ReceiveXlogStream() should return FALSE and
pg_basebackup should emit an error message. Comments?Doesn't it already return false because it detects the error of the
connection? What's the codepath where we end up returning true even
though we had a connection failure? Shouldn't that end up under the
"could not read copy data" branch, which already returns false?
You're right. If the error is detected, that function always returns false
and the error message is emitted (but I think that current error message
"pg_basebackup: child process exited with error 1" is confusing....),
so it's OK. But if walsender in the server is terminated by SIGTERM,
no error is detected and pg_basebackup background process gets out
of the loop in ReceiveXlogStream() and returns true.
Regards,
--
Fujii Masao
On Thu, Jul 12, 2012 at 6:07 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Thu, Jul 12, 2012 at 8:39 PM, Magnus Hagander <magnus@hagander.net> wrote:
On Tue, Jul 10, 2012 at 7:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Tue, Jul 10, 2012 at 3:23 AM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.ISTM there are still some other problems in pg_receivexlog, so I'll
read it deeply later.While pg_basebackup background process is streaming WAL records,
if its replication connection is terminated (e.g., walsender in the server
is accidentally terminated by SIGTERM signal), pg_basebackup ends
up failing to include all required WAL files in the backup. The problem
is that, in this case, pg_basebackup doesn't emit any error message at all.
So an user might misunderstand that a base backup has been successfully
taken even though it doesn't include all required WAL files.Ouch. That is definitely a bug if it behaves that way.
To fix this problem, I think that, when the replication connection is
terminated, ReceiveXlogStream() should check whether we've already
reached the stop point by calling stream_stop() before returning TRUE.
If we've not yet (this means that we've not received all required WAL
files yet), ReceiveXlogStream() should return FALSE and
pg_basebackup should emit an error message. Comments?Doesn't it already return false because it detects the error of the
connection? What's the codepath where we end up returning true even
though we had a connection failure? Shouldn't that end up under the
"could not read copy data" branch, which already returns false?You're right. If the error is detected, that function always returns false
and the error message is emitted (but I think that current error message
"pg_basebackup: child process exited with error 1" is confusing....),
so it's OK. But if walsender in the server is terminated by SIGTERM,
no error is detected and pg_basebackup background process gets out
of the loop in ReceiveXlogStream() and returns true.
Oh. Because the server does a graceful shutdown. D'uh, of course.
Then yes, your suggested fix seems like a good one.
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Fri, Jul 13, 2012 at 1:15 AM, Magnus Hagander <magnus@hagander.net> wrote:
On Thu, Jul 12, 2012 at 6:07 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Thu, Jul 12, 2012 at 8:39 PM, Magnus Hagander <magnus@hagander.net> wrote:
On Tue, Jul 10, 2012 at 7:03 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Tue, Jul 10, 2012 at 3:23 AM, Fujii Masao <masao.fujii@gmail.com> wrote:
Hi,
I found several problems in pg_receivexlog, e.g., memory leaks,
file-descripter leaks, ..etc. The attached patch fixes these problems.ISTM there are still some other problems in pg_receivexlog, so I'll
read it deeply later.While pg_basebackup background process is streaming WAL records,
if its replication connection is terminated (e.g., walsender in the server
is accidentally terminated by SIGTERM signal), pg_basebackup ends
up failing to include all required WAL files in the backup. The problem
is that, in this case, pg_basebackup doesn't emit any error message at all.
So an user might misunderstand that a base backup has been successfully
taken even though it doesn't include all required WAL files.Ouch. That is definitely a bug if it behaves that way.
To fix this problem, I think that, when the replication connection is
terminated, ReceiveXlogStream() should check whether we've already
reached the stop point by calling stream_stop() before returning TRUE.
If we've not yet (this means that we've not received all required WAL
files yet), ReceiveXlogStream() should return FALSE and
pg_basebackup should emit an error message. Comments?Doesn't it already return false because it detects the error of the
connection? What's the codepath where we end up returning true even
though we had a connection failure? Shouldn't that end up under the
"could not read copy data" branch, which already returns false?You're right. If the error is detected, that function always returns false
and the error message is emitted (but I think that current error message
"pg_basebackup: child process exited with error 1" is confusing....),
so it's OK. But if walsender in the server is terminated by SIGTERM,
no error is detected and pg_basebackup background process gets out
of the loop in ReceiveXlogStream() and returns true.Oh. Because the server does a graceful shutdown. D'uh, of course.
Then yes, your suggested fix seems like a good one.
Attached patch adds the fix.
Also I found I had forgotten to set the file descriptor to -1 at the end of
ReceiveXlogStream(), in previously-committed my patch. Attached patch
fixes this problem.
Regards,
--
Fujii Masao
Attachments:
pgreceivexlog_check_stoppoint_v1.patchapplication/octet-stream; name=pgreceivexlog_check_stoppoint_v1.patchDownload
*** a/src/bin/pg_basebackup/receivelog.c
--- b/src/bin/pg_basebackup/receivelog.c
***************
*** 587,597 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
--- 587,606 ----
}
PQclear(res);
+ /* Complain if we've not reached stop point yet */
+ if (stream_stop != NULL && !stream_stop(blockpos, timeline, false))
+ {
+ fprintf(stderr, _("%s: replication stream was terminated before stop point\n"),
+ progname);
+ goto error;
+ }
+
if (copybuf != NULL)
PQfreemem(copybuf);
if (walfile != -1 && close(walfile) != 0)
fprintf(stderr, _("%s: could not close file %s: %s\n"),
progname, current_walfile_name, strerror(errno));
+ walfile = -1;
return true;
error:
***************
*** 600,604 **** error:
--- 609,614 ----
if (walfile != -1 && close(walfile) != 0)
fprintf(stderr, _("%s: could not close file %s: %s\n"),
progname, current_walfile_name, strerror(errno));
+ walfile = -1;
return false;
}
Excerpts from Magnus Hagander's message of jue jul 12 07:35:11 -0400 2012:
On Tue, Jul 10, 2012 at 6:45 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
When an error happens after replication connection has been established,
pg_receivexlog doesn't close an open file descriptor and release an allocated
memory area. This was harmless before 16282ae688de2b320cf176e9be8a89e4dfc60698
because pg_receivexlog exits immediately when an error happens. But
currently in an error case, pg_receivexlog tries reconnecting to the server
infinitely, so file descriptors and memory would leak. I think this is problem
and should be fixed. The patch which I submitted yesterday changes
pg_receivexlog so that it closes the open file and frees the memory area
before reconnecting to the server.Thanks. I get it now, and this explains why I didn't see it before - I
didn't check properly after we added the loop mode. Patch applied with
minor changes (e.g. there's no point in doing PQfinish(tmpconn) right
after you've verified tmpconn is NULL)
For some reason, Magnus neglected to backpatch this to 9.2, so I just
did.
--
Álvaro Herrera <alvherre@commandprompt.com>
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support
Excerpts from Fujii Masao's message of mar jul 17 13:58:38 -0400 2012:
You're right. If the error is detected, that function always returns false
and the error message is emitted (but I think that current error message
"pg_basebackup: child process exited with error 1" is confusing....),
so it's OK. But if walsender in the server is terminated by SIGTERM,
no error is detected and pg_basebackup background process gets out
of the loop in ReceiveXlogStream() and returns true.Oh. Because the server does a graceful shutdown. D'uh, of course.
Then yes, your suggested fix seems like a good one.
Attached patch adds the fix.
Also I found I had forgotten to set the file descriptor to -1 at the end of
ReceiveXlogStream(), in previously-committed my patch. Attached patch
fixes this problem.
This hasn't been committed yet AFAICT, and it probably needs a refresh
now after my changes to pg_basebackup. Please update the patch. Also,
if this is not in the Open Items list, please put it there so that we
don't forget it before the 9.2 release.
--
Álvaro Herrera <alvherre@commandprompt.com>
The PostgreSQL Company - Command Prompt, Inc.
PostgreSQL Replication, Consulting, Custom Development, 24x7 support
On Wed, Aug 1, 2012 at 12:09 AM, Alvaro Herrera
<alvherre@commandprompt.com> wrote:
Excerpts from Fujii Masao's message of mar jul 17 13:58:38 -0400 2012:
You're right. If the error is detected, that function always returns false
and the error message is emitted (but I think that current error message
"pg_basebackup: child process exited with error 1" is confusing....),
so it's OK. But if walsender in the server is terminated by SIGTERM,
no error is detected and pg_basebackup background process gets out
of the loop in ReceiveXlogStream() and returns true.Oh. Because the server does a graceful shutdown. D'uh, of course.
Then yes, your suggested fix seems like a good one.
Attached patch adds the fix.
Also I found I had forgotten to set the file descriptor to -1 at the end of
ReceiveXlogStream(), in previously-committed my patch. Attached patch
fixes this problem.This hasn't been committed yet AFAICT, and it probably needs a refresh
now after my changes to pg_basebackup. Please update the patch.
I attached the updated version.
Also,
if this is not in the Open Items list, please put it there so that we
don't forget it before the 9.2 release.
Yep, done.
Regards,
--
Fujii Masao
Attachments:
pgreceivexlog_check_stoppoint_v2.patchapplication/octet-stream; name=pgreceivexlog_check_stoppoint_v2.patchDownload
*** a/src/bin/pg_basebackup/receivelog.c
--- b/src/bin/pg_basebackup/receivelog.c
***************
*** 611,621 **** ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline,
--- 611,630 ----
}
PQclear(res);
+ /* Complain if we've not reached stop point yet */
+ if (stream_stop != NULL && !stream_stop(blockpos, timeline, false))
+ {
+ fprintf(stderr, _("%s: replication stream was terminated before stop point\n"),
+ progname);
+ goto error;
+ }
+
if (copybuf != NULL)
PQfreemem(copybuf);
if (walfile != -1 && close(walfile) != 0)
fprintf(stderr, _("%s: could not close file %s: %s\n"),
progname, current_walfile_name, strerror(errno));
+ walfile = -1;
return true;
error:
***************
*** 624,628 **** error:
--- 633,638 ----
if (walfile != -1 && close(walfile) != 0)
fprintf(stderr, _("%s: could not close file %s: %s\n"),
progname, current_walfile_name, strerror(errno));
+ walfile = -1;
return false;
}
On Tue, Jul 31, 2012 at 5:06 PM, Alvaro Herrera
<alvherre@commandprompt.com> wrote:
Excerpts from Magnus Hagander's message of jue jul 12 07:35:11 -0400 2012:
On Tue, Jul 10, 2012 at 6:45 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
When an error happens after replication connection has been established,
pg_receivexlog doesn't close an open file descriptor and release an allocated
memory area. This was harmless before 16282ae688de2b320cf176e9be8a89e4dfc60698
because pg_receivexlog exits immediately when an error happens. But
currently in an error case, pg_receivexlog tries reconnecting to the server
infinitely, so file descriptors and memory would leak. I think this is problem
and should be fixed. The patch which I submitted yesterday changes
pg_receivexlog so that it closes the open file and frees the memory area
before reconnecting to the server.Thanks. I get it now, and this explains why I didn't see it before - I
didn't check properly after we added the loop mode. Patch applied with
minor changes (e.g. there's no point in doing PQfinish(tmpconn) right
after you've verified tmpconn is NULL)For some reason, Magnus neglected to backpatch this to 9.2, so I just
did.
Thanks. I believe that was just an oversight.
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Tue, Jul 31, 2012 at 6:50 PM, Fujii Masao <masao.fujii@gmail.com> wrote:
On Wed, Aug 1, 2012 at 12:09 AM, Alvaro Herrera
<alvherre@commandprompt.com> wrote:Excerpts from Fujii Masao's message of mar jul 17 13:58:38 -0400 2012:
You're right. If the error is detected, that function always returns false
and the error message is emitted (but I think that current error message
"pg_basebackup: child process exited with error 1" is confusing....),
so it's OK. But if walsender in the server is terminated by SIGTERM,
no error is detected and pg_basebackup background process gets out
of the loop in ReceiveXlogStream() and returns true.Oh. Because the server does a graceful shutdown. D'uh, of course.
Then yes, your suggested fix seems like a good one.
Attached patch adds the fix.
Also I found I had forgotten to set the file descriptor to -1 at the end of
ReceiveXlogStream(), in previously-committed my patch. Attached patch
fixes this problem.This hasn't been committed yet AFAICT, and it probably needs a refresh
now after my changes to pg_basebackup. Please update the patch.I attached the updated version.
Thanks, applied.
Also,
if this is not in the Open Items list, please put it there so that we
don't forget it before the 9.2 release.Yep, done.
And I'll go take it off :-)
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/