mdclose() does not cope w/ FileClose() failure
Forking thread "WAL logging problem in 9.4.3?" for this tangent:
On Mon, Dec 09, 2019 at 06:04:06PM +0900, Kyotaro Horiguchi wrote:
I don't understand why mdclose checks for (v->mdfd_vfd >= 0) of open
segment but anyway mdimmedsync is believing that that won't happen and
I follow the assumption. (I suspect that the if condition in mdclose
should be an assertion..)
That check helps when data_sync_retry=on and FileClose() raised an error in a
previous mdclose() invocation. However, the check is not sufficient to make
that case work; the attached test case (not for commit) gets an assertion
failure or SIGSEGV.
I am inclined to fix this by decrementing md_num_open_segs before modifying
md_seg_fds (second attachment). An alternative would be to call
_fdvec_resize() after every FileClose(), like mdtruncate() does; however, the
repalloc() overhead could be noticeable. (mdclose() is called much more
frequently than mdtruncate().)
Incidentally, _mdfd_openseg() has this:
if (segno <= reln->md_num_open_segs[forknum])
_fdvec_resize(reln, forknum, segno + 1);
That should be >=, not <=. If the less-than case happened, this would delete
the record of a vfd for a higher-numbered segno. There's no live bug, because
only segno == reln->md_num_open_segs[forknum] actually happens. I am inclined
to make an assertion of that and remove the condition:
Assert(segno == reln->md_num_open_segs[forknum]);
_fdvec_resize(reln, forknum, segno + 1);
Attachments:
FileClose-test-v0.patchtext/plain; charset=us-asciiDownload
diff --git a/configure b/configure
index 9de5037..a1f09e8 100755
--- a/configure
+++ b/configure
@@ -3725,7 +3725,7 @@ $as_echo "${segsize}GB" >&6; }
cat >>confdefs.h <<_ACEOF
-#define RELSEG_SIZE ${RELSEG_SIZE}
+#define RELSEG_SIZE 2
_ACEOF
diff --git a/configure.in b/configure.in
index 9c5e5e7..86f3604 100644
--- a/configure.in
+++ b/configure.in
@@ -289,7 +289,7 @@ RELSEG_SIZE=`expr '(' 1024 / ${blocksize} ')' '*' ${segsize} '*' 1024`
test $? -eq 0 || exit 1
AC_MSG_RESULT([${segsize}GB])
-AC_DEFINE_UNQUOTED([RELSEG_SIZE], ${RELSEG_SIZE}, [
+AC_DEFINE_UNQUOTED([RELSEG_SIZE], 2, [
RELSEG_SIZE is the maximum number of blocks allowed in one disk file.
Thus, the maximum size of a single file is RELSEG_SIZE * BLCKSZ;
relations bigger than that are divided into multiple files.
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index f4ab19f..350ce7f 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1741,6 +1741,8 @@ PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
return true;
}
+char *fail_to_close_file;
+
/*
* close a file when done with it
*/
@@ -1751,8 +1753,11 @@ FileClose(File file)
Assert(FileIsValid(file));
- DO_DB(elog(LOG, "FileClose: %d (%s)",
- file, VfdCache[file].fileName));
+ elog(LOG, "FileClose: %d (%s)",
+ file, VfdCache[file].fileName);
+
+ if (strcmp(VfdCache[file].fileName, fail_to_close_file) == 0)
+ elog(ERROR, "failing to close %s, as requested", fail_to_close_file);
vfdP = &VfdCache[file];
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8d951ce..c9f5f53 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3504,6 +3504,16 @@ static struct config_real ConfigureNamesReal[] =
static struct config_string ConfigureNamesString[] =
{
{
+ {"fail_to_close_file", PGC_SUSET, DEVELOPER_OPTIONS,
+ gettext_noop("Raise an error when attempt to close this file."),
+ gettext_noop("This is a temporary GUC to demonstrate a bug."),
+ GUC_NOT_IN_SAMPLE
+ },
+ &fail_to_close_file,
+ "",
+ NULL, NULL, NULL
+ },
+ {
{"archive_command", PGC_SIGHUP, WAL_ARCHIVING,
gettext_noop("Sets the shell command that will be called to archive a WAL file."),
NULL
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 625fbc3..8c21d76 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -48,6 +48,7 @@ typedef int File;
/* GUC parameter */
extern PGDLLIMPORT int max_files_per_process;
extern PGDLLIMPORT bool data_sync_retry;
+extern PGDLLIMPORT char *fail_to_close_file;
/*
* This is private to fd.c, but exported for save/restore_backend_variables()
diff --git a/src/test/regress/sql/boolean.sql b/src/test/regress/sql/boolean.sql
index df61fa4..50a4550 100644
--- a/src/test/regress/sql/boolean.sql
+++ b/src/test/regress/sql/boolean.sql
@@ -2,6 +2,22 @@
-- BOOLEAN
--
+-- Row count must be large enough to create more than one segment. This value
+-- is enough at RELSEG_SIZE=2. That is unrealistic for production use, but it
+-- makes the test finish quickly, with low disk consumption.
+create table fileclose (c) as select * from generate_series(1, 2000);
+
+-- This closes segno==1, then fails the attempt to close segno==0. This
+-- leaves reln->md_num_open_segs[MAIN_FORKNUM]==2 despite
+-- reln->md_seg_fds[MAIN_FORKNUM][1]==-1; md.c can't cope with that.
+begin;
+select set_config('fail_to_close_file', pg_relation_filepath('fileclose'), true);
+analyze fileclose;
+commit;
+
+analyze fileclose; -- TRAP: FailedAssertion("FileIsValid(file)", File: "fd.c", Line: 2039)
+drop table fileclose;
+
--
-- sanity check - if this fails go insane!
--
mdclose-FileClose-v1.patchtext/plain; charset=us-asciiDownload
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 82442db..02983cd 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -516,12 +516,9 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
{
MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
- /* if not closed already */
- if (v->mdfd_vfd >= 0)
- {
- FileClose(v->mdfd_vfd);
- v->mdfd_vfd = -1;
- }
+ FileClose(v->mdfd_vfd);
+ reln->md_num_open_segs[forknum] = nopensegs - 1;
+ v->mdfd_vfd = -1;
nopensegs--;
}
On Sun, Dec 22, 2019 at 01:19:30AM -0800, Noah Misch wrote:
I am inclined to fix this by decrementing md_num_open_segs before modifying
md_seg_fds (second attachment).
That leaked memory, since _fdvec_resize() assumes md_num_open_segs is also the
allocated array length. The alternative is looking better:
An alternative would be to call
_fdvec_resize() after every FileClose(), like mdtruncate() does; however, the
repalloc() overhead could be noticeable. (mdclose() is called much more
frequently than mdtruncate().)
I can skip repalloc() when the array length decreases, to assuage mdclose()'s
worry. In the mdclose() case, the final _fdvec_resize(reln, fork, 0) will
still pfree() the array. Array elements that mdtruncate() frees today will
instead persist to end of transaction. That is okay, since mdtruncate()
crossing more than one segment boundary is fairly infrequent. For it to
happen, you must either create a >2G relation and then TRUNCATE it in the same
transaction, or VACUUM must find >1-2G of unused space at the end of the
relation. I'm now inclined to do it that way, attached.
Attachments:
mdclose-FileClose-v2.patchtext/plain; charset=us-asciiDownload
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 82442db..5dae6d4 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -516,18 +516,10 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
{
MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
- /* if not closed already */
- if (v->mdfd_vfd >= 0)
- {
- FileClose(v->mdfd_vfd);
- v->mdfd_vfd = -1;
- }
-
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, nopensegs - 1);
nopensegs--;
}
-
- /* resize just once, avoids pointless reallocations */
- _fdvec_resize(reln, forknum, 0);
}
/*
@@ -1047,13 +1039,15 @@ _fdvec_resize(SMgrRelation reln,
reln->md_seg_fds[forknum] =
MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
}
- else
+ else if (nseg > reln->md_num_open_segs[forknum])
{
/*
* It doesn't seem worthwhile complicating the code by having a more
* aggressive growth strategy here; the number of segments doesn't
* grow that fast, and the memory context internally will sometimes
- * avoid doing an actual reallocation.
+ * avoid doing an actual reallocation. Likewise, since the number of
+ * segments doesn't shrink that fast, don't shrink at all. During
+ * mdclose(), we'll pfree the array at nseg==0.
*/
reln->md_seg_fds[forknum] =
repalloc(reln->md_seg_fds[forknum],
On Sun, Dec 22, 2019 at 10:19 PM Noah Misch <noah@leadboat.com> wrote:
Assert(segno == reln->md_num_open_segs[forknum]);
_fdvec_resize(reln, forknum, segno + 1);
Oh yeah, I spotted that part too but didn't follow up.
/messages/by-id/CA+hUKG+NBw+uSzxF1os-SO6gUuw=cqO5DAybk6KnHKzgGvxhxA@mail.gmail.com
On Mon, Dec 23, 2019 at 09:33:29AM +1300, Thomas Munro wrote:
On Sun, Dec 22, 2019 at 10:19 PM Noah Misch <noah@leadboat.com> wrote:
Assert(segno == reln->md_num_open_segs[forknum]);
_fdvec_resize(reln, forknum, segno + 1);Oh yeah, I spotted that part too but didn't follow up.
/messages/by-id/CA+hUKG+NBw+uSzxF1os-SO6gUuw=cqO5DAybk6KnHKzgGvxhxA@mail.gmail.com
That patch of yours looks good.
Hello.
At Sun, 22 Dec 2019 12:21:00 -0800, Noah Misch <noah@leadboat.com> wrote in
On Sun, Dec 22, 2019 at 01:19:30AM -0800, Noah Misch wrote:
I am inclined to fix this by decrementing md_num_open_segs before modifying
md_seg_fds (second attachment).That leaked memory, since _fdvec_resize() assumes md_num_open_segs is also the
allocated array length. The alternative is looking better:
I agree that v2 is cleaner in the light of modularity and fixes the
memory leak happens at re-open.
An alternative would be to call
_fdvec_resize() after every FileClose(), like mdtruncate() does; however, the
repalloc() overhead could be noticeable. (mdclose() is called much more
frequently than mdtruncate().)I can skip repalloc() when the array length decreases, to assuage mdclose()'s
worry. In the mdclose() case, the final _fdvec_resize(reln, fork, 0) will
still pfree() the array. Array elements that mdtruncate() frees today will
instead persist to end of transaction. That is okay, since mdtruncate()
crossing more than one segment boundary is fairly infrequent. For it to
happen, you must either create a >2G relation and then TRUNCATE it in the same
transaction, or VACUUM must find >1-2G of unused space at the end of the
relation. I'm now inclined to do it that way, attached.
* It doesn't seem worthwhile complicating the code by having a more
* aggressive growth strategy here; the number of segments doesn't
* grow that fast, and the memory context internally will sometimes
- * avoid doing an actual reallocation.
+ * avoid doing an actual reallocation. Likewise, since the number of
+ * segments doesn't shrink that fast, don't shrink at all. During
+ * mdclose(), we'll pfree the array at nseg==0.
If I understand it correctly, it is mentioning the number of the all
segment files in a fork, not the length of md_seg_fds arrays at a
certain moment. But actually _fdvec_resize is called for every segment
opening during mdnblocks (just-after mdopen), and every segment
closing during mdclose and mdtruncate as mentioned here. We are going
to omit pallocs only in the decreasing case.
If we regard repalloc as far faster than FileOpen/FileClose or we care
about only increase of segment number of mdopen'ed files and don't
care the frequent resize that happens during the functions above, then
the comment is right and we may resize the array in the
segment-by-segment manner.
But if they are comparable each other, or we don't want the array gets
resized frequently, we might need to prevent repalloc from happening
on every segment increase, too.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Mon, Dec 23, 2019 at 07:41:49PM +0900, Kyotaro Horiguchi wrote:
At Sun, 22 Dec 2019 12:21:00 -0800, Noah Misch <noah@leadboat.com> wrote in
On Sun, Dec 22, 2019 at 01:19:30AM -0800, Noah Misch wrote:
An alternative would be to call
_fdvec_resize() after every FileClose(), like mdtruncate() does; however, the
repalloc() overhead could be noticeable. (mdclose() is called much more
frequently than mdtruncate().)I can skip repalloc() when the array length decreases, to assuage mdclose()'s
worry. In the mdclose() case, the final _fdvec_resize(reln, fork, 0) will
still pfree() the array. Array elements that mdtruncate() frees today will
instead persist to end of transaction. That is okay, since mdtruncate()
crossing more than one segment boundary is fairly infrequent. For it to
happen, you must either create a >2G relation and then TRUNCATE it in the same
transaction, or VACUUM must find >1-2G of unused space at the end of the
relation. I'm now inclined to do it that way, attached.* It doesn't seem worthwhile complicating the code by having a more * aggressive growth strategy here; the number of segments doesn't * grow that fast, and the memory context internally will sometimes - * avoid doing an actual reallocation. + * avoid doing an actual reallocation. Likewise, since the number of + * segments doesn't shrink that fast, don't shrink at all. During + * mdclose(), we'll pfree the array at nseg==0.If I understand it correctly, it is mentioning the number of the all
segment files in a fork, not the length of md_seg_fds arrays at a
certain moment. But actually _fdvec_resize is called for every segment
opening during mdnblocks (just-after mdopen), and every segment
closing during mdclose and mdtruncate as mentioned here. We are going
to omit pallocs only in the decreasing case.
That is a good point. How frequently one adds 1 GiB of data is not the main
issue. mdclose() and subsequent re-opening of all segments will be more
relevant to overall performance.
If we regard repalloc as far faster than FileOpen/FileClose or we care
about only increase of segment number of mdopen'ed files and don't
care the frequent resize that happens during the functions above, then
the comment is right and we may resize the array in the
segment-by-segment manner.
In most cases, the array will fit into a power-of-two chunk, so repalloc()
already does the right thing. Once the table has more than ~1000 segments (~1
TiB table size), the allocation will get a single-chunk block, and every
subsequent repalloc() will call realloc(). Even then, repalloc() probably is
far faster than File operations. Likely, I should just accept the extra
repalloc() calls and drop the "else if" change in _fdvec_resize().
At Tue, 24 Dec 2019 11:57:39 -0800, Noah Misch <noah@leadboat.com> wrote in
On Mon, Dec 23, 2019 at 07:41:49PM +0900, Kyotaro Horiguchi wrote:
If I understand it correctly, it is mentioning the number of the all
segment files in a fork, not the length of md_seg_fds arrays at a
certain moment. But actually _fdvec_resize is called for every segment
opening during mdnblocks (just-after mdopen), and every segment
closing during mdclose and mdtruncate as mentioned here. We are going
to omit pallocs only in the decreasing case.That is a good point. How frequently one adds 1 GiB of data is not the main
issue. mdclose() and subsequent re-opening of all segments will be more
relevant to overall performance.
Yes, that's exactly what I meant.
If we regard repalloc as far faster than FileOpen/FileClose or we care
about only increase of segment number of mdopen'ed files and don't
care the frequent resize that happens during the functions above, then
the comment is right and we may resize the array in the
segment-by-segment manner.In most cases, the array will fit into a power-of-two chunk, so repalloc()
already does the right thing. Once the table has more than ~1000 segments (~1
TiB table size), the allocation will get a single-chunk block, and every
subsequent repalloc() will call realloc(). Even then, repalloc() probably is
far faster than File operations. Likely, I should just accept the extra
repalloc() calls and drop the "else if" change in _fdvec_resize().
I'm not sure which is better. If we say we know that
repalloc(AllocSetRealloc) doesn't free memory at all, there's no point
in calling repalloc for shrinking and we could omit that under the
name of optimization. If we say we want to free memory as much as
possible, we should call repalloc pretending to believe that that
happens.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On Wed, Dec 25, 2019 at 10:39:32AM +0900, Kyotaro Horiguchi wrote:
At Tue, 24 Dec 2019 11:57:39 -0800, Noah Misch <noah@leadboat.com> wrote in
On Mon, Dec 23, 2019 at 07:41:49PM +0900, Kyotaro Horiguchi wrote:
If we regard repalloc as far faster than FileOpen/FileClose or we care
about only increase of segment number of mdopen'ed files and don't
care the frequent resize that happens during the functions above, then
the comment is right and we may resize the array in the
segment-by-segment manner.In most cases, the array will fit into a power-of-two chunk, so repalloc()
already does the right thing. Once the table has more than ~1000 segments (~1
TiB table size), the allocation will get a single-chunk block, and every
subsequent repalloc() will call realloc(). Even then, repalloc() probably is
far faster than File operations. Likely, I should just accept the extra
repalloc() calls and drop the "else if" change in _fdvec_resize().I'm not sure which is better. If we say we know that
repalloc(AllocSetRealloc) doesn't free memory at all, there's no point
in calling repalloc for shrinking and we could omit that under the
name of optimization. If we say we want to free memory as much as
possible, we should call repalloc pretending to believe that that
happens.
As long as we free the memory by the end of mdclose(), I think it doesn't
matter whether we freed memory in the middle of mdclose().
I ran a crude benchmark that found PathNameOpenFile()+FileClose() costing at
least two hundred times as much as the repalloc() pair. Hence, I now plan not
to avoid repalloc(), as attached. Crude benchmark code:
#define NSEG 9000
for (i = 0; i < count1; i++)
{
int j;
for (j = 0; j < NSEG; ++j)
{
File f = PathNameOpenFile("/etc/services", O_RDONLY);
if (f < 0)
elog(ERROR, "fail open: %m");
FileClose(f);
}
}
for (i = 0; i < count2; i++)
{
int j;
void *buf = palloc(1);
for (j = 2; j < NSEG; ++j)
buf = repalloc(buf, j * 8);
while (--j > 0)
buf = repalloc(buf, j * 8);
}
Attachments:
mdclose-FileClose-v3.patchtext/plain; charset=us-asciiDownload
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 82442db..e768f2c 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -516,18 +516,10 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
{
MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
- /* if not closed already */
- if (v->mdfd_vfd >= 0)
- {
- FileClose(v->mdfd_vfd);
- v->mdfd_vfd = -1;
- }
-
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, nopensegs - 1);
nopensegs--;
}
-
- /* resize just once, avoids pointless reallocations */
- _fdvec_resize(reln, forknum, 0);
}
/*
@@ -1050,10 +1042,10 @@ _fdvec_resize(SMgrRelation reln,
else
{
/*
- * It doesn't seem worthwhile complicating the code by having a more
- * aggressive growth strategy here; the number of segments doesn't
- * grow that fast, and the memory context internally will sometimes
- * avoid doing an actual reallocation.
+ * It doesn't seem worthwhile complicating the code to amortize
+ * repalloc() calls. Those are far faster than PathNameOpenFile() or
+ * FileClose(), and the memory context internally will sometimes avoid
+ * doing an actual reallocation.
*/
reln->md_seg_fds[forknum] =
repalloc(reln->md_seg_fds[forknum],
At Wed, 1 Jan 2020 23:46:02 -0800, Noah Misch <noah@leadboat.com> wrote in
On Wed, Dec 25, 2019 at 10:39:32AM +0900, Kyotaro Horiguchi wrote:
I'm not sure which is better. If we say we know that
repalloc(AllocSetRealloc) doesn't free memory at all, there's no point
in calling repalloc for shrinking and we could omit that under the
name of optimization. If we say we want to free memory as much as
possible, we should call repalloc pretending to believe that that
happens.As long as we free the memory by the end of mdclose(), I think it doesn't
matter whether we freed memory in the middle of mdclose().
Agreed.
I ran a crude benchmark that found PathNameOpenFile()+FileClose() costing at
least two hundred times as much as the repalloc() pair. Hence, I now plan not
to avoid repalloc(), as attached. Crude benchmark code:
I got about 25 times difference with -O0 and about 50 times with -O2.
(xfs / CentOS8) It's smaller than I intuitively expected but perhaps
50 times difference is large enough.
The patch looks good to me.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center