Bug in batch tuplesort memory CLUSTER case (9.6 only)

Started by Peter Geogheganover 9 years ago26 messages
#1Peter Geoghegan
pg@heroku.com
1 attachment(s)

In general, moving tuplesort.c batch memory caller tuples around
happens when batch memory needs to be recycled, or freed outright with
pfree().

I failed to take into account that CLUSTER tuplesorts need an extra
step when moving caller tuples to a new location (i.e. when moving
HeapTuple caller tuples using memmove()), because their particular
variety of caller tuple happens to itself contain a pointer to
palloc()'d memory. Attached patch fixes this use-after-free bug.

--
Peter Geoghegan

Attachments:

0001-Fix-bug-in-batch-tuplesort-memory-with-CLUSTER.patchtext/x-patch; charset=US-ASCII; name=0001-Fix-bug-in-batch-tuplesort-memory-with-CLUSTER.patchDownload
From aa68ed9e3f952bc19aa3bc8e31d0e216e13e0fae Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 26 Jun 2016 16:25:07 -0700
Subject: [PATCH] Fix bug in batch tuplesort memory with CLUSTER

Commit 0011c0091 optimized tuplesort.c's use of memory during external
sort final on-the-fly merge steps.  This included handling exhaustion of
large batch memory allocations by moving existing caller tuples to a
new, usable location using memmove().  This failed to take into account
one particular special case, though:  CLUSTER tuplesorts.

The CLUSTER case involves caller tuples that themselves contain a
pointer to an offset into the selfsame caller tuple.  A use-after-free
could therefore happen when the CLUSTER tuplesort client dereferenced
this contained pointer.

Instead of calling memmove() directly, affected codepaths now call a
generic MOVETUP() interface routine, reaching a caller-tuple specific
implementation.  The CLUSTER implementation performs a memmove() and
handles this extra detail.  In all other cases, a memmove() shim
implementation is reached.
---
 src/backend/utils/sort/tuplesort.c | 56 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 7878660..4c502bb 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -300,11 +300,21 @@ struct Tuplesortstate
 	 * the already-read length of the stored tuple.  Create a palloc'd copy,
 	 * initialize tuple/datum1/isnull1 in the target SortTuple struct, and
 	 * decrease state->availMem by the amount of memory space consumed.
+	 * (See batchUsed notes for details on how memory is handled when
+	 * incremental accounting is abandoned.)
 	 */
 	void		(*readtup) (Tuplesortstate *state, SortTuple *stup,
 										int tapenum, unsigned int len);
 
 	/*
+	 * Function to move a caller tuple.  This is usually implemented as a
+	 * memmove() shim, but function may also perform additional fix-up of
+	 * caller tuple where needed.  Batch memory support requires the
+	 * movement of caller tuples from one location in memory to another.
+	 */
+	void		(*movetup) (void *dest, void *src, unsigned int len);
+
+	/*
 	 * This array holds the tuples now in sort memory.  If we are in state
 	 * INITIAL, the tuples are in no particular order; if we are in state
 	 * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
@@ -475,6 +485,7 @@ struct Tuplesortstate
 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
 #define WRITETUP(state,tape,stup)	((*(state)->writetup) (state, tape, stup))
 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
+#define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len))
 #define LACKMEM(state)		((state)->availMem < 0 && !(state)->batchUsed)
 #define USEMEM(state,amt)	((state)->availMem -= (amt))
 #define FREEMEM(state,amt)	((state)->availMem += (amt))
@@ -542,7 +553,7 @@ static void inittapes(Tuplesortstate *state);
 static void selectnewtape(Tuplesortstate *state);
 static void mergeruns(Tuplesortstate *state);
 static void mergeonerun(Tuplesortstate *state);
-static void beginmerge(Tuplesortstate *state, bool finalMerge);
+static void beginmerge(Tuplesortstate *state, bool finalMergeBatch);
 static void batchmemtuples(Tuplesortstate *state);
 static void mergebatch(Tuplesortstate *state, int64 spacePerTape);
 static void mergebatchone(Tuplesortstate *state, int srcTape,
@@ -571,6 +582,7 @@ static void writetup_heap(Tuplesortstate *state, int tapenum,
 			  SortTuple *stup);
 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
 			 int tapenum, unsigned int len);
+static void movetup_heap(void *dest, void *src, unsigned int len);
 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
 				   Tuplesortstate *state);
 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -578,6 +590,7 @@ static void writetup_cluster(Tuplesortstate *state, int tapenum,
 				 SortTuple *stup);
 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
 				int tapenum, unsigned int len);
+static void movetup_cluster(void *dest, void *src, unsigned int len);
 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 					   Tuplesortstate *state);
 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
@@ -587,6 +600,7 @@ static void writetup_index(Tuplesortstate *state, int tapenum,
 			   SortTuple *stup);
 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
 			  int tapenum, unsigned int len);
+static void movetup_index(void *dest, void *src, unsigned int len);
 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
 				 Tuplesortstate *state);
 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -594,6 +608,7 @@ static void writetup_datum(Tuplesortstate *state, int tapenum,
 			   SortTuple *stup);
 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
 			  int tapenum, unsigned int len);
+static void movetup_datum(void *dest, void *src, unsigned int len);
 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
 
 /*
@@ -749,6 +764,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 	state->copytup = copytup_heap;
 	state->writetup = writetup_heap;
 	state->readtup = readtup_heap;
+	state->movetup = movetup_heap;
 
 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
 	state->abbrevNext = 10;
@@ -821,6 +837,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
 	state->copytup = copytup_cluster;
 	state->writetup = writetup_cluster;
 	state->readtup = readtup_cluster;
+	state->movetup = movetup_cluster;
 	state->abbrevNext = 10;
 
 	state->indexInfo = BuildIndexInfo(indexRel);
@@ -912,6 +929,7 @@ tuplesort_begin_index_btree(Relation heapRel,
 	state->copytup = copytup_index;
 	state->writetup = writetup_index;
 	state->readtup = readtup_index;
+	state->movetup = movetup_index;
 	state->abbrevNext = 10;
 
 	state->heapRel = heapRel;
@@ -979,6 +997,7 @@ tuplesort_begin_index_hash(Relation heapRel,
 	state->copytup = copytup_index;
 	state->writetup = writetup_index;
 	state->readtup = readtup_index;
+	state->movetup = movetup_index;
 
 	state->heapRel = heapRel;
 	state->indexRel = indexRel;
@@ -1021,6 +1040,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
 	state->copytup = copytup_datum;
 	state->writetup = writetup_datum;
 	state->readtup = readtup_datum;
+	state->movetup = movetup_datum;
 	state->abbrevNext = 10;
 
 	state->datumType = datumType;
@@ -2975,7 +2995,7 @@ mergebatchone(Tuplesortstate *state, int srcTape, SortTuple *rtup,
 		 */
 		tupLen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
 		state->mergecurrent[srcTape] = state->mergetuples[srcTape];
-		memmove(state->mergecurrent[srcTape], state->mergetail[srcTape],
+		MOVETUP(state->mergecurrent[srcTape], state->mergetail[srcTape],
 				tupLen);
 
 		/* Make SortTuple at top of the merge heap point to new tuple */
@@ -3039,7 +3059,7 @@ mergebatchfreetape(Tuplesortstate *state, int srcTape, SortTuple *rtup,
 
 		tuplen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
 		rtup->tuple = MemoryContextAlloc(state->sortcontext, tuplen);
-		memcpy(rtup->tuple, oldTuple, tuplen);
+		MOVETUP(rtup->tuple, oldTuple, tuplen);
 		*should_free = true;
 	}
 
@@ -4061,6 +4081,12 @@ readtup_heap(Tuplesortstate *state, SortTuple *stup,
 								&stup->isnull1);
 }
 
+static void
+movetup_heap(void *dest, void *src, unsigned int len)
+{
+	memmove(dest, src, len);
+}
+
 /*
  * Routines specialized for the CLUSTER case (HeapTuple data, with
  * comparisons per a btree index definition)
@@ -4302,6 +4328,18 @@ readtup_cluster(Tuplesortstate *state, SortTuple *stup,
 									&stup->isnull1);
 }
 
+static void
+movetup_cluster(void *dest, void *src, unsigned int len)
+{
+	HeapTuple	tuple;
+
+	memmove(dest, src, len);
+
+	/* Repoint the HeapTupleData header */
+	tuple = (HeapTuple) dest;
+	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
+}
+
 
 /*
  * Routines specialized for IndexTuple case
@@ -4594,6 +4632,12 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
 								 &stup->isnull1);
 }
 
+static void
+movetup_index(void *dest, void *src, unsigned int len)
+{
+	memmove(dest, src, len);
+}
+
 /*
  * Routines specialized for DatumTuple case
  */
@@ -4704,6 +4748,12 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
 							 &tuplen, sizeof(tuplen));
 }
 
+static void
+movetup_datum(void *dest, void *src, unsigned int len)
+{
+	memmove(dest, src, len);
+}
+
 /*
  * Convenience routine to free a tuple previously loaded into sort memory
  */
-- 
2.7.4

#2Noah Misch
noah@leadboat.com
In reply to: Peter Geoghegan (#1)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Sun, Jun 26, 2016 at 09:14:05PM -0700, Peter Geoghegan wrote:

In general, moving tuplesort.c batch memory caller tuples around
happens when batch memory needs to be recycled, or freed outright with
pfree().

I failed to take into account that CLUSTER tuplesorts need an extra
step when moving caller tuples to a new location (i.e. when moving
HeapTuple caller tuples using memmove()), because their particular
variety of caller tuple happens to itself contain a pointer to
palloc()'d memory. Attached patch fixes this use-after-free bug.

[Action required within 72 hours. This is a generic notification.]

The above-described topic is currently a PostgreSQL 9.6 open item. Robert,
since you committed the patch believed to have created it, you own this open
item. If some other commit is more relevant or if this does not belong as a
9.6 open item, please let us know. Otherwise, please observe the policy on
open item ownership[1]/messages/by-id/20160527025039.GA447393@tornado.leadboat.com and send a status update within 72 hours of this
message. Include a date for your subsequent status update. Testers may
discover new open items at any time, and I want to plan to get them all fixed
well in advance of shipping 9.6rc1. Consequently, I will appreciate your
efforts toward speedy resolution. Thanks.

[1]: /messages/by-id/20160527025039.GA447393@tornado.leadboat.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#3Robert Haas
robertmhaas@gmail.com
In reply to: Noah Misch (#2)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 12:06 AM, Noah Misch <noah@leadboat.com> wrote:

On Sun, Jun 26, 2016 at 09:14:05PM -0700, Peter Geoghegan wrote:

In general, moving tuplesort.c batch memory caller tuples around
happens when batch memory needs to be recycled, or freed outright with
pfree().

I failed to take into account that CLUSTER tuplesorts need an extra
step when moving caller tuples to a new location (i.e. when moving
HeapTuple caller tuples using memmove()), because their particular
variety of caller tuple happens to itself contain a pointer to
palloc()'d memory. Attached patch fixes this use-after-free bug.

[Action required within 72 hours. This is a generic notification.]

The above-described topic is currently a PostgreSQL 9.6 open item. Robert,
since you committed the patch believed to have created it, you own this open
item. If some other commit is more relevant or if this does not belong as a
9.6 open item, please let us know. Otherwise, please observe the policy on
open item ownership[1] and send a status update within 72 hours of this
message. Include a date for your subsequent status update. Testers may
discover new open items at any time, and I want to plan to get them all fixed
well in advance of shipping 9.6rc1. Consequently, I will appreciate your
efforts toward speedy resolution. Thanks.

[1] /messages/by-id/20160527025039.GA447393@tornado.leadboat.com

The proposed patch contains no test case and no description of how to
reproduce the problem. I am not very keen on the idea of trying to
puzzle that out from first principles. Also, I would appreciate a
clearer explanation of why this only affects CLUSTER tuplesorts.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#4Peter Geoghegan
pg@heroku.com
In reply to: Robert Haas (#3)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 6:23 AM, Robert Haas <robertmhaas@gmail.com> wrote:

The proposed patch contains no test case and no description of how to
reproduce the problem. I am not very keen on the idea of trying to
puzzle that out from first principles.

I thought that the bug was simple enough that it didn't require a
testcase. Besides, as I've often complained about there are no tests
of external sorting in the regression test suite whatsoever. I don't
think you'd just accept it now if I tried to add some.

I could give you steps to reproduce the bug, but they involve creating
a large table using my gensort tool [1]https://github.com/petergeoghegan/gensort -- Peter Geoghegan. It isn't trivial. Are you
interested?

Also, I would appreciate a
clearer explanation of why this only affects CLUSTER tuplesorts.

As I said, it has the only type of caller tuple that happens to itself
contain a pointer to palloc()'d memory. Clearly that needs to be
updated if the tuple is moved, since it always points to an offset
into the same tuple. I thought that that explanation was simple.

[1]: https://github.com/petergeoghegan/gensort -- Peter Geoghegan
--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#5Robert Haas
robertmhaas@gmail.com
In reply to: Peter Geoghegan (#4)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 2:23 PM, Peter Geoghegan <pg@heroku.com> wrote:

On Fri, Jul 1, 2016 at 6:23 AM, Robert Haas <robertmhaas@gmail.com> wrote:

The proposed patch contains no test case and no description of how to
reproduce the problem. I am not very keen on the idea of trying to
puzzle that out from first principles.

I thought that the bug was simple enough that it didn't require a
testcase. Besides, as I've often complained about there are no tests
of external sorting in the regression test suite whatsoever. I don't
think you'd just accept it now if I tried to add some.

I could give you steps to reproduce the bug, but they involve creating
a large table using my gensort tool [1]. It isn't trivial. Are you
interested?

The bug can't very well be so simple that you need not include a set
of steps to reproduce it and, at the same time, so complex that even
so much as reading the list of steps to reproduce it might be more
than I want to do.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#6Peter Geoghegan
pg@heroku.com
In reply to: Robert Haas (#5)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 12:10 PM, Robert Haas <robertmhaas@gmail.com> wrote:

I could give you steps to reproduce the bug, but they involve creating
a large table using my gensort tool [1]. It isn't trivial. Are you
interested?

The bug can't very well be so simple that you need not include a set
of steps to reproduce it and, at the same time, so complex that even
so much as reading the list of steps to reproduce it might be more
than I want to do.

Reading and following are two different things. I don't think reading
alone will do much good with the following steps, but here they are:

Checkout my gensort tool from github. Build the C tool with "make".
Then, from the working directory:

./postgres_load.py -m 250 --skew --logged
psql -c "CREATE INDEX segfaults on sort_test_skew(sortkey);"
psql -c "CLUSTER sort_test_skew USING segfaults;"

That test case isn't at all minimal, but that's how I happened upon
the bug. I didn't tell you this before now because I assumed that
you'd just accept that there was an omission made based on a quick
reading of the code. This is not a complicated bug; the pointer
HeapTuple.t_data needs to be updated when tuples are moved around in
memory, but isn't. readtup_cluster() initializes that field like this:

/* Reconstruct the HeapTupleData header */
tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);

More or less the same process needs to occur following any movement of
the tuple. Whereas, in all other cases there is no need to do
something similar, as it happens.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#7Peter Geoghegan
pg@heroku.com
In reply to: Peter Geoghegan (#6)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 12:30 PM, Peter Geoghegan <pg@heroku.com> wrote:

Checkout my gensort tool from github. Build the C tool with "make".
Then, from the working directory:

./postgres_load.py -m 250 --skew --logged
psql -c "CREATE INDEX segfaults on sort_test_skew(sortkey);"
psql -c "CLUSTER sort_test_skew USING segfaults;"

I think that almost any CLUSTER based on an external sort would show
problems with valgrind memcheck, though. That's probably far easier.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#8Noah Misch
noah@leadboat.com
In reply to: Peter Geoghegan (#7)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 01, 2016 at 12:37:23PM -0700, Peter Geoghegan wrote:

On Fri, Jul 1, 2016 at 12:30 PM, Peter Geoghegan <pg@heroku.com> wrote:

Checkout my gensort tool from github. Build the C tool with "make".
Then, from the working directory:

./postgres_load.py -m 250 --skew --logged
psql -c "CREATE INDEX segfaults on sort_test_skew(sortkey);"
psql -c "CLUSTER sort_test_skew USING segfaults;"

I think that almost any CLUSTER based on an external sort would show
problems with valgrind memcheck, though. That's probably far easier.

I think such a test would suffice to cover this bug if Valgrind Memcheck does
detect the problem in it.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#9Peter Geoghegan
pg@heroku.com
In reply to: Noah Misch (#8)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 7:05 PM, Noah Misch <noah@leadboat.com> wrote:

I think such a test would suffice to cover this bug if Valgrind Memcheck does
detect the problem in it.

Are you asking me to produce a regression test that exercises the bug?

I would be happy to do so, but I require some clarification on project
policy first. As I already mentioned, we currently have *precisely*
zero test coverage of external sorting. Apparently, avoiding external
sort operations in the regression tests has value. I attempted to
address this with amcheck, which had extensive tests for B-Tree index
builds in its standard regression tests (including coverage of
collatable types; testing using amcheck conveniently made the tests
portable). The theory there was that as a contrib module, amcheck's
regression tests could be run less frequently by hackers, possibly
very infrequently by having a special test_decoding style target.

Unfortunately, no committer took an interest in amcheck at the time,
so that has yet to go anywhere. If things had gone differently there,
it would be pretty straightforward to add a few CLUSTER commands to
those tests. Still, it probably makes sense to have a dedicated
tuplesort regression test suite, that is (through whatever mechanism)
only run selectively on certain buildfarm animals that don't have slow
I/O subsystems. The tests must also be easily avoidable by hackers
when running tests on their development machines.

I think that a new tuplesort.sql test file is where a test like this
belongs (not in the existing cluster.sql test file). If I write a
patch along those lines, is it ever going to be accepted? I would be
happy to work on this, but we need to have a sensible conversation on
what's acceptable to everyone in this area first. I've screamed bloody
murder about the test coverage in this area before, and nothing
happened.

If you think I'm overstating things, then consider how many commits
that touched tuplesort.c added any tests. Even the commit "Fix
inadequately-tested code path in tuplesort_skiptuples()." didn't add
tests! There is scarcely any example of anyone deliberately adding
test coverage to that file. I don't understand why it is so.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#10Noah Misch
noah@leadboat.com
In reply to: Peter Geoghegan (#9)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 01, 2016 at 08:09:07PM -0700, Peter Geoghegan wrote:

On Fri, Jul 1, 2016 at 7:05 PM, Noah Misch <noah@leadboat.com> wrote:

I think such a test would suffice to cover this bug if Valgrind Memcheck does
detect the problem in it.

Are you asking me to produce a regression test that exercises the bug?

Yes, please produce a patch version bearing a regression test that exercises
the bug. I feel it counts even if the you need Valgrind Memcheck to see that
it exercises the bug, but I won't interfere if Robert disagrees. The test
should take moderate steps to minimize costs. For example, it should decrease
maintenance_work_mem, not sort >16 MiB of data.

I would be happy to do so, but I require some clarification on project
policy first. As I already mentioned, we currently have *precisely*
zero test coverage of external sorting. Apparently, avoiding external
sort operations in the regression tests has value.

PostgreSQL is open to moving features from zero test coverage to nonzero test
coverage. The last several releases have each done so. Does that
sufficiently clarify the policy landscape?

I think that a new tuplesort.sql test file is where a test like this
belongs (not in the existing cluster.sql test file). If I write a
patch along those lines, is it ever going to be accepted? I would be
happy to work on this, but we need to have a sensible conversation on
what's acceptable to everyone in this area first. I've screamed bloody
murder about the test coverage in this area before, and nothing
happened.

I'll guess that Robert would prefer a minimal test in cluster.sql over
starting a new file. If you want to be sure, wait for his input.

If you think I'm overstating things, then consider how many commits
that touched tuplesort.c added any tests. Even the commit "Fix
inadequately-tested code path in tuplesort_skiptuples()." didn't add
tests! There is scarcely any example of anyone deliberately adding
test coverage to that file. I don't understand why it is so.

I don't know, either. You read both Robert and me indicating that this bug
fix will be better with a test, and nobody has said that a test-free fix is
optimal. Here's your chance to obliterate that no-tests precedent.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#11Peter Geoghegan
pg@heroku.com
In reply to: Noah Misch (#10)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 8:37 PM, Noah Misch <noah@leadboat.com> wrote:

Yes, please produce a patch version bearing a regression test that exercises
the bug. I feel it counts even if the you need Valgrind Memcheck to see that
it exercises the bug, but I won't interfere if Robert disagrees. The test
should take moderate steps to minimize costs. For example, it should decrease
maintenance_work_mem, not sort >16 MiB of data.

I agree with both points. I think that we can do just as well with a
1MiB maintenance_work_mem setting.

PostgreSQL is open to moving features from zero test coverage to nonzero test
coverage. The last several releases have each done so. Does that
sufficiently clarify the policy landscape?

Not quite, no. There are costs associated with adding regression tests
that exercise external sorting. What are the costs, and how are they
felt? How can we add more extensive test coverage without burdening
those that run extremely slow buildfarm animals, for example?

I think that a new tuplesort.sql test file is where a test like this
belongs (not in the existing cluster.sql test file). If I write a
patch along those lines, is it ever going to be accepted? I would be
happy to work on this, but we need to have a sensible conversation on
what's acceptable to everyone in this area first. I've screamed bloody
murder about the test coverage in this area before, and nothing
happened.

I'll guess that Robert would prefer a minimal test in cluster.sql over
starting a new file. If you want to be sure, wait for his input.

There are significant advantages to a tuplesort.sql file. It gives us
an easy way to avoid running the tests where they are inappropriate.
Also, there are reasons to prefer a datum or heap tuplesort for
testing certain facets of external sorting: setting work_mem to 64KiB
can allow a test that exercises multiple polyphase merge passes and is
relative fast (fast compared to a CLUSTER-based test, with 1MiB of
maintenance_work_mem, say).

It's also true that there are special considerations for both hash
tuplesorts and datum tuplesorts. As recently as a week ago, hash
tuplesorts were fixed by Tom, having been *completely* broken for
about one week shy of an entire year. I plan to address that
separately, per discussion with Tom.

I don't know, either. You read both Robert and me indicating that this bug
fix will be better with a test, and nobody has said that a test-free fix is
optimal. Here's your chance to obliterate that no-tests precedent.

I'm happy that I can do at least that much, but I see no reason to not
go significantly further.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#12Noah Misch
noah@leadboat.com
In reply to: Peter Geoghegan (#11)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 01, 2016 at 09:12:42PM -0700, Peter Geoghegan wrote:

On Fri, Jul 1, 2016 at 8:37 PM, Noah Misch <noah@leadboat.com> wrote:

PostgreSQL is open to moving features from zero test coverage to nonzero test
coverage. The last several releases have each done so. Does that
sufficiently clarify the policy landscape?

Not quite, no. There are costs associated with adding regression tests
that exercise external sorting. What are the costs, and how are they
felt? How can we add more extensive test coverage without burdening
those that run extremely slow buildfarm animals, for example?

Project policy is silent on those matters. If your test performs one sort on
a table <3 MiB, I expect no trouble.

I'll guess that Robert would prefer a minimal test in cluster.sql over
starting a new file. If you want to be sure, wait for his input.

There are significant advantages to a tuplesort.sql file. It gives us
an easy way to avoid running the tests where they are inappropriate.
Also, there are reasons to prefer a datum or heap tuplesort for
testing certain facets of external sorting: setting work_mem to 64KiB
can allow a test that exercises multiple polyphase merge passes and is
relative fast (fast compared to a CLUSTER-based test, with 1MiB of
maintenance_work_mem, say).

It's also true that there are special considerations for both hash
tuplesorts and datum tuplesorts. As recently as a week ago, hash
tuplesorts were fixed by Tom, having been *completely* broken for
about one week shy of an entire year. I plan to address that
separately, per discussion with Tom.

I don't know, either. You read both Robert and me indicating that this bug
fix will be better with a test, and nobody has said that a test-free fix is
optimal. Here's your chance to obliterate that no-tests precedent.

I'm happy that I can do at least that much, but I see no reason to not
go significantly further.

Don't risk bundling tests for other sorting scenarios. A minimal test for the
bug in question helps to qualify your patch as an exemplary bug fix. Broader
test coverage evicts your patch from that irreproachable category, inviting
reviewers to determine that you went too far.

nm

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#13Peter Geoghegan
pg@heroku.com
In reply to: Noah Misch (#12)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 10:01 PM, Noah Misch <noah@leadboat.com> wrote:

I'm happy that I can do at least that much, but I see no reason to not
go significantly further.

Don't risk bundling tests for other sorting scenarios. A minimal test for the
bug in question helps to qualify your patch as an exemplary bug fix. Broader
test coverage evicts your patch from that irreproachable category, inviting
reviewers to determine that you went too far.

I have no intention of attempting to shoehorn tests for other sorting
scenarios into this bugfix. But, I will wait for Robert to comment on
what I've said before acting.

We should be more ambitious about adding test coverage to tuplesort.c.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#14Jim Nasby
Jim.Nasby@BlueTreble.com
In reply to: Peter Geoghegan (#13)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On 7/2/16 12:40 AM, Peter Geoghegan wrote:

We should be more ambitious about adding test coverage to tuplesort.c.

IMHO, s/ to tuplesort.c//, at least for the buildfarm.

TAP tests don't run by default, so maybe that's the place to start
"going crazy" with adding more testing. Though I do think something
that's sorely needed is the ability to test stuff at the C level.
Sometimes SQL is jut too high a level to verify things.
--
Jim Nasby, Data Architect, Blue Treble Consulting, Austin TX
Experts in Analytics, Data Architecture and PostgreSQL
Data in Trouble? Get it in Treble! http://BlueTreble.com
855-TREBLE2 (855-873-2532) mobile: 512-569-9461

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#15Tom Lane
tgl@sss.pgh.pa.us
In reply to: Peter Geoghegan (#11)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

Peter Geoghegan <pg@heroku.com> writes:

On Fri, Jul 1, 2016 at 8:37 PM, Noah Misch <noah@leadboat.com> wrote:

PostgreSQL is open to moving features from zero test coverage to nonzero test
coverage. The last several releases have each done so. Does that
sufficiently clarify the policy landscape?

Not quite, no. There are costs associated with adding regression tests
that exercise external sorting. What are the costs, and how are they
felt? How can we add more extensive test coverage without burdening
those that run extremely slow buildfarm animals, for example?

As the owner of several slow buildfarm critters, and as somebody who runs
the regression tests manually many times on a typical work day, I do have
concerns about that ;-). I agree that improving code coverage of our
tests is a good goal, but I think we need to take steps to make sure that
we don't increase the runtime of the regression tests too much.

I suggest that we should not take the existing code behavior as something
immutable if it makes it hard to create tests. Peter and I already
discussed changing the behavior of hash CREATE INDEX to make its sort code
path more easily reachable, and perhaps there are similar things that
could be done elsewhere. For instance, I do not believe that the minimum
value of maintenance_work_mem was ever graven on a stone tablet; if it
would help to reduce that, let's reduce it.

I think that a new tuplesort.sql test file is where a test like this
belongs (not in the existing cluster.sql test file).

If you are thinking of setting it up like numeric_big, I'm not terribly
excited about that, because to the best of my recollection numeric_big
has never identified a bug. It doesn't get run often enough to have
much chance of that.

It's possible that it'd be sensible to have a mechanism like that but
make it opt-out rather than opt-in; that is, the buildfarm critters
would run all tests by default, but owners of slow animals could set
some flag to skip longer tests. Don't know how much new infrastructure
that would take, or whether it's worth the trouble. But I generally
don't believe that long tests are good just for being long. If something
is slow enough that people start taking measures to avoid running it,
that's a net loss not a net win.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#16Robert Haas
robertmhaas@gmail.com
In reply to: Noah Misch (#10)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Fri, Jul 1, 2016 at 11:37 PM, Noah Misch <noah@leadboat.com> wrote:

I don't know, either. You read both Robert and me indicating that this bug
fix will be better with a test, and nobody has said that a test-free fix is
optimal. Here's your chance to obliterate that no-tests precedent.

In the interest of clarity, I was not intending to say that there
should be a regression test in the patch. I was intending to say that
there should be a test case with the bug report. I'm not opposed to
adding a regression test, and I like the idea of attempting to do so
while requiring only a relatively small amount of data by changing
maintenance_work_mem, but that wasn't the target at which I was
aiming. Nevertheless, carry on.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#17Peter Geoghegan
pg@heroku.com
In reply to: Robert Haas (#16)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Sat, Jul 2, 2016 at 3:17 PM, Robert Haas <robertmhaas@gmail.com> wrote:

In the interest of clarity, I was not intending to say that there
should be a regression test in the patch. I was intending to say that
there should be a test case with the bug report. I'm not opposed to
adding a regression test, and I like the idea of attempting to do so
while requiring only a relatively small amount of data by changing
maintenance_work_mem, but that wasn't the target at which I was
aiming. Nevertheless, carry on.

How do you feel about adding testing to tuplesort.c not limited to
hitting this bug (when Valgrind memcheck is used)?

Are you satisfied that I have adequately described steps to reproduce?

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#18Noah Misch
noah@leadboat.com
In reply to: Peter Geoghegan (#17)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

This PostgreSQL 9.6 open item is past due for a status update.

On Sat, Jul 02, 2016 at 08:47:20PM -0700, Peter Geoghegan wrote:

On Sat, Jul 2, 2016 at 3:17 PM, Robert Haas <robertmhaas@gmail.com> wrote:

In the interest of clarity, I was not intending to say that there
should be a regression test in the patch. I was intending to say that
there should be a test case with the bug report. I'm not opposed to
adding a regression test, and I like the idea of attempting to do so
while requiring only a relatively small amount of data by changing
maintenance_work_mem, but that wasn't the target at which I was
aiming. Nevertheless, carry on.

How do you feel about adding testing to tuplesort.c not limited to
hitting this bug (when Valgrind memcheck is used)?

Sounds great, but again, not in the patch fixing this bug.

Are you satisfied that I have adequately described steps to reproduce?

I can confirm that (after 62 minutes) your test procedure reached SIGSEGV
today and then completed successfully with your patch.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#19Peter Geoghegan
pg@heroku.com
In reply to: Noah Misch (#18)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Thu, Jul 7, 2016 at 12:34 AM, Noah Misch <noah@leadboat.com> wrote:

How do you feel about adding testing to tuplesort.c not limited to
hitting this bug (when Valgrind memcheck is used)?

Sounds great, but again, not in the patch fixing this bug.

I'll work on a minimal CLUSTER testcase within the next day or two,
and post a revision. Separately, I'll propose a patch that further
expands tuplesort test coverage. This will add coverage for hash
tuplesorts, as well as coverage of external sorts that require
multiple passes.

Are you satisfied that I have adequately described steps to reproduce?

I can confirm that (after 62 minutes) your test procedure reached SIGSEGV
today and then completed successfully with your patch.

Thanks for going to the trouble of confirming that the test procedure
causes a segmentation fault, and that my patch appears to fix the
issue.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#20Robert Haas
robertmhaas@gmail.com
In reply to: Noah Misch (#18)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Thu, Jul 7, 2016 at 3:34 AM, Noah Misch <noah@leadboat.com> wrote:

Are you satisfied that I have adequately described steps to reproduce?

I can confirm that (after 62 minutes) your test procedure reached SIGSEGV
today and then completed successfully with your patch.

Thanks for testing. I've committed this patch, breaking off one
unrelated bit of into a separate commit.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#21Peter Geoghegan
pg@heroku.com
In reply to: Robert Haas (#20)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Thu, Jul 7, 2016 at 10:51 AM, Robert Haas <robertmhaas@gmail.com> wrote:

Thanks for testing. I've committed this patch, breaking off one
unrelated bit of into a separate commit.

Thank you. To be clear, I still intend to follow up with a CLUSTER
external sort test case, as outlined to Noah.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#22Peter Geoghegan
pg@heroku.com
In reply to: Noah Misch (#18)
1 attachment(s)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Thu, Jul 7, 2016 at 12:34 AM, Noah Misch <noah@leadboat.com> wrote:

How do you feel about adding testing to tuplesort.c not limited to
hitting this bug (when Valgrind memcheck is used)?

Sounds great, but again, not in the patch fixing this bug.

Attached patch adds a CLUSTER external sort test case to the
regression tests, as discussed.

This makes a hardly noticeable difference on the run time of the
CLUSTER tests, at least for me. Consider the following:

$ time make installcheck-tests TESTS="create_function_1 create_type
create_table copy cluster"
*** SNIP ***
(using postmaster on Unix socket, default port)
============== dropping database "regression" ==============
DROP DATABASE
============== creating database "regression" ==============
CREATE DATABASE
ALTER DATABASE
============== running regression test queries ==============
test create_function_1 ... ok
test create_type ... ok
test create_table ... ok
test copy ... ok
test cluster ... ok

=====================
All 5 tests passed.
=====================

make[1]: Leaving directory '/home/pg/postgresql/src/test/regress'
make installcheck-tests 0.20s user 0.03s system 16% cpu 1.422 total

The added overhead is discernible from the noise for me, but not by
much. If I had to put a number on it, I'd say that these new additions
make the regression tests take approximately 120 milliseconds longer
to complete on a fast machine. And yet, the new tests add test
coverage for:

* Multiple pass external sorts. With only 1MB of maintenance_work_mem,
only the minimum number of tapes, 7, are used, so it's really easy to
see multiple merge passes with only 6 input tapes (the wide caller
tuples used by this CLUSTER case also help). The final on-the-fly
merge still uses batch memory, of course.

* This CLUSTER external sort bug -- I've verified that the new
movetup_cluster() function is hit (it's hit over 10 times, in fact),
and so the Valgrind animal ("Skink") would have caught the CLUSTER bug
had this testing been in place earlier.

* The best case for replacement selection, where only one run is
produced, and so no merge is required. (So, there are 2 CLUSTER
operations that perform external sort operations added in total.)

* Due to hitting that replacement selection best case,
TSS_SORTEDONTAPE tuple handling also gets some coverage, and so (since
we also test multiple passes) we perhaps manage to cover all code used
for the randomAccess/materialized tape as final output case, without
directly testing a randomAccess caller case separately (direct testing
would not be possible with CLUSTER, which never cares about getting
randomAccess to the final output).

Overall, significant test coverage is added, with only a small impact
on test runtime.

It seemed to not make much sense to produce separate patches to do
this much. As discussed, I intend to produce more test coverage still,
including coverage of hash index build tuplesorts. I hope Noah does
not imagine that I disregarded his remarks on doing more than the
minimum in my first pass. I would have to increase
maintenance_work_mem to not get multiple passes on of CLUSTER of any
conveniently available regression test table. With 2MB of
maintenance_work_mem, a one pass sort of 3 runs is all that is
required (and certainly not multiple passes). With 6MB of
maintenance_work_mem, the sort completes in memory. There seems to be
little reason to not do this much at once.

Testing replacement selection in the second CLUSTER is made very
convenient by the fact that we just ran CLUSTER, so input should be
presorted.

--
Peter Geoghegan

Attachments:

0001-Add-test-coverage-for-CLUSTER-external-sorts.patch.gzapplication/x-gzip; name=0001-Add-test-coverage-for-CLUSTER-external-sorts.patch.gzDownload
#23Tom Lane
tgl@sss.pgh.pa.us
In reply to: Peter Geoghegan (#22)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

Peter Geoghegan <pg@heroku.com> writes:

Attached patch adds a CLUSTER external sort test case to the
regression tests, as discussed.

I took a quick look at this patch.

This makes a hardly noticeable difference on the run time of the
CLUSTER tests, at least for me. Consider the following:

I tried the patch on prairiedog's host. That's not the slowest
machine in the buildfarm, but it's right down there. The patch
increases the runtime of the "cluster" test from ~1 sec to ~3 sec,
which I agree is pretty negligible (total time for the standard
regression tests is ~5 min on this machine). That seems a cheap
price to pay for a significant improvement in code coverage.

What I don't much like is that it enlarges cluster.out with 200K of
random-looking, hard-to-manually-verify data. May I suggest that
we replace the SELECTs with

select * from
(select hundred, lag(hundred) over () as lhundred,
thousand, lag(thousand) over () as lthousand,
tenthous, lag(tenthous) over () as ltenthous from clstr_4) ss
where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous);
hundred | lhundred | thousand | lthousand | tenthous | ltenthous
---------+----------+----------+-----------+----------+-----------
(0 rows)

If you're good with that adjustment, I'm happy to commit this.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#24Peter Geoghegan
pg@heroku.com
In reply to: Tom Lane (#23)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Wed, Jul 13, 2016 at 11:44 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

What I don't much like is that it enlarges cluster.out with 200K of
random-looking, hard-to-manually-verify data. May I suggest that
we replace the SELECTs with

select * from
(select hundred, lag(hundred) over () as lhundred,
thousand, lag(thousand) over () as lthousand,
tenthous, lag(tenthous) over () as ltenthous from clstr_4) ss
where row(hundred, thousand, tenthous) <= row(lhundred, lthousand, ltenthous);
hundred | lhundred | thousand | lthousand | tenthous | ltenthous
---------+----------+----------+-----------+----------+-----------
(0 rows)

It independently occurred to me that I should have done something like
this afterwards. I agree.

If you're good with that adjustment, I'm happy to commit this.

I am happy with the adjustment. Please commit the adjusted patch.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#25Tom Lane
tgl@sss.pgh.pa.us
In reply to: Peter Geoghegan (#24)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

Peter Geoghegan <pg@heroku.com> writes:

On Wed, Jul 13, 2016 at 11:44 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

If you're good with that adjustment, I'm happy to commit this.

I am happy with the adjustment. Please commit the adjusted patch.

Done with minor adjustments.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#26Peter Geoghegan
pg@heroku.com
In reply to: Tom Lane (#25)
Re: Bug in batch tuplesort memory CLUSTER case (9.6 only)

On Wed, Jul 13, 2016 at 12:24 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

I am happy with the adjustment. Please commit the adjusted patch.

Done with minor adjustments.

Thanks. I'm pleased that we found a way forward that addressed every concern.

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers