From a45b6cb684a2f9ebae5f194e2af5d6e85c8b7dc5 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Wed, 29 Jul 2015 15:38:12 -0700
Subject: [PATCH 1/5] Quicksort when performing external sorts

Add "quicksort with spillover".  This allows an external sort that is
within about 2x of work_mem to avoid writing out most tuples, and to
quicksort perhaps almost all tuples rather than performing a degenerate
heapsort.  Often, an external sort is now only marginally more expensive
than an internal sort, which is a significant improvement.  Sort
performance is made much more predictable as tables gradually increase
in size.

In addition, have tuplesort give up on replacement selection after the
first run, where it is generally not the fastest approach.  Most of the
benefits of replacement selection are seen where incremental spilling
rather than spilling in batches allows a "quicksort with spillover" to
ultimately write almost no tuples out.

When a second or subsequent run is necessary (rather than preliminarily
appearing necessary, something a "quicksort with spillover" is often
able to disregard), the second and subsequent run tuples are simply
stored in no particular order initially, and finally quicksorted and
dumped in batch when work_mem is once again exhausted.  Testing has
shown this to be much faster in many realistic cases, although there is
no saving in I/O.  Overall, cache efficiency ought to be the dominant
consideration when engineering an external sort routine targeting modern
hardware.
---
 src/backend/commands/explain.c     |  13 +-
 src/backend/utils/sort/tuplesort.c | 573 ++++++++++++++++++++++++++++++++-----
 src/include/utils/tuplesort.h      |   3 +-
 3 files changed, 519 insertions(+), 70 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 5d06fa4..94b1f77 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2178,20 +2178,27 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 		const char *sortMethod;
 		const char *spaceType;
 		long		spaceUsed;
+		int			rowsSortedMem;
 
-		tuplesort_get_stats(state, &sortMethod, &spaceType, &spaceUsed);
+		tuplesort_get_stats(state, &sortMethod, &spaceType, &spaceUsed,
+							&rowsSortedMem);
 
 		if (es->format == EXPLAIN_FORMAT_TEXT)
 		{
 			appendStringInfoSpaces(es->str, es->indent * 2);
-			appendStringInfo(es->str, "Sort Method: %s  %s: %ldkB\n",
-							 sortMethod, spaceType, spaceUsed);
+			appendStringInfo(es->str,
+							 "Sort Method: %s  %s: %ldkB  Rows In Memory: %d\n",
+							 sortMethod,
+							 spaceType,
+							 spaceUsed,
+							 rowsSortedMem);
 		}
 		else
 		{
 			ExplainPropertyText("Sort Method", sortMethod, es);
 			ExplainPropertyLong("Sort Space Used", spaceUsed, es);
 			ExplainPropertyText("Sort Space Type", spaceType, es);
+			ExplainPropertyInteger("Rows In Memory", rowsSortedMem, es);
 		}
 	}
 }
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index d532e87..55b0583 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -8,15 +8,16 @@
  * if necessary).  It works efficiently for both small and large amounts
  * of data.  Small amounts are sorted in-memory using qsort().  Large
  * amounts are sorted using temporary files and a standard external sort
- * algorithm.
+ * algorithm with numerous special enhancements.
  *
  * See Knuth, volume 3, for more than you want to know about the external
  * sorting algorithm.  We divide the input into sorted runs using replacement
  * selection, in the form of a priority tree implemented as a heap
- * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
- * merge, Knuth's Algorithm 5.4.2D.  The logical "tapes" used by Algorithm D
- * are implemented by logtape.c, which avoids space wastage by recycling
- * disk space as soon as each block is read from its "tape".
+ * (essentially his Algorithm 5.2.3H -- although that strategy can be
+ * abandoned where it does not appear to help), then merge the runs using
+ * polyphase merge, Knuth's Algorithm 5.4.2D.  The logical "tapes" used by
+ * Algorithm D are implemented by logtape.c, which avoids space wastage by
+ * recycling disk space as soon as each block is read from its "tape".
  *
  * We do not form the initial runs using Knuth's recommended replacement
  * selection data structure (Algorithm 5.4.1R), because it uses a fixed
@@ -72,6 +73,15 @@
  * to one run per logical tape.  The final merge is then performed
  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
  * saves one cycle of writing all the data out to disk and reading it in.
+ * Also, if only one run is spilled to tape so far when
+ * tuplesort_performsort() is reached, and if the caller does not require
+ * random access, then the merge step can take place between still
+ * in-memory tuples, and tuples stored on tape (it does not matter that
+ * there may be a second run in that array -- only that a second one has
+ * spilled).  This ensures that spilling to disk only occurs for a number of
+ * tuples approximately equal to the number of tuples read in after
+ * work_mem was reached and it became apparent that an external sort is
+ * required.
  *
  * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
  * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
@@ -86,6 +96,23 @@
  * we preread from a tape, so as to maintain the locality of access described
  * above.  Nonetheless, with large workMem we can have many tapes.
  *
+ * Before Postgres 9.6, we always used a heap for replacement selection when
+ * building runs.  However, Knuth does not consider the influence of memory
+ * access on overall performance, which is a crucial consideration on modern
+ * machines;  replacement selection is only really of value where a single
+ * run or two runs can be produced, sometimes avoiding a merge step
+ * entirely.  Replacement selection makes this likely when tuples are read
+ * in approximately logical order, even if work_mem is only a small fraction
+ * of the requirement for an internal sort, but large main memory sizes
+ * don't benefit from tiny, incremental spills, even with enormous datasets.
+ * If, having maintained a replacement selection priority queue (heap) for
+ * the first run it transpires that there will be multiple on-tape runs
+ * anyway, we abandon treating memtuples as a heap, and quicksort and write
+ * in memtuples-sized batches.  This gives us most of the advantages of
+ * always quicksorting and batch dumping runs, which can perform much better
+ * than heap sorting and incrementally spilling tuples, without giving up on
+ * replacement selection in cases where it remains compelling.
+ *
  *
  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -160,7 +187,10 @@ bool		optimize_bounded_sort = true;
  * described above.  Accordingly, "tuple" is always used in preference to
  * datum1 as the authoritative value for pass-by-reference cases.
  *
- * While building initial runs, tupindex holds the tuple's run number.  During
+ * While building initial runs, tupindex holds the tuple's run number.
+ * Historically, the run number could meaningfully distinguish many runs, but
+ * currently it only meaningfully distinguishes the first run from any other
+ * run, since replacement selection is abandoned after the first run.  During
  * merge passes, we re-use it to hold the input tape number that each tuple in
  * the heap was read from, or to hold the index of the next tuple pre-read
  * from the same tape in the case of pre-read entries.  tupindex goes unused
@@ -186,6 +216,7 @@ typedef enum
 	TSS_BUILDRUNS,				/* Loading tuples; writing to tape */
 	TSS_SORTEDINMEM,			/* Sort completed entirely in memory */
 	TSS_SORTEDONTAPE,			/* Sort completed, final run is on tape */
+	TSS_MEMTAPEMERGE,			/* Performing memory/tape merge on-the-fly */
 	TSS_FINALMERGE				/* Performing final merge on-the-fly */
 } TupSortStatus;
 
@@ -280,6 +311,13 @@ struct Tuplesortstate
 	bool		growmemtuples;	/* memtuples' growth still underway? */
 
 	/*
+	 * While building initial runs, this indicates if the replacement
+	 * selection strategy or simple hybrid sort-merge strategy is in use.
+	 * Replacement selection is abandoned after first run.
+	 */
+	bool		replaceActive;
+
+	/*
 	 * While building initial runs, this is the current output run number
 	 * (starting at 0).  Afterwards, it is the number of initial runs we made.
 	 */
@@ -327,12 +365,22 @@ struct Tuplesortstate
 	int			activeTapes;	/* # of active input tapes in merge pass */
 
 	/*
+	 * These variables are used after tuplesort_performsort() for the
+	 * TSS_MEMTAPEMERGE case.  This is a special, optimized final on-the-fly
+	 * merge pass involving merging the result tape with memtuples that were
+	 * quicksorted (but never made it out to a tape).
+	 */
+	SortTuple	tape_cache;		/* cached tape tuple from prior call */
+	bool		cached;			/* tape_cache holds pending tape tuple */
+	bool		just_memtuples;	/* merge only fetching from memtuples */
+
+	/*
 	 * These variables are used after completion of sorting to keep track of
 	 * the next tuple to return.  (In the tape case, the tape's current read
 	 * position is also critical state.)
 	 */
 	int			result_tape;	/* actual tape number of finished output */
-	int			current;		/* array index (only used if SORTEDINMEM) */
+	int			current;		/* memtuples array index */
 	bool		eof_reached;	/* reached EOF (needed for cursors) */
 
 	/* markpos_xxx holds marked position for mark and restore */
@@ -464,12 +512,15 @@ static void inittapes(Tuplesortstate *state);
 static void selectnewtape(Tuplesortstate *state);
 static void mergeruns(Tuplesortstate *state);
 static void mergeonerun(Tuplesortstate *state);
+static void mergememruns(Tuplesortstate *state);
 static void beginmerge(Tuplesortstate *state);
 static void mergepreread(Tuplesortstate *state);
 static void mergeprereadone(Tuplesortstate *state, int srcTape);
 static void dumptuples(Tuplesortstate *state, bool alltuples);
+static void dumpbatch(Tuplesortstate *state, bool alltuples);
 static void make_bounded_heap(Tuplesortstate *state);
 static void sort_bounded_heap(Tuplesortstate *state);
+static void tuplesort_quicksort(Tuplesortstate *state);
 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
 					  int tupleindex, bool checkIndex);
 static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
@@ -1486,22 +1537,61 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
 
 			/*
 			 * Insert the tuple into the heap, with run number currentRun if
-			 * it can go into the current run, else run number currentRun+1.
-			 * The tuple can go into the current run if it is >= the first
-			 * not-yet-output tuple.  (Actually, it could go into the current
-			 * run if it is >= the most recently output tuple ... but that
-			 * would require keeping around the tuple we last output, and it's
-			 * simplest to let writetup free each tuple as soon as it's
-			 * written.)
+			 * it can go into the current run, else run number INT_MAX (some
+			 * later run).  The tuple can go into the current run if it is
+			 * >= the first not-yet-output tuple.  (Actually, it could go
+			 * into the current run if it is >= the most recently output
+			 * tuple ... but that would require keeping around the tuple we
+			 * last output, and it's simplest to let writetup free each
+			 * tuple as soon as it's written.)
 			 *
-			 * Note there will always be at least one tuple in the heap at
-			 * this point; see dumptuples.
+			 * Note that this only applies if the currentRun is 0 (prior to
+			 * giving up on heapification).  There is no meaningful
+			 * distinction between any two runs in memory except the first
+			 * and second run.  When the currentRun is not 0, there is no
+			 * guarantee that any tuples are already stored in memory here,
+			 * and if there are any they're in no significant order.
 			 */
-			Assert(state->memtupcount > 0);
-			if (COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
+			Assert(!state->replaceActive || state->memtupcount > 0);
+			if (state->replaceActive &&
+				COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
+			{
+				/*
+				 * Unlike classic replacement selection, which this module was
+				 * previously based on, only run 0 is treated as a priority
+				 * queue through heapification.  The second run (run 1) is
+				 * appended indifferently below, and will never be trusted to
+				 * maintain the heap invariant beyond simply not getting in
+				 * the way of spilling run 0 incrementally.  In other words,
+				 * second run tuples may be sifted out of the way of first
+				 * run tuples; COMPARETUP() will never be called for run
+				 * 1 tuples.  However, not even HEAPCOMPARE() will be
+				 * called for a subsequent run's tuples.
+				 */
 				tuplesort_heap_insert(state, tuple, state->currentRun, true);
+			}
 			else
-				tuplesort_heap_insert(state, tuple, state->currentRun + 1, true);
+			{
+				/*
+				 * Note that unlike Knuth, we do not care about the second
+				 * run's tuples when loading runs.  After the first run is
+				 * complete, tuples will not be dumped incrementally at all,
+				 * but as long as the first run (run 0) is current it will
+				 * be maintained.  dumptuples does not trust that the second
+				 * or subsequent runs are heapified (beyond merely not
+				 * getting in the way of the first, fully heapified run,
+				 * which only matters for the second run, run 1).  Anything
+				 * past the first run will be quicksorted.
+				 *
+				 * Past the first run, there is no need to differentiate runs
+				 * in memory (only the first and second runs will ever be
+				 * usefully differentiated).  Use a generic INT_MAX run
+				 * number (just to be tidy).  There should always be room to
+				 * store the incoming tuple.
+				 */
+				tuple->tupindex = INT_MAX;
+				state->memtuples[state->memtupcount++] = *tuple;
+			}
 
 			/*
 			 * If we are over the memory limit, dump tuples till we're under.
@@ -1576,20 +1666,9 @@ tuplesort_performsort(Tuplesortstate *state)
 
 			/*
 			 * We were able to accumulate all the tuples within the allowed
-			 * amount of memory.  Just qsort 'em and we're done.
+			 * amount of memory.  Just quicksort 'em and we're done.
 			 */
-			if (state->memtupcount > 1)
-			{
-				/* Can we use the single-key sort function? */
-				if (state->onlyKey != NULL)
-					qsort_ssup(state->memtuples, state->memtupcount,
-							   state->onlyKey);
-				else
-					qsort_tuple(state->memtuples,
-								state->memtupcount,
-								state->comparetup,
-								state);
-			}
+			tuplesort_quicksort(state);
 			state->current = 0;
 			state->eof_reached = false;
 			state->markpos_offset = 0;
@@ -1616,12 +1695,26 @@ tuplesort_performsort(Tuplesortstate *state)
 
 			/*
 			 * Finish tape-based sort.  First, flush all tuples remaining in
-			 * memory out to tape; then merge until we have a single remaining
-			 * run (or, if !randomAccess, one run per tape). Note that
-			 * mergeruns sets the correct state->status.
+			 * memory out to tape where that's required (when more than one
+			 * run's tuples made it to tape, or when the caller required
+			 * random access).  Then, either merge until we have a single
+			 * remaining run on tape, or merge runs in memory by sorting
+			 * them into one single in-memory run.  Note that
+			 * mergeruns/mergememruns sets the correct state->status.
 			 */
-			dumptuples(state, true);
-			mergeruns(state);
+			if (state->currentRun > 0 || state->randomAccess)
+			{
+				dumptuples(state, true);
+				mergeruns(state);
+			}
+			else
+			{
+				/*
+				 * Only possible for !randomAccess callers, just as with
+				 * tape based on-the-fly merge
+				 */
+				mergememruns(state);
+			}
 			state->eof_reached = false;
 			state->markpos_block = 0L;
 			state->markpos_offset = 0;
@@ -1640,6 +1733,9 @@ tuplesort_performsort(Tuplesortstate *state)
 			elog(LOG, "performsort done (except %d-way final merge): %s",
 				 state->activeTapes,
 				 pg_rusage_show(&state->ru_start));
+		else if (state->status == TSS_MEMTAPEMERGE)
+			elog(LOG, "performsort done (except memory/tape final merge): %s",
+				 pg_rusage_show(&state->ru_start));
 		else
 			elog(LOG, "performsort done: %s",
 				 pg_rusage_show(&state->ru_start));
@@ -1791,6 +1887,118 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 			READTUP(state, stup, state->result_tape, tuplen);
 			return true;
 
+		case TSS_MEMTAPEMERGE:
+			Assert(forward);
+			/* For now, assume tuple returned from memory */
+			*should_free = false;
+
+			/*
+			 * Should be at least one memtuple (work_mem should be roughly
+			 * fully consumed)
+			 */
+			Assert(state->memtupcount > 0);
+
+			if (state->eof_reached)
+				return false;
+
+			if (state->just_memtuples)
+				goto just_memtuples;
+
+			/*
+			 * Merge together quicksorted memtuples array, and sorted tape.
+			 *
+			 * When this optimization was initially applied, the array was
+			 * heapified.  Some number of tuples were spilled to disk from the
+			 * top of the heap irregularly, and are read from tape here in
+			 * fully sorted order.  memtuples usually originally contains 2
+			 * runs, though, so we merge it with the on-tape run.
+			 * (Quicksorting effectively merged the 2 in-memory runs into one
+			 * in-memory run already)
+			 *
+			 * Exhaust the supply of tape tuples first.
+			 *
+			 * "stup" is always initially set to the current tape tuple if
+			 * any remain, which may be cached from previous call, or read
+			 * from tape when nothing cached.
+			 */
+			if (state->cached)
+				*stup = state->tape_cache;
+			else if ((tuplen = getlen(state, state->result_tape, true)) != 0)
+				READTUP(state, stup, state->result_tape, tuplen);
+			else
+			{
+				/* Supply of tape tuples was just exhausted */
+				state->just_memtuples = true;
+				goto just_memtuples;
+			}
+
+			/*
+			 * Kludge:  Trigger abbreviated tie-breaker if in-memory tuples
+			 * use abbreviation (writing tuples to tape never preserves
+			 * abbreviated keys).  Do this by assigning in-memory
+			 * abbreviated tuple to tape tuple directly.
+			 *
+			 * It doesn't seem worth generating a new abbreviated key for
+			 * the tape tuple, and this approach is simpler than
+			 * "unabbreviating" the memtuple tuple from a "common" routine
+			 * like this.
+			 *
+			 * In the future, this routine could offer an API that allows
+			 * certain clients (like ordered set aggregate callers) to
+			 * cheaply test *inequality* across adjacent pairs of sorted
+			 * tuples on the basis of simple abbreviated key binary
+			 * inequality.  Another advantage of this approach is that it
+			 * can still work without reporting to clients that abbreviation
+			 * wasn't used.  The tape tuples might only be a small minority
+			 * of all tuples returned.
+			 */
+			if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
+				stup->datum1 = state->memtuples[state->current].datum1;
+
+			/*
+			 * Compare current tape tuple to current memtuple.
+			 *
+			 * Since we always start with at least one memtuple, and since tape
+			 * tuples are always returned before equal memtuples, it follows
+			 * that there must be at least one memtuple left to return here.
+			 */
+			Assert(state->current < state->memtupcount);
+
+			if (COMPARETUP(state, stup, &state->memtuples[state->current]) <= 0)
+			{
+				/*
+				 * Tape tuple less than or equal to memtuple array current
+				 * position.  Return it.
+				 */
+				state->cached = false;
+				/* Caller can free tape tuple memory */
+				*should_free = true;
+			}
+			else
+			{
+				/*
+				 * Tape tuple greater than memtuple array's current tuple.
+				 *
+				 * Return current memtuple tuple, and cache tape tuple for
+				 * next call.  It will be returned on next or subsequent
+				 * call.
+				 */
+				state->tape_cache = *stup;
+				state->cached = true;
+				*stup = state->memtuples[state->current++];
+			}
+			return true;
+
+just_memtuples:
+			/* Just return memtuples -- merging done */
+			if (state->current < state->memtupcount)
+			{
+				*stup = state->memtuples[state->current++];
+				return true;
+			}
+			state->eof_reached = true;
+			return false;
+
 		case TSS_FINALMERGE:
 			Assert(forward);
 			*should_free = true;
@@ -2000,6 +2208,7 @@ tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
 			return false;
 
 		case TSS_SORTEDONTAPE:
+		case TSS_MEMTAPEMERGE:
 		case TSS_FINALMERGE:
 
 			/*
@@ -2129,6 +2338,15 @@ inittapes(Tuplesortstate *state)
 	state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
 
 	/*
+	 * Give replacement selection a try.  There will be a switch to a simple
+	 * hybrid sort-merge strategy after the first run (iff there is to be a
+	 * second on-tape run).
+	 */
+	state->replaceActive = true;
+	state->cached = false;
+	state->just_memtuples = false;
+
+	/*
 	 * Convert the unsorted contents of memtuples[] into a heap. Each tuple is
 	 * marked as belonging to run number zero.
 	 *
@@ -2225,6 +2443,14 @@ mergeruns(Tuplesortstate *state)
 	 * volume is between 1X and 2X workMem), we can just use that tape as the
 	 * finished output, rather than doing a useless merge.  (This obvious
 	 * optimization is not in Knuth's algorithm.)
+	 *
+	 * This should almost be dead code in practice because replacement
+	 * selection will not be allowed to continue by tuplesort_performsort().
+	 * It is perhaps still possible that certain edge-cases will leave the
+	 * final dumptuples() call as a no-op, resulting in only one run
+	 * remaining on tape, allowing this optimization to still be used.
+	 * Before the role of replacement selection was significantly diminished
+	 * to allow quicksorting of runs, this code was more useful.
 	 */
 	if (state->currentRun == 1)
 	{
@@ -2426,6 +2652,68 @@ mergeonerun(Tuplesortstate *state)
 }
 
 /*
+ * mergememruns -- merge runs in memory into a new in-memory run.
+ *
+ * This allows tuplesort to avoid dumping many tuples in the common case
+ * where work_mem is less than 2x the amount required for an internal sort
+ * ("quicksort with spillover").  This optimization does not appear in
+ * Knuth's algorithm.
+ *
+ * Merging here actually means quicksorting, without regard to the run
+ * number of each memtuple.  Note that this in-memory merge is distinct from
+ * the final on-the-fly merge step that follows.  This routine merges what
+ * was originally the tail of the first run with what was originally the
+ * entire second run in advance of the on-the-fly merge step (sometimes,
+ * there will only be one run in memory, but sorting is still required).
+ * The final on-the-fly merge occurs between the new all-in-memory run
+ * created by this routine, and what was originally the first part of the
+ * first run (and is now simply the first run), which is already sorted on
+ * tape.
+ *
+ * The fact that the memtuples array has already been heapified (within the
+ * first run) is no reason to commit to the path of unnecessarily dumping
+ * and heapsorting input tuples.  Often, memtuples will be much larger than
+ * the final on-tape run, which is where this optimization is most
+ * effective.
+ */
+static void
+mergememruns(Tuplesortstate *state)
+{
+	Assert(state->replaceActive);
+	Assert(!state->randomAccess);
+
+	/*
+	 * It doesn't seem worth being more clever in the cases where there is
+	 * no second run pending (i.e. no tuple that didn't belong to the
+	 * original/first "currentRun") just to avoid a memory/tape final
+	 * on-the-fly merge step, although that might be possible with care.
+	 */
+	markrunend(state, state->currentRun);
+
+	/*
+	 * The usual path for quicksorting runs (quicksort just before dumping
+	 * all tuples) was avoided by caller, so quicksort to merge.
+	 *
+	 * Note that this may use abbreviated keys, which are no longer
+	 * available for the tuples that spilled to tape.  This is something
+	 * that the final on-the-fly merge step accounts for.
+	 */
+	tuplesort_quicksort(state);
+	state->current = 0;
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG, "finished quicksort of %d tuples to create single in-memory run: %s",
+			 state->memtupcount, pg_rusage_show(&state->ru_start));
+#endif
+
+	state->result_tape = state->tp_tapenum[state->destTape];
+	/* Must freeze and rewind the finished output tape */
+	LogicalTapeFreeze(state->tapeset, state->result_tape);
+	state->status = TSS_MEMTAPEMERGE;
+}
+
+/*
  * beginmerge - initialize for a merge pass
  *
  * We decrease the counts of real and dummy runs for each tape, and mark
@@ -2604,21 +2892,23 @@ mergeprereadone(Tuplesortstate *state, int srcTape)
 }
 
 /*
- * dumptuples - remove tuples from heap and write to tape
+ * dumptuples - remove tuples from memtuples and write to tape
  *
  * This is used during initial-run building, but not during merging.
  *
- * When alltuples = false, dump only enough tuples to get under the
- * availMem limit (and leave at least one tuple in the heap in any case,
- * since puttuple assumes it always has a tuple to compare to).  We also
- * insist there be at least one free slot in the memtuples[] array.
+ * When alltuples = false and replacement selection is still active, dump
+ * only enough tuples to get under the availMem limit (and leave at least
+ * one tuple in memtuples, since puttuple will then assume it is a heap that
+ * has a tuple to compare to).  We also insist there be at least one free
+ * slot in the memtuples[] array.
  *
- * When alltuples = true, dump everything currently in memory.
+ * When alltuples = true, always batch dump everything currently in memory.
  * (This case is only used at end of input data.)
  *
- * If we empty the heap, close out the current run and return (this should
- * only happen at end of input data).  If we see that the tuple run number
- * at the top of the heap has changed, start a new run.
+ * If, when replacement selection is active, we see that the tuple run
+ * number at the top of the heap has changed, start a new run.  This must be
+ * the first run, because replacement selection is subsequently abandoned
+ * for all further runs.
  */
 static void
 dumptuples(Tuplesortstate *state, bool alltuples)
@@ -2627,21 +2917,39 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 		   (LACKMEM(state) && state->memtupcount > 1) ||
 		   state->memtupcount >= state->memtupsize)
 	{
+		if (state->replaceActive && !alltuples)
+		{
+			/*
+			 * Still holding out for a case favorable to replacement selection;
+			 * perhaps there will be a single-run in the event of almost sorted
+			 * input, or perhaps work_mem hasn't been exceeded by too much, and
+			 * a "quicksort with spillover" remains possible.
+			 *
+			 * Dump the heap's frontmost entry, and sift up to remove it from
+			 * the heap.
+			 */
+			Assert(state->memtupcount > 1);
+			WRITETUP(state, state->tp_tapenum[state->destTape],
+					 &state->memtuples[0]);
+			tuplesort_heap_siftup(state, true);
+		}
+		else
+		{
+			/*
+			 * Once committed to quicksorting runs, never incrementally
+			 * spill
+			 */
+			dumpbatch(state, alltuples);
+			break;
+		}
+
 		/*
-		 * Dump the heap's frontmost entry, and sift up to remove it from the
-		 * heap.
+		 * If top run number has changed, we've finished the current run
+		 * (this can only be the first run, run 0), and will no longer spill
+		 * incrementally.
 		 */
 		Assert(state->memtupcount > 0);
-		WRITETUP(state, state->tp_tapenum[state->destTape],
-				 &state->memtuples[0]);
-		tuplesort_heap_siftup(state, true);
-
-		/*
-		 * If the heap is empty *or* top run number has changed, we've
-		 * finished the current run.
-		 */
-		if (state->memtupcount == 0 ||
-			state->currentRun != state->memtuples[0].tupindex)
+		if (state->memtuples[0].tupindex != 0)
 		{
 			markrunend(state, state->tp_tapenum[state->destTape]);
 			state->currentRun++;
@@ -2650,24 +2958,94 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 
 #ifdef TRACE_SORT
 			if (trace_sort)
-				elog(LOG, "finished writing%s run %d to tape %d: %s",
-					 (state->memtupcount == 0) ? " final" : "",
+				elog(LOG, "finished writing heapsorted run %d to tape %d: %s",
 					 state->currentRun, state->destTape,
 					 pg_rusage_show(&state->ru_start));
 #endif
 
 			/*
-			 * Done if heap is empty, else prepare for new run.
+			 * Heap cannot be empty, so prepare for new run and give up on
+			 * replacement selection.
 			 */
-			if (state->memtupcount == 0)
-				break;
-			Assert(state->currentRun == state->memtuples[0].tupindex);
 			selectnewtape(state);
+			 /* All future runs will only use dumpbatch/quicksort */
+			state->replaceActive = false;
 		}
 	}
 }
 
 /*
+ * dumpbatch - sort and dump all memtuples, forming one run on tape
+ *
+ * Unlike classic replacement selection sort, second or subsequent runs are
+ * never heapified by this module (although heapification still respects run
+ * number differences between the first and second runs).  This helper
+ * handles the case where replacement selection is abandoned, and all tuples
+ * are quicksorted and dumped in memtuples-sized batches.  This alternative
+ * strategy is a simple hybrid sort-merge strategy, with quicksorting of
+ * memtuples-sized runs.
+ *
+ * In rare cases, this routine may add to an on-tape run already storing
+ * tuples.
+ */
+static void
+dumpbatch(Tuplesortstate *state, bool alltuples)
+{
+	int			memtupwrite;
+	int			i;
+
+	Assert(state->status == TSS_BUILDRUNS);
+
+	/* Final call might be unnecessary */
+	if (state->memtupcount == 0)
+	{
+		Assert(alltuples);
+		return;
+	}
+	state->currentRun++;
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG, "starting quicksort of run %d: %s",
+			 state->currentRun, pg_rusage_show(&state->ru_start));
+#endif
+
+	tuplesort_quicksort(state);
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG, "finished quicksorting run %d: %s",
+			 state->currentRun, pg_rusage_show(&state->ru_start));
+#endif
+
+	/*
+	 * This should be adapted to perform asynchronous I/O one day, as
+	 * dumping in batch represents a good opportunity to overlap I/O
+	 * and computation.
+	 */
+	memtupwrite = state->memtupcount;
+	for (i = 0; i < memtupwrite; i++)
+	{
+		WRITETUP(state, state->tp_tapenum[state->destTape],
+				 &state->memtuples[i]);
+		state->memtupcount--;
+	}
+	markrunend(state, state->tp_tapenum[state->destTape]);
+	state->tp_runs[state->destTape]++;
+	state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG, "finished writing run %d to tape %d: %s",
+			 state->currentRun, state->destTape,
+			 pg_rusage_show(&state->ru_start));
+#endif
+
+	if (!alltuples)
+		selectnewtape(state);
+}
+
+/*
  * tuplesort_rescan		- rewind and replay the scan
  */
 void
@@ -2777,7 +3155,8 @@ void
 tuplesort_get_stats(Tuplesortstate *state,
 					const char **sortMethod,
 					const char **spaceType,
-					long *spaceUsed)
+					long *spaceUsed,
+					int *rowsSortedMem)
 {
 	/*
 	 * Note: it might seem we should provide both memory and disk usage for a
@@ -2806,15 +3185,23 @@ tuplesort_get_stats(Tuplesortstate *state,
 				*sortMethod = "top-N heapsort";
 			else
 				*sortMethod = "quicksort";
+			*rowsSortedMem = state->memtupcount;
 			break;
 		case TSS_SORTEDONTAPE:
 			*sortMethod = "external sort";
+			*rowsSortedMem = 0;
+			break;
+		case TSS_MEMTAPEMERGE:
+			*sortMethod = "quicksort with spillover";
+			*rowsSortedMem = state->memtupcount;
 			break;
 		case TSS_FINALMERGE:
 			*sortMethod = "external merge";
+			*rowsSortedMem = 0;
 			break;
 		default:
 			*sortMethod = "still in progress";
+			*rowsSortedMem = -1;
 			break;
 	}
 }
@@ -2825,10 +3212,19 @@ tuplesort_get_stats(Tuplesortstate *state,
  *
  * Compare two SortTuples.  If checkIndex is true, use the tuple index
  * as the front of the sort key; otherwise, no.
+ *
+ * Note that for checkIndex callers, the heap invariant is never maintained
+ * beyond the first run, and so there are no COMPARETUP() calls beyond the
+ * first run.  It is assumed that checkIndex callers are maintaining the
+ * heap invariant for a replacement selection priority queue, but those
+ * callers do not go on to trust the heap to be fully-heapified past the
+ * first run.  Once currentRun isn't the first, memtuples is no longer a
+ * heap at all.
  */
 
 #define HEAPCOMPARE(tup1,tup2) \
-	(checkIndex && ((tup1)->tupindex != (tup2)->tupindex) ? \
+	(checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \
+					(tup1)->tupindex != 0) ? \
 	 ((tup1)->tupindex) - ((tup2)->tupindex) : \
 	 COMPARETUP(state, tup1, tup2))
 
@@ -2927,6 +3323,33 @@ sort_bounded_heap(Tuplesortstate *state)
 }
 
 /*
+ * Sort all memtuples using quicksort.
+ *
+ * Quicksort is tuplesort's internal sort algorithm.  It is also generally
+ * preferred to replacement selection of runs during external sorts, except
+ * where incrementally spilling may be particularly beneficial.  Quicksort
+ * will generally be much faster than replacement selection's heapsort
+ * because modern CPUs are usually bottlenecked on memory access, and
+ * quicksort is a cache-oblivious algorithm.
+ */
+static void
+tuplesort_quicksort(Tuplesortstate *state)
+{
+	if (state->memtupcount > 1)
+	{
+		/* Can we use the single-key sort function? */
+		if (state->onlyKey != NULL)
+			qsort_ssup(state->memtuples, state->memtupcount,
+					   state->onlyKey);
+		else
+			qsort_tuple(state->memtuples,
+						state->memtupcount,
+						state->comparetup,
+						state);
+	}
+}
+
+/*
  * Insert a new tuple into an empty or existing heap, maintaining the
  * heap invariant.  Caller is responsible for ensuring there's room.
  *
@@ -2954,6 +3377,17 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
 	memtuples = state->memtuples;
 	Assert(state->memtupcount < state->memtupsize);
 
+	/*
+	 * Once incremental heap spilling is abandoned, this routine should not be
+	 * called when loading runs.  memtuples will be an array of tuples in no
+	 * significant order, so calling here is inappropriate.  Even when
+	 * incremental spilling is still in progress, this routine does not handle
+	 * the second run's tuples (those are heapified to a limited extent that
+	 * they are appended, and thus kept away from those tuples in the first
+	 * run).
+	 */
+	Assert(!checkIndex || tupleindex == 0);
+
 	CHECK_FOR_INTERRUPTS();
 
 	/*
@@ -2985,6 +3419,13 @@ tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex)
 	int			i,
 				n;
 
+	/*
+	 * Once incremental heap spilling is abandoned, this routine should not be
+	 * called when loading runs.  memtuples will be an array of tuples in no
+	 * significant order, so calling here is inappropriate.
+	 */
+	Assert(!checkIndex || state->currentRun == 0);
+
 	if (--state->memtupcount <= 0)
 		return;
 
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index de6fc56..3679815 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -109,7 +109,8 @@ extern void tuplesort_end(Tuplesortstate *state);
 extern void tuplesort_get_stats(Tuplesortstate *state,
 					const char **sortMethod,
 					const char **spaceType,
-					long *spaceUsed);
+					long *spaceUsed,
+					int *rowsSortedMem);
 
 extern int	tuplesort_merge_order(int64 allowedMem);
 
-- 
1.9.1

