From 71dcd1fc8acf8e54478526bd0a7feaf5e43bf2c1 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 9 Aug 2016 14:53:49 -0700
Subject: [PATCH 3/6] Displace heap's root during tuplesort merge

Directly displace the heap's root and shift down in a single pass.  This
is significantly faster than first shifting down to compact the heap and
then shifting up to insert the new preload tuple during tuplesort
merging, especially when the merge can naturally return at least a few
tuples from each tape (returning them as sorted output to the caller)
before some other tape next needs to return tuples.
This is fairly common in the real world; tuple attribute values are
often found to be physically clustered together on input into
tuplesort.c.
---
 src/backend/utils/sort/tuplesort.c | 110 +++++++++++++++++++++++++++++++++----
 1 file changed, 100 insertions(+), 10 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index c80e5a1..35d7cb1 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -68,9 +68,9 @@
  * (or two, when replacement selection is still used), then merge the runs
  * using Algorithm D.
  *
- * When merging runs, we use a heap containing just the frontmost tuple from
- * each source run; we repeatedly output the smallest tuple and insert the
- * next tuple from its source tape (if any).  When the heap empties, the merge
+ * When merging runs, we use a minheap containing only frontmost tuples from
+ * each source run; on output of the root tuple, it is displaced by the next
+ * tuple from its source tape (if any).  When the heap empties, the merge
  * is complete.  The basic merge algorithm thus needs very little memory ---
  * only M tuples for an M-way merge, and M is constrained to a small number.
  * However, we can still make good use of our full workMem allocation by
@@ -572,6 +572,7 @@ static void sort_bounded_heap(Tuplesortstate *state);
 static void tuplesort_sort_memtuples(Tuplesortstate *state);
 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
 					  int tupleindex, bool checkIndex);
+static void tuplesort_heap_root_displace(Tuplesortstate *state, SortTuple *newtup);
 static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
 static void reversedirection(Tuplesortstate *state);
 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
@@ -1993,7 +1994,6 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				 * more generally.
 				 */
 				*stup = state->memtuples[0];
-				tuplesort_heap_siftup(state, false);
 				if ((tupIndex = state->mergenext[srcTape]) == 0)
 				{
 					/*
@@ -2014,18 +2014,21 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 					 */
 					if ((tupIndex = state->mergenext[srcTape]) == 0)
 					{
+						/* compact the heap */
+						tuplesort_heap_siftup(state, false);
 						/* Free tape's buffer, avoiding dangling pointer */
 						if (state->batchUsed)
 							mergebatchfreetape(state, srcTape, stup, should_free);
 						return true;
 					}
 				}
-				/* pull next preread tuple from list, insert in heap */
+				/* pull next preread tuple from list, displace root in heap */
 				newtup = &state->memtuples[tupIndex];
 				state->mergenext[srcTape] = newtup->tupindex;
 				if (state->mergenext[srcTape] == 0)
 					state->mergelast[srcTape] = 0;
-				tuplesort_heap_insert(state, newtup, srcTape, false);
+				newtup->tupindex = srcTape;
+				tuplesort_heap_root_displace(state, newtup);
 				/* put the now-unused memtuples entry on the freelist */
 				newtup->tupindex = state->mergefreelist;
 				state->mergefreelist = tupIndex;
@@ -2714,8 +2717,6 @@ mergeonerun(Tuplesortstate *state, int destTape, bool finalMerge)
 		 */
 		spaceFreed = state->availMem - priorAvail;
 		state->mergeavailmem[srcTape] += spaceFreed;
-		/* compact the heap */
-		tuplesort_heap_siftup(state, false);
 		if ((tupIndex = state->mergenext[srcTape]) == 0)
 		{
 			/* out of preloaded data on this tape, try to read more */
@@ -2759,18 +2760,21 @@ mergeonerun(Tuplesortstate *state, int destTape, bool finalMerge)
 			/* if still no data, we've reached end of run on this tape */
 			if ((tupIndex = state->mergenext[srcTape]) == 0)
 			{
+				/* compact the heap */
+				tuplesort_heap_siftup(state, false);
 				/* Free tape's buffer */
 				if (state->batchUsed)
 					mergebatchfreetape(state, srcTape, NULL, NULL);
 				continue;
 			}
 		}
-		/* pull next preread tuple from list, insert in heap */
+		/* pull next preread tuple from list, displace root in heap */
 		tup = &state->memtuples[tupIndex];
 		state->mergenext[srcTape] = tup->tupindex;
 		if (state->mergenext[srcTape] == 0)
 			state->mergelast[srcTape] = 0;
-		tuplesort_heap_insert(state, tup, srcTape, false);
+		tup->tupindex = srcTape;
+		tuplesort_heap_root_displace(state, tup);
 		/* put the now-unused memtuples entry on the freelist */
 		tup->tupindex = state->mergefreelist;
 		state->mergefreelist = tupIndex;
@@ -3909,6 +3913,92 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
 
 /*
  * The tuple at state->memtuples[0] has been removed from the heap.
+ * Shift down to find new location for next preread tuple from the tape
+ * that old root originated from.  New tuple's tupindex must be
+ * initialized to its source tape number by caller.
+ *
+ * This routine is only used during merging.  It is more efficient to
+ * displace the hole at the root than to compact and then insert into
+ * the heap.  We prefer not to alter the size of the heap until it's
+ * truly necessary (until a tape is exhausted), and shift down using
+ * the caller's tuple directly (not the existing heap's last tuple),
+ * avoiding a separate insertion step entirely.
+ *
+ * When the "tape at the root of the heap" does not change, only two
+ * comparisons and one swap are required.  (That is also the maximum
+ * cost per subheap/iteration in the general case.)
+ */
+static void
+tuplesort_heap_root_displace(Tuplesortstate *state, SortTuple *newtup)
+{
+	SortTuple  *memtuples = state->memtuples;
+	int 		n,
+				imin,
+				i;
+
+	Assert(memtuples[0].tupindex == newtup->tupindex);
+
+	CHECK_FOR_INTERRUPTS();
+
+	n = state->memtupcount;			/* n is heap's size, including old root */
+	imin = 0;						/* start with caller's "hole" in root */
+	i = imin;
+	/* Shift down as required */
+	for (;;)
+	{
+		int		j = i * 2 + 1;
+
+		if (j >= n)
+			break;
+
+		/*
+		 * Test newtup as candidate for new root of the (sub)heap whose
+		 * root is i.  (The i position is always a hole left either by
+		 * caller, or by a previous iteration here.)
+		 *
+		 * Determine which node among left child, right child, and
+		 * newtup-as-root has the minimum value (and as such should be
+		 * swapped into i's position).
+		 */
+		Assert(memtuples[j].tupindex != newtup->tupindex);
+		if (COMPARETUP(state, newtup, &memtuples[j]) <= 0)
+		{
+			/*
+			 * Break when new tuple "fits" as root for (sub)heap (when i
+			 * is equal to imin, and should remain equal)
+			 */
+			if (j + 1 < n &&
+				COMPARETUP(state, newtup, &memtuples[j + 1]) > 0)
+				imin = j + 1;
+			else
+				break;
+		}
+		else
+		{
+			if (j + 1 < n &&
+				COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
+				imin = j + 1;
+			else
+				imin = j;
+		}
+
+		/*
+		 * Fill hole at root of (sub)heap by swapping.  Either left
+		 * child or right child of the (sub)heap considered in this
+		 * iteration is swapped in as new root here.  Next iteration
+		 * fills the new hole this creates.  Last iteration is one that
+		 * finds place for caller's new tuple, leaving no hole.
+		 */
+		memtuples[i] = memtuples[imin];
+		i = imin;
+	}
+
+	Assert(state->memtupcount > 1 || imin == 0);
+	memtuples[imin] = *newtup;
+}
+
+/*
+ * The tuple at state->memtuples[0] has been removed from the heap.
  * Decrement memtupcount, and sift up to maintain the heap invariant.
  */
 static void
-- 
2.7.4

