diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 8b520c1..8de3dbd 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -464,6 +464,7 @@ static void sort_bounded_heap(Tuplesortstate *state);
 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
 					  int tupleindex, bool checkIndex);
 static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
+static void tuplesort_heap_siftdown(Tuplesortstate *state, int start);
 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
 static void markrunend(Tuplesortstate *state, int tapenum);
 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
@@ -1205,13 +1206,16 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
 			 * still one free slot remaining --- if we fail, there'll still be
 			 * room to store the incoming tuple, and then we'll switch to
 			 * tape-based operation.
+			 * Set the tape-number to zero here as it makes the code cleaner
+			 * for converting from internal to external sort.
 			 */
 			if (state->memtupcount >= state->memtupsize - 1)
 			{
 				(void) grow_memtuples(state);
 				Assert(state->memtupcount < state->memtupsize);
 			}
-			state->memtuples[state->memtupcount++] = *tuple;
+			state->memtuples[state->memtupcount] = *tuple;
+			state->memtuples[state->memtupcount++].tupindex = 0;
 
 			/*
 			 * Check if it's time to switch over to a bounded heapsort. We do
@@ -1826,7 +1830,6 @@ static void
 inittapes(Tuplesortstate *state)
 {
 	int			maxTapes,
-				ntuples,
 				j;
 	int64		tapeSpace;
 
@@ -1884,23 +1887,15 @@ inittapes(Tuplesortstate *state)
 	state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
 
 	/*
-	 * Convert the unsorted contents of memtuples[] into a heap. Each tuple is
-	 * marked as belonging to run number zero.
-	 *
-	 * NOTE: we pass false for checkIndex since there's no point in comparing
-	 * indexes in this step, even though we do intend the indexes to be part
-	 * of the sort key...
+	 * Convert the unsorted contents of memtuples[] into a heap. Each tuple was
+	 * marked as belonging to run number zero on input; we don't compare that.
+	 * We comb the array starting at the last heap parent and working back to the
+	 * start; for each such item we sift it down to its proper heap position.
+	 * After each iteration the start slot and all heap slots below make a
+	 * well-formed heap.
 	 */
-	ntuples = state->memtupcount;
-	state->memtupcount = 0;		/* make the heap empty */
-	for (j = 0; j < ntuples; j++)
-	{
-		/* Must copy source tuple to avoid possible overwrite */
-		SortTuple	stup = state->memtuples[j];
-
-		tuplesort_heap_insert(state, &stup, 0, false);
-	}
-	Assert(state->memtupcount == ntuples);
+	for (j = (state->memtupcount-2)/2; j >= 0; j--)
+		tuplesort_heap_siftdown(state, j);
 
 	state->currentRun = 0;
 
@@ -2749,6 +2744,55 @@ tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex)
 	memtuples[i] = *tuple;
 }
 
+/*
+ * The tuple at state->memtuples[start] is being added in the heap,
+ * which is well-formed for all larger indices.  Return with the
+ * heap well-formed for this and all larger indices.
+ *
+ * Working down from the start point: follow the smallest-child path
+ * down to the leaf, then back up until the new item is larger. Insert
+ * here, promoting all children one layer as far as the start slot.
+ */
+static inline void
+tuplesort_heap_siftdown(Tuplesortstate *state, int start)
+{
+	SortTuple * m = state->memtuples;
+	int ntup = state->memtupcount;
+	int par = start, child;
+	SortTuple tup, tup2;
+
+  /* Work down the smallest-child path to the leaf */
+	while ((child = par*2 + 1) < ntup)
+		par = child+1 < ntup  &&  COMPARETUP(state, &m[child], &m[child+1]) > 0
+				? child+1 : child;
+	child = par;
+
+	/* Work up until new item is larger or equal */
+	/*XXX Do we want early or late stop on equal? 
+			Early is fewer compares but more data-moves.
+			Late is fewer data-moves but more compares.
+			Assume compares are expensive, so early is best.
+	*/
+	while (child > start && COMPARETUP(state, &m[start], &m[child]) < 0)
+		child = (child-1)/2;
+
+	/* Insert here */
+	if (child != start)
+	{
+		tup = m[child];
+		m[child] = m[start];
+
+		while ((child = (child-1)/2) > start)
+		{
+			tup2 = m[child];
+			m[child] = tup;
+			tup = tup2;
+		}
+
+		m[start] = tup;
+	}
+}
+
 
 /*
  * Tape interface routines
