From 22e3aa14436d3c4ca8ccde2cc55777c6207fd9f0 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Thu, 16 Jul 2015 22:26:01 -0700
Subject: [PATCH] Cache-aware index sort comparator tie-breakers

For the B-Tree and hash index build sort cases only, tuplesort
previously performed a tie-breaker comparison on ItemPointer (heap
physical order).  This can provide some performance benefit for
subsequent index scans.  However, it imposed significant overhead during
index builds, because additional pointer chasing (to the IndexTuple
"tuple proper") was required in the comparators, adding memory latency
in cases where the "tuple proper" does not otherwise need to be
accessed (e.g. B-Tree sorts on a single pass-by-value attribute).

To accomplish the same outcome inexpensively, ensure stable sort output
in the traditional way for quicksort:  tie-break by comparing a simple
ordinal number (reusing the field SortTuple.tupindex for this, since it
is all but guaranteed to be in L1 cache when the comparator runs).  A
"tuple proper" tie-breaker is therefore not required.  However, since
SortTuple.tupindex is already used by external sorts, it cannot be
repurposed there, so external sorts still fall back on the expensive
ItemPointer tie-breaker.

This formalizes a requirement for index build tuplesort clients that
they've always met:  that tuples are supplied to tuplesort in the
original physical/heap order.  It seems acceptable that this is somewhat
undermined by synchronized heap scans.  Someday, we might be able to
remove all tie-breakers for external sorts, if tape sort can be made
stable (as many merge sort implementations are).

In passing, move a related assertion to a slightly different, more
appropriate place.
---
 src/backend/utils/sort/tuplesort.c | 73 +++++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 435041a..ab69320 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -160,11 +160,15 @@ bool		optimize_bounded_sort = true;
  * described above.  Accordingly, "tuple" is always used in preference to
  * datum1 as the authoritative value for pass-by-reference cases.
  *
- * While building initial runs, tupindex holds the tuple's run number.  During
- * merge passes, we re-use it to hold the input tape number that each tuple in
- * the heap was read from, or to hold the index of the next tuple pre-read
- * from the same tape in the case of pre-read entries.  tupindex goes unused
- * if the sort occurs entirely in memory.
+ * tupindex is initially used as an ordinal identifier; in other words, it
+ * tracks the original physical position of the tuple.  This is useful for
+ * index sorts that are ultimately performed entirely in memory, since equal
+ * IndexTuples can remain in physical order by just comparing tupindex.  If
+ * the sort does not fit in memory, then, while building initial runs,
+ * tupindex holds the tuple's run number.  During merge passes, we re-use it
+ * to hold the input tape number that each tuple in the heap was read from, or
+ * to hold the index of the next tuple pre-read from the same tape in the case
+ * of pre-read entries.
  */
 typedef struct
 {
@@ -1237,6 +1241,14 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
 /*
  * Collect one index tuple while collecting input data for sort, building
  * it from caller-supplied values.
+ *
+ * Note that callers are assumed to provide values from heap tuples in
+ * physical order.  This makes maintaining that order among equal tuples
+ * inexpensive.  Synchronized heap scans performed by callers might
+ * undermine the assumption.  This isn't worth guarding against, since the
+ * assumption exists only for performance reasons, and the resulting index
+ * will then merely have near-physical ordering among equal tuples, which
+ * is almost as good.
  */
 void
 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
@@ -1405,7 +1417,9 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
 				(void) grow_memtuples(state);
 				Assert(state->memtupcount < state->memtupsize);
 			}
-			state->memtuples[state->memtupcount++] = *tuple;
+			/* When we still fit in memory, tupindex is ordinal tuple # */
+			tuple->tupindex = state->memtupcount++;
+			state->memtuples[tuple->tupindex] = *tuple;
 
 			/*
 			 * Check if it's time to switch over to a bounded heapsort. We do
@@ -3553,6 +3567,13 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 	}
 
 	/*
+	 * Some rather brain-dead implementations of qsort (such as the one in QNX
+	 * 4) will sometimes call the comparison routine to compare a value to
+	 * itself, but we always use our own implementation, which does not.
+	 */
+	Assert(tuple1 != tuple2);
+
+	/*
 	 * If btree has asked us to enforce uniqueness, complain if two equal
 	 * tuples are detected (unless there was at least one NULL field).
 	 *
@@ -3567,14 +3588,6 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 		bool		isnull[INDEX_MAX_KEYS];
 		char	   *key_desc;
 
-		/*
-		 * Some rather brain-dead implementations of qsort (such as the one in
-		 * QNX 4) will sometimes call the comparison routine to compare a
-		 * value to itself, but we always use our own implementation, which
-		 * does not.
-		 */
-		Assert(tuple1 != tuple2);
-
 		index_deform_tuple(tuple1, tupDes, values, isnull);
 
 		key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
@@ -3590,10 +3603,20 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 	}
 
 	/*
-	 * If key values are equal, we sort on ItemPointer.  This does not affect
-	 * validity of the finished index, but it may be useful to have index
-	 * scans in physical order.
+	 * If key values are equal and we are doing an internal sort (status is
+	 * TSS_INITIAL), we sort on tupindex, which is a simple ordinal tuple
+	 * number.  This does not affect validity of the finished index, but it
+	 * may be useful to have index scans in physical order.
+	 *
+	 * For external sorts, we sort on ItemPointer.  This is necessary because
+	 * even tape sort is not a stable sort, and external sorts use tupindex
+	 * for another purpose.  It's worth avoiding pointer chasing where
+	 * possible, since sorting is usually bottlenecked on memory latency, and
+	 * tuple1 and tuple2 are unlikely to be cache resident.
 	 */
+	if (state->status == TSS_INITIAL)
+		return (a->tupindex < b->tupindex) ? -1 : 1;
+
 	{
 		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
 		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
@@ -3636,10 +3659,20 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 		return -1;
 
 	/*
-	 * If hash values are equal, we sort on ItemPointer.  This does not affect
-	 * validity of the finished index, but it may be useful to have index
-	 * scans in physical order.
+	 * If hash values are equal and we are doing an internal sort (status is
+	 * TSS_INITIAL), we sort on tupindex, which is a simple ordinal tuple
+	 * number.  This does not affect validity of the finished index, but it
+	 * may be useful to have index scans in physical order.
+	 *
+	 * For external sorts, we sort on ItemPointer.  This is necessary because
+	 * even tape sort is not a stable sort, and external sorts use tupindex
+	 * for another purpose.  It's worth avoiding pointer chasing where
+	 * possible, since sorting is usually bottlenecked on memory latency, and
+	 * tuple1 and tuple2 are unlikely to be cache resident.
 	 */
+	if (state->status == TSS_INITIAL)
+		return (a->tupindex < b->tupindex) ? -1 : 1;
+
 	tuple1 = (IndexTuple) a->tuple;
 	tuple2 = (IndexTuple) b->tuple;
 
-- 
1.9.1

