diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
new file mode 100644
index 245b224..8491d58
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
*************** heapgettup_pagemode(HeapScanDesc scan,
*** 833,839 ****
  #if defined(DISABLE_COMPLEX_MACRO)
  /*
   * This is formatted so oddly so that the correspondence to the macro
!  * definition in access/heapam.h is maintained.
   */
  Datum
  fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
--- 833,839 ----
  #if defined(DISABLE_COMPLEX_MACRO)
  /*
   * This is formatted so oddly so that the correspondence to the macro
!  * definition in access/htup.h is maintained.
   */
  Datum
  fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
new file mode 100644
index f86b755..4c7cb00
*** a/src/backend/access/nbtree/nbtcompare.c
--- b/src/backend/access/nbtree/nbtcompare.c
*************** btint4cmp(PG_FUNCTION_ARGS)
*** 102,108 ****
  		PG_RETURN_INT32(-1);
  }
  
! static int
  btint4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int32		a = DatumGetInt32(x);
--- 102,110 ----
  		PG_RETURN_INT32(-1);
  }
  
! #ifndef USE_INLINE
! 
! int
  btint4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int32		a = DatumGetInt32(x);
*************** btint4fastcmp(Datum x, Datum y, SortSupp
*** 116,121 ****
--- 118,125 ----
  		return -1;
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btint4sortsupport(PG_FUNCTION_ARGS)
  {
*************** btint8cmp(PG_FUNCTION_ARGS)
*** 139,145 ****
  		PG_RETURN_INT32(-1);
  }
  
! static int
  btint8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int64		a = DatumGetInt64(x);
--- 143,151 ----
  		PG_RETURN_INT32(-1);
  }
  
! #ifndef USE_INLINE
! 
! int
  btint8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int64		a = DatumGetInt64(x);
*************** btint8fastcmp(Datum x, Datum y, SortSupp
*** 153,158 ****
--- 159,166 ----
  		return -1;
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btint8sortsupport(PG_FUNCTION_ARGS)
  {
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
new file mode 100644
index 63b09a4..b1086eb
*** a/src/backend/utils/adt/float.c
--- b/src/backend/utils/adt/float.c
*************** do {															\
*** 67,76 ****
  /* Configurable GUC parameter */
  int			extra_float_digits = 0;		/* Added to DBL_DIG or FLT_DIG */
  
- 
- static int	float4_cmp_internal(float4 a, float4 b);
- static int	float8_cmp_internal(float8 a, float8 b);
- 
  #ifndef HAVE_CBRT
  /*
   * Some machines (in particular, some versions of AIX) have an extern
--- 67,72 ----
*************** float8div(PG_FUNCTION_ARGS)
*** 844,850 ****
  /*
   *		float4{eq,ne,lt,le,gt,ge}		- float4/float4 comparison operations
   */
! static int
  float4_cmp_internal(float4 a, float4 b)
  {
  	/*
--- 840,848 ----
  /*
   *		float4{eq,ne,lt,le,gt,ge}		- float4/float4 comparison operations
   */
! #ifndef USE_INLINE
! 
! int
  float4_cmp_internal(float4 a, float4 b)
  {
  	/*
*************** float4_cmp_internal(float4 a, float4 b)
*** 874,879 ****
--- 872,879 ----
  	}
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  float4eq(PG_FUNCTION_ARGS)
  {
*************** btfloat4cmp(PG_FUNCTION_ARGS)
*** 937,943 ****
  	PG_RETURN_INT32(float4_cmp_internal(arg1, arg2));
  }
  
! static int
  btfloat4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float4		arg1 = DatumGetFloat4(x);
--- 937,946 ----
  	PG_RETURN_INT32(float4_cmp_internal(arg1, arg2));
  }
  
! 
! #ifndef USE_INLINE
! 
! int
  btfloat4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float4		arg1 = DatumGetFloat4(x);
*************** btfloat4fastcmp(Datum x, Datum y, SortSu
*** 946,951 ****
--- 949,956 ----
  	return float4_cmp_internal(arg1, arg2);
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btfloat4sortsupport(PG_FUNCTION_ARGS)
  {
*************** btfloat4sortsupport(PG_FUNCTION_ARGS)
*** 958,964 ****
  /*
   *		float8{eq,ne,lt,le,gt,ge}		- float8/float8 comparison operations
   */
! static int
  float8_cmp_internal(float8 a, float8 b)
  {
  	/*
--- 963,972 ----
  /*
   *		float8{eq,ne,lt,le,gt,ge}		- float8/float8 comparison operations
   */
! 
! #ifndef USE_INLINE
! 
! int
  float8_cmp_internal(float8 a, float8 b)
  {
  	/*
*************** float8_cmp_internal(float8 a, float8 b)
*** 988,993 ****
--- 996,1003 ----
  	}
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  float8eq(PG_FUNCTION_ARGS)
  {
*************** btfloat8cmp(PG_FUNCTION_ARGS)
*** 1051,1057 ****
  	PG_RETURN_INT32(float8_cmp_internal(arg1, arg2));
  }
  
! static int
  btfloat8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float8		arg1 = DatumGetFloat8(x);
--- 1061,1069 ----
  	PG_RETURN_INT32(float8_cmp_internal(arg1, arg2));
  }
  
! #ifndef USE_INLINE
! 
! int
  btfloat8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float8		arg1 = DatumGetFloat8(x);
*************** btfloat8fastcmp(Datum x, Datum y, SortSu
*** 1060,1065 ****
--- 1072,1079 ----
  	return float8_cmp_internal(arg1, arg2);
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btfloat8sortsupport(PG_FUNCTION_ARGS)
  {
diff --git a/src/backend/utils/sort/sortsupport.c b/src/backend/utils/sort/sortsupport.c
new file mode 100644
index 4a87f11..d5bee30
*** a/src/backend/utils/sort/sortsupport.c
--- b/src/backend/utils/sort/sortsupport.c
***************
*** 16,21 ****
--- 16,22 ----
  #include "postgres.h"
  
  #include "fmgr.h"
+ #include "utils/fmgroids.h"
  #include "utils/lsyscache.h"
  #include "utils/sortsupport.h"
  
*************** PrepareSortSupportComparisonShim(Oid cmp
*** 127,132 ****
--- 128,177 ----
  }
  
  /*
+  * Given a pgproc sort support function, see if it's safe to sort using a
+  * hard-coded generic comparator for scalar types, to facilitate an optimization
+  * that may later be used within tuplesort.c.
+  *
+  * This is actually technically quite orthogonal to the user-exposed API for
+  * providing a directly callable comparator, and eliding the SQL function call
+  * machinery through that API may separately improve performance for the same
+  * sort under certain circumstances. However, it is reasonable to map built-in
+  * sort support functions to a corresponding hard coded comparator, because it
+  * is always expected that the types that this optimization will be used with
+  * will be a subset of types with a built-in sort support function - if it
+  * wasn't worth doing the former, it certainly won't be worth doing the latter.
+  * It is also reasonable because the fast comparator variants are, as a matter
+  * of policy, directly used for this optimization, so ipso facto a
+  * corresponding sort support function must be available.
+  */
+ TypeCompar
+ ResolveComparatorProper(Oid Proc)
+ {
+ 	switch (Proc)
+ 	{
+ 		/* Some comparison that has an underlying int4 representation */
+ 		case F_BTINT4SORTSUPPORT:
+ 			return TYPE_COMP_INT4;
+ 		case F_DATE_SORTSUPPORT:
+ 			return TYPE_COMP_INT4;
+ 		/* Some comparison that has an underlying int8 representation */
+ 		case F_BTINT8SORTSUPPORT:
+ 			return TYPE_COMP_INT8;
+ #ifdef HAVE_INT64_TIMESTAMP
+ 		case F_TIMESTAMP_SORTSUPPORT:
+ 			return TYPE_COMP_INT8;
+ #endif
+ 		/* floating point types */
+ 		case F_BTFLOAT4SORTSUPPORT:
+ 			return TYPE_COMP_FLOAT4;
+ 		case F_BTFLOAT8SORTSUPPORT:
+ 			return TYPE_COMP_FLOAT8;
+ 		default:
+ 			return TYPE_COMP_OTHER;
+ 	}
+ }
+ 
+ /*
   * Fill in SortSupport given an ordering operator (btree "<" or ">" operator).
   *
   * Caller must previously have zeroed the SortSupportData structure and then
*************** PrepareSortSupportFromOrderingOp(Oid ord
*** 157,160 ****
--- 202,210 ----
  		/* We'll use a shim to call the old-style btree comparator */
  		PrepareSortSupportComparisonShim(sortFunction, ssup);
  	}
+ 	/*
+ 	 * We may later avail of a further optimization for a few built-in scalar
+ 	 * types: inlining of the comparator proper.
+ 	 */
+ 	ssup->usable_compar = ResolveComparatorProper(sortFunction);
  }
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
new file mode 100644
index 28d585f..2676c9f
*** a/src/backend/utils/sort/tuplesort.c
--- b/src/backend/utils/sort/tuplesort.c
***************
*** 106,111 ****
--- 106,112 ----
  #include "executor/executor.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
+ #include "utils/builtins.h"
  #include "utils/datum.h"
  #include "utils/logtape.h"
  #include "utils/lsyscache.h"
***************
*** 113,118 ****
--- 114,120 ----
  #include "utils/pg_rusage.h"
  #include "utils/rel.h"
  #include "utils/sortsupport.h"
+ #include "utils/template_qsort_arg.h"
  #include "utils/tuplesort.h"
  
  
*************** static void tuplesort_heap_insert(Tuples
*** 456,461 ****
--- 458,464 ----
  static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
  static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
  static void markrunend(Tuplesortstate *state, int tapenum);
+ static inline int comparetup_inline(void *a, void *b, Tuplesortstate *state);
  static int comparetup_heap(const SortTuple *a, const SortTuple *b,
  				Tuplesortstate *state);
  static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
*************** static void readtup_datum(Tuplesortstate
*** 492,497 ****
--- 495,537 ----
  static void reversedirection_datum(Tuplesortstate *state);
  static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
  
+ /*
+  * Manufacture type specific sorting specialisations with inline comparators.
+  *
+  * Use fast comparator functions for "comparator proper" in each case. They're
+  * used via function pointers to elide SQL-function-call overhead when sorting,
+  * but that doesn't provide the additional benefits of inlining and other
+  * optimizations from compile-time comparator knowledge.
+  */
+ #define int4_shim(x,y)			btint4fastcmp(x, y, NULL)
+ #define int8_shim(x,y)			btint8fastcmp(x, y, NULL)
+ #define float4_shim(x,y)		btfloat4fastcmp(x, y, NULL)
+ #define float8_shim(x,y)		btfloat8fastcmp(x, y, NULL)
+ 
+ #define noop_cmp_proper(nop1, nop2) 0
+ #define noop_code (void)0;
+ 
+ /* Instantiate sorting specializations (heap) */
+ TEMPLATE_QSORT_ARG_HEAP(int4, int4_shim);
+ TEMPLATE_QSORT_ARG_HEAP(int8, int8_shim);
+ TEMPLATE_QSORT_ARG_HEAP(float4, float4_shim);
+ TEMPLATE_QSORT_ARG_HEAP(float8, float8_shim);
+ 
+ /* Instantiate sorting specializations (btree index) */
+ TEMPLATE_QSORT_ARG_BTREE(int4, int4_shim);
+ TEMPLATE_QSORT_ARG_BTREE(int8, int8_shim);
+ TEMPLATE_QSORT_ARG_BTREE(float4, float4_shim);
+ TEMPLATE_QSORT_ARG_BTREE(float8, float8_shim);
+ 
+ /*
+  * A semi-generic specialization, for single sort-key sorts not covered by
+  * other specializations. Note that this specialization is used for sorting
+  * both heap and index tuples that meet that criteria.
+  *
+  * This results in dead code, but since the relevant functions are static, they
+  * will be discarded by compiler optimization.
+  */
+ DO_TEMPLATE_QSORT_ARG(all, noop_cmp_proper, sing, noop_code, comparetup_inline);
  
  /*
   *		tuplesort_begin_xxx
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 631,636 ****
--- 671,704 ----
  		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
  	}
  
+ 	if (state->sortKeys)
+ 	{
+ 		switch(state->sortKeys->usable_compar)
+ 		{
+ 			case TYPE_COMP_INT4:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?int4_inlheap_qsort_arg:int4_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_INT8:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?int8_inlheap_qsort_arg:int8_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_FLOAT4:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?float4_inlheap_qsort_arg:float4_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_FLOAT8:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?float8_inlheap_qsort_arg:float8_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_OTHER:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?all_sing_qsort_arg:NULL;
+ 				break;
+ 			default:
+ 				elog(ERROR, "unrecognized comparator type for heap tuplesort");
+ 		}
+ 	}
  	MemoryContextSwitchTo(oldcontext);
  
  	return state;
*************** tuplesort_begin_index_btree(Relation ind
*** 733,738 ****
--- 801,876 ----
  	state->indexScanKey = _bt_mkscankey_nodata(indexRel);
  	state->enforceUnique = enforceUnique;
  
+ 	if (state->sortKeys)
+ 	{
+ 		if ( state->nKeys == 1 && !state->enforceUnique)
+ 		{
+ 			switch(state->sortKeys->usable_compar)
+ 			{
+ 				/*
+ 				 * Note that "heap" variants are used for single-key sorts, because
+ 				 * they'll work equally well for btree sorts under this exact
+ 				 * set of circumstances.
+ 				 */
+ 				case TYPE_COMP_INT4:
+ 					state->sortKeys->qsort_arg_spec = int4_inlheap_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_INT8:
+ 					state->sortKeys->qsort_arg_spec = int8_inlheap_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_FLOAT4:
+ 					state->sortKeys->qsort_arg_spec = float4_inlheap_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_FLOAT8:
+ 					state->sortKeys->qsort_arg_spec = float8_inlheap_qsort_arg;
+ 					break;
+ 				default:
+ 					state->sortKeys->qsort_arg_spec = all_sing_qsort_arg;
+ 					break;
+ 			}
+ 		}
+ 		else if ( state->nKeys == 1 && state->enforceUnique)
+ 		{
+ 			switch(state->sortKeys->usable_compar)
+ 			{
+ 				case TYPE_COMP_INT4:
+ 					state->sortKeys->qsort_arg_spec = int4_regbtreeuniq_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_INT8:
+ 					state->sortKeys->qsort_arg_spec = int8_regbtreeuniq_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_FLOAT4:
+ 					state->sortKeys->qsort_arg_spec = float4_regbtreeuniq_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_FLOAT8:
+ 					state->sortKeys->qsort_arg_spec = float8_regbtreeuniq_qsort_arg;
+ 					break;
+ 				default:
+ 					state->sortKeys->qsort_arg_spec = NULL;
+ 			}
+ 		}
+ 		else
+ 		{
+ 			switch(state->sortKeys->usable_compar)
+ 			{
+ 				case TYPE_COMP_INT4:
+ 					state->sortKeys->qsort_arg_spec = int4_regbtree_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_INT8:
+ 					state->sortKeys->qsort_arg_spec = int8_regbtree_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_FLOAT4:
+ 					state->sortKeys->qsort_arg_spec = float4_regbtree_qsort_arg;
+ 					break;
+ 				case TYPE_COMP_FLOAT8:
+ 					state->sortKeys->qsort_arg_spec = float8_regbtree_qsort_arg;
+ 					break;
+ 				default:
+ 					state->sortKeys->qsort_arg_spec = NULL;
+ 			}
+ 		}
+ 	}
+ 
  	MemoryContextSwitchTo(oldcontext);
  
  	return state;
*************** tuplesort_performsort(Tuplesortstate *st
*** 1221,1232 ****
  			 * We were able to accumulate all the tuples within the allowed
  			 * amount of memory.  Just qsort 'em and we're done.
  			 */
  			if (state->memtupcount > 1)
! 				qsort_arg((void *) state->memtuples,
! 						  state->memtupcount,
! 						  sizeof(SortTuple),
! 						  (qsort_arg_comparator) state->comparetup,
! 						  (void *) state);
  			state->current = 0;
  			state->eof_reached = false;
  			state->markpos_offset = 0;
--- 1359,1391 ----
  			 * We were able to accumulate all the tuples within the allowed
  			 * amount of memory.  Just qsort 'em and we're done.
  			 */
+ 
  			if (state->memtupcount > 1)
! 			{
! 				/* Use a sorting specialization if available */
! 				if (state->sortKeys &&
! 						state->sortKeys->qsort_arg_spec)
! 				{
! 					state->sortKeys->qsort_arg_spec((void *) state->memtuples,
! 								  state->memtupcount,
! 								  sizeof(SortTuple),
! 								  (void *) state);
! 				}
! 				else
! 				{
! 					/*
! 					 * Fall back on regular qsort_arg, with function pointer
! 					 * comparator, making no static assumptions about the number
! 					 * of sortkeys, datatype, or type of tuples that are
! 					 * sorted.
! 					 */
! 					qsort_arg((void *) state->memtuples,
! 								  state->memtupcount,
! 								  sizeof(SortTuple),
! 								  (qsort_arg_comparator) state->comparetup,
! 								  (void *) state);
! 				}
! 			}
  			state->current = 0;
  			state->eof_reached = false;
  			state->markpos_offset = 0;
*************** inlineApplySortFunction(FmgrInfo *sortFu
*** 2645,2653 ****
--- 2804,2840 ----
  	return compare;
  }
  
+ /*
+  * This is a cut-down duplicate of comparetup_heap/comparetup_index_btree that
+  * exists for the express purpose of generating a sorting specialization in
+  * which it is inlined, as a performance optimization. It can be used equally
+  * well for both classes of tuple, as it does not make any assumptions about
+  * the representation of the tuple proper.
+  *
+  * This is only possible for sorts with a single sortKey. Note that the
+  * comparator proper is not inlined, so this is only a partial specialization.
+  */
+ static inline int
+ comparetup_inline(void *a, void *b, Tuplesortstate *state)
+ {
+ 	/* void* parameters are used to shut up the compiler */
+ 	const SortTuple *aT = (const SortTuple*) a, *bT = (const SortTuple*) b;
+ 	/* Allow interrupting long sorts */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	Assert(state->nKeys == 1);
+ 
+ 	/* Compare the one and only sort key with minimal indirection */
+ 	return ApplySortComparator(aT->datum1, aT->isnull1,
+ 								  bT->datum1, bT->isnull1,
+ 								  state->sortKeys);
+ }
  
  /*
   * Routines specialized for HeapTuple (actually MinimalTuple) case
+  *
+  * This code is partially duplicated within template_qsort_arg.h. Ensure that
+  * they are kept consistent.
   */
  
  static int
*************** readtup_cluster(Tuplesortstate *state, S
*** 2974,2979 ****
--- 3161,3169 ----
   * The btree and hash cases require separate comparison functions, but the
   * IndexTuple representation is the same so the copy/write/read support
   * functions can be shared.
+  *
+  * This code is partially duplicated within template_qsort_arg.h. Ensure that
+  * they are kept consistent.
   */
  
  static int
*************** comparetup_index_btree(const SortTuple *
*** 2981,2990 ****
  					   Tuplesortstate *state)
  {
  	/*
! 	 * This is similar to _bt_tuplecompare(), but we have already done the
! 	 * index_getattr calls for the first column, and we need to keep track of
! 	 * whether any null fields are present.  Also see the special treatment
! 	 * for equal keys at the end.
  	 */
  	ScanKey		scanKey = state->indexScanKey;
  	IndexTuple	tuple1;
--- 3171,3179 ----
  					   Tuplesortstate *state)
  {
  	/*
! 	 * We have already done the index_getattr calls for the first column, and we
! 	 * need to keep track of whether any null fields are present.  Also see the
! 	 * special treatment for equal keys at the end.
  	 */
  	ScanKey		scanKey = state->indexScanKey;
  	IndexTuple	tuple1;
*************** comparetup_index_btree(const SortTuple *
*** 3046,3063 ****
  	 * they *must* get compared at some stage of the sort --- otherwise the
  	 * sort algorithm wouldn't have checked whether one must appear before the
  	 * other.
- 	 *
- 	 * Some rather brain-dead implementations of qsort will sometimes call the
- 	 * comparison routine to compare a value to itself.  (At this writing only
- 	 * QNX 4 is known to do such silly things; we don't support QNX anymore,
- 	 * but perhaps the behavior still exists elsewhere.)  Don't raise a bogus
- 	 * error in that case.
  	 */
! 	if (state->enforceUnique && !equal_hasnull && tuple1 != tuple2)
  	{
  		Datum		values[INDEX_MAX_KEYS];
  		bool		isnull[INDEX_MAX_KEYS];
  
  		index_deform_tuple(tuple1, tupDes, values, isnull);
  		ereport(ERROR,
  				(errcode(ERRCODE_UNIQUE_VIOLATION),
--- 3235,3255 ----
  	 * they *must* get compared at some stage of the sort --- otherwise the
  	 * sort algorithm wouldn't have checked whether one must appear before the
  	 * other.
  	 */
! 	if (state->enforceUnique && !equal_hasnull)
  	{
  		Datum		values[INDEX_MAX_KEYS];
  		bool		isnull[INDEX_MAX_KEYS];
  
+ 		/*
+ 		 * In the past, it was conceivable that we'd have to protect against
+ 		 * a comparison of a tuple to itself, because we used the system
+ 		 * qsort(), and as such had to assume the worst about the implementation's
+ 		 * bogosity. This is no longer the case, though this assertion serves to
+ 		 * prevent that problem from re-emerging.
+ 		 */
+ 		Assert(tuple1 != tuple2);
+ 
  		index_deform_tuple(tuple1, tupDes, values, isnull);
  		ereport(ERROR,
  				(errcode(ERRCODE_UNIQUE_VIOLATION),
*************** comparetup_index_btree(const SortTuple *
*** 3067,3094 ****
  						   BuildIndexValueDescription(state->indexRel,
  													  values, isnull))));
  	}
- 
- 	/*
- 	 * If key values are equal, we sort on ItemPointer.  This does not affect
- 	 * validity of the finished index, but it offers cheap insurance against
- 	 * performance problems with bad qsort implementations that have trouble
- 	 * with large numbers of equal keys.
- 	 */
- 	{
- 		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
- 		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
- 
- 		if (blk1 != blk2)
- 			return (blk1 < blk2) ? -1 : 1;
- 	}
- 	{
- 		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
- 		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
- 
- 		if (pos1 != pos2)
- 			return (pos1 < pos2) ? -1 : 1;
- 	}
- 
  	return 0;
  }
  
--- 3259,3264 ----
*************** comparetup_index_hash(const SortTuple *a
*** 3098,3105 ****
  {
  	uint32		hash1;
  	uint32		hash2;
- 	IndexTuple	tuple1;
- 	IndexTuple	tuple2;
  
  	/* Allow interrupting long sorts */
  	CHECK_FOR_INTERRUPTS();
--- 3268,3273 ----
*************** comparetup_index_hash(const SortTuple *a
*** 3118,3147 ****
  	else if (hash1 < hash2)
  		return -1;
  
- 	/*
- 	 * If hash values are equal, we sort on ItemPointer.  This does not affect
- 	 * validity of the finished index, but it offers cheap insurance against
- 	 * performance problems with bad qsort implementations that have trouble
- 	 * with large numbers of equal keys.
- 	 */
- 	tuple1 = (IndexTuple) a->tuple;
- 	tuple2 = (IndexTuple) b->tuple;
- 
- 	{
- 		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
- 		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
- 
- 		if (blk1 != blk2)
- 			return (blk1 < blk2) ? -1 : 1;
- 	}
- 	{
- 		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
- 		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
- 
- 		if (pos1 != pos2)
- 			return (pos1 < pos2) ? -1 : 1;
- 	}
- 
  	return 0;
  }
  
--- 3286,3291 ----
diff --git a/src/include/c.h b/src/include/c.h
new file mode 100644
index 5bed719..0e61ed2
*** a/src/include/c.h
--- b/src/include/c.h
*************** extern int	fdatasync(int fildes);
*** 850,853 ****
--- 850,865 ----
  /* /port compatibility functions */
  #include "port.h"
  
+ /*
+  * Define a cross-platform "always-inline" macro. This is a very sharp tool that
+  * should be used judiciously.
+  */
+ #ifdef __always_inline
+ #define pg_always_inline __always_inline
+ #elif defined(__force_inline)
+ #define pg_always_inline __force_inline
+ #else
+ #define pg_always_inline inline
+ #endif
+ 
  #endif   /* C_H */
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
new file mode 100644
index aa36db6..d0ddbeb
*** a/src/include/utils/builtins.h
--- b/src/include/utils/builtins.h
*************** extern Datum btfloat8sortsupport(PG_FUNC
*** 323,328 ****
--- 323,463 ----
  extern Datum btoidsortsupport(PG_FUNCTION_ARGS);
  extern Datum btnamesortsupport(PG_FUNCTION_ARGS);
  
+ /*
+  *		Expose some "fast" variants of per-opclass comparison functions that
+  *		will incidentally be returned by some of the above sort-support
+  *		functions. Do so to	avail of optimizations from compile-time knowledge
+  *		of comparators.
+  *
+  *		Exposed variants don't touch SortSupportData struct, so that parameter
+  *		can	remain opaque.
+  */
+ 
+ /* Declaration is sufficient */
+ #ifndef SORTSUPPORT_H
+ #define DECLARE_SORT_SUPPORT
+ typedef struct SortSupportData *SortSupport;
+ #endif
+ 
+ #ifdef USE_INLINE
+ 
+ static inline int
+ btint4fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	int32		a = DatumGetInt32(x);
+ 	int32		b = DatumGetInt32(y);
+ 
+ 	if (a > b)
+ 		return 1;
+ 	else if (a == b)
+ 		return 0;
+ 	else
+ 		return -1;
+ }
+ 
+ static inline int
+ btint8fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	int64		a = DatumGetInt64(x);
+ 	int64		b = DatumGetInt64(y);
+ 
+ 	if (a > b)
+ 		return 1;
+ 	else if (a == b)
+ 		return 0;
+ 	else
+ 		return -1;
+ }
+ 
+ static inline int
+ float4_cmp_internal(float4 a, float4 b)
+ {
+ 	/*
+ 	 * We consider all NANs to be equal and larger than any non-NAN. This is
+ 	 * somewhat arbitrary; the important thing is to have a consistent sort
+ 	 * order.
+ 	 */
+ 	if (isnan(a))
+ 	{
+ 		if (isnan(b))
+ 			return 0;			/* NAN = NAN */
+ 		else
+ 			return 1;			/* NAN > non-NAN */
+ 	}
+ 	else if (isnan(b))
+ 	{
+ 		return -1;				/* non-NAN < NAN */
+ 	}
+ 	else
+ 	{
+ 		if (a > b)
+ 			return 1;
+ 		else if (a < b)
+ 			return -1;
+ 		else
+ 			return 0;
+ 	}
+ }
+ 
+ static inline int
+ float8_cmp_internal(float8 a, float8 b)
+ {
+ 	/*
+ 	 * We consider all NANs to be equal and larger than any non-NAN. This is
+ 	 * somewhat arbitrary; the important thing is to have a consistent sort
+ 	 * order.
+ 	 */
+ 	if (isnan(a))
+ 	{
+ 		if (isnan(b))
+ 			return 0;			/* NAN = NAN */
+ 		else
+ 			return 1;			/* NAN > non-NAN */
+ 	}
+ 	else if (isnan(b))
+ 	{
+ 		return -1;				/* non-NAN < NAN */
+ 	}
+ 	else
+ 	{
+ 		if (a > b)
+ 			return 1;
+ 		else if (a < b)
+ 			return -1;
+ 		else
+ 			return 0;
+ 	}
+ }
+ 
+ static inline int
+ btfloat4fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	float4		arg1 = DatumGetFloat4(x);
+ 	float4		arg2 = DatumGetFloat4(y);
+ 
+ 	return float4_cmp_internal(arg1, arg2);
+ }
+ 
+ static inline int
+ btfloat8fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	float8		arg1 = DatumGetFloat8(x);
+ 	float8		arg2 = DatumGetFloat8(y);
+ 
+ 	return float8_cmp_internal(arg1, arg2);
+ }
+ 
+ #else
+ 
+ extern int btint4fastcmp(Datum x, Datum y, SortSupport ssup);
+ extern int btint8fastcmp(Datum x, Datum y, SortSupport ssup);
+ extern int float4_cmp_internal(float4 a, float4 b);
+ extern int float8_cmp_internal(float8 a, float8 b);
+ extern int btfloat4fastcmp(Datum x, Datum y, SortSupport ssup);
+ extern int btfloat8fastcmp(Datum x, Datum y, SortSupport ssup);
+ 
+ #endif   /* USE_INLINE */
+ 
  /* float.c */
  extern PGDLLIMPORT int extra_float_digits;
  
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h
new file mode 100644
index bd6dde3..88a4315
*** a/src/include/utils/sortsupport.h
--- b/src/include/utils/sortsupport.h
***************
*** 31,36 ****
--- 31,42 ----
   * data can be stored using the ssup_extra field.  Any such data
   * should be allocated in the ssup_cxt memory context.
   *
+  * In addition to providing lower-overhead comparators, this infrastructure also
+  * resolves a generic comparator for the type, if one is available from a
+  * static set, each of which corresponds to a generic Datum representation.
+  * These are available only for a subset of built-in sort support functions.
+  * This enables more aggressive per-type optimizations at sort time.
+  *
   * Note: since pg_amproc functions are indexed by (lefttype, righttype)
   * it is possible to associate a BTSORTSUPPORT function with a cross-type
   * comparison.  This could sensibly be used to provide a fast comparator
***************
*** 49,55 ****
--- 55,90 ----
  
  #include "access/attnum.h"
  
+ /*
+  * Which comparator representation can be used for this type? It is acceptable
+  * to use an int4 comparator with the date datatype for example, because both
+  * types have the same underlying representation. In particular, their
+  * comparators are interchangeable.
+  *
+  * It is incorrect to do this with types that share a certain bitwise
+  * representation with some scalar type, represented by an enum constant here,
+  * when they do not have interchangeable comparators. For example, sortkeys of
+  * legacy float8 representation of timestamps will not be set to
+  * TYPE_COMP_FLOAT8, because its comparator has special handling of NaN values.
+  *
+  * To an even greater extent than with the optimization whereby a btree index
+  * opclass provides a "fast" comparator to elide SQL function call overhead,
+  * it's not useful to do this with types that have inherently high-overhead
+  * comparisons, of greater than a few instructions; the cost of the comparison
+  * itself is expected to dominate, marginalizing any benefit.
+  */
+ typedef enum TypeCompar
+ {
+ 	TYPE_COMP_INT4,
+ 	TYPE_COMP_INT8,
+ 	TYPE_COMP_FLOAT4,
+ 	TYPE_COMP_FLOAT8,
+ 	TYPE_COMP_OTHER
+ } TypeCompar;
+ 
+ #ifndef DECLARE_SORT_SUPPORT
  typedef struct SortSupportData *SortSupport;
+ #endif
  
  typedef struct SortSupportData
  {
*************** typedef struct SortSupportData
*** 96,103 ****
  	int			(*comparator) (Datum x, Datum y, SortSupport ssup);
  
  	/*
! 	 * Additional sort-acceleration functions might be added here later.
  	 */
  } SortSupportData;
  
  
--- 131,161 ----
  	int			(*comparator) (Datum x, Datum y, SortSupport ssup);
  
  	/*
! 	 * This specialization function pointer is sometimes used as an alternative
! 	 * to the standard qsort_arg, when it has been determined that we can
! 	 * benefit from various per-type and per number of sort key performance
! 	 * optimizations.
! 	 *
! 	 * Often, this will simply entail using a variant that inlines the comparator
! 	 * that was previously used (through a pointer) by the highly generic
! 	 * qsort_arg, such as comparetup_heap, which encapsulate the details of
! 	 * performing a comparison on the class of tuple in question, and are where
! 	 * the call is made to MySortSupport->comparator. This isn't always done
! 	 * because it isn't useful to attempt to get the compiler to inline in cases
! 	 * where there is more than a single sort key.
! 	 *
! 	 * In some other cases, particularly with scalar datatypes that are assumed
! 	 * to be sorted far more frequently in practice, the specialization goes so
! 	 * far as to inline the comparator proper from within the tuple-class
! 	 * encapsulating comparator.
  	 */
+ 	void (*qsort_arg_spec)(void *a, size_t n, size_t es, void *arg);
+ 
+ 	/*
+ 	 * Which type-specific comparator proper (which is part of a fully inlined
+ 	 * specialization) can be safely used, if any?
+ 	 */
+ 	TypeCompar usable_compar;
  } SortSupportData;
  
  
*************** extern int	ApplySortComparator(Datum dat
*** 151,156 ****
--- 209,215 ----
  
  /* Other functions in utils/sort/sortsupport.c */
  extern void PrepareSortSupportComparisonShim(Oid cmpFunc, SortSupport ssup);
+ extern TypeCompar ResolveComparatorProper(Oid orderingOp);
  extern void PrepareSortSupportFromOrderingOp(Oid orderingOp, SortSupport ssup);
  
  #endif   /* SORTSUPPORT_H */
diff --git a/src/include/utils/template_qsort_arg.h b/src/include/utils/template_qsort_arg.h
new file mode 100644
index ...de1deb8
*** a/src/include/utils/template_qsort_arg.h
--- b/src/include/utils/template_qsort_arg.h
***************
*** 0 ****
--- 1,411 ----
+ /*-------------------------------------------------------------------------
+  *	template_qsort_arg.h: "template" version of qsort_arg.c
+  *
+  *	This version of qsort_arg is exclusively used within tuplesort.c to	more
+  *	efficiently sort common types such as integers and floats. In providing
+  *	this version, we seek to take advantage of compile-time	optimizations,
+  *	in particular, the inlining of comparators. In some cases the "tuple class"
+  *	encapsulating comparator (that is, the particular comparator used directly
+  *	here, which depends on whether we're sorting heap tuples or btree index
+  *	tuples) is just inlined, when a full specialization is unavailable. In
+  *	other cases, the comparator proper is also inlined, for full integration of
+  *	the sort routine. There are variant specializations for cases with only a
+  *	single sortkey, and cases with multiple sortkeys.
+  *
+  *	The TEMPLATE_QSORT_ARG() macro generates an inlining variant (for sorts
+  *	with a single sortKey) and non-inlining variant for sorts with multiple
+  *	sortKeys.
+  *
+  *	We rely on various function declarations, and indeed partially duplicate
+  *	some code from tuplesort.c, so this file should be considered private to
+  *	that module, rather than a generic piece of infrastructure.
+  *
+  *	CAUTION: if you change this file, see also qsort_arg.c as well as
+  *	qsort.c. qsort_arg.c should be considered authoritative.
+  *
+  *	Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+  *	Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *	src/include/utils/template_qsort_arg.h
+  *-------------------------------------------------------------------------
+  */
+ #include "c.h"
+ 
+ #define swapcode(TYPE, parmi, parmj, n)		\
+ do {										\
+ 	size_t i = (n) / sizeof (TYPE);			\
+ 	TYPE *pi = (TYPE *)(void *)(parmi);		\
+ 	TYPE *pj = (TYPE *)(void *)(parmj);		\
+ 	do {									\
+ 		TYPE	t = *pi;					\
+ 		*pi++ = *pj;						\
+ 		*pj++ = t;							\
+ 		} while (--i > 0);					\
+ } while (0)
+ 
+ #define SWAPINIT(a, es) swaptype = ((char *)(a) - (char *)0) % sizeof(long) || \
+ 	(es) % sizeof(long) ? 2 : (es) == sizeof(long)? 0 : 1;
+ 
+ #define thistype_vecswap(a, b, n)								\
+ 	if ((n) > 0) inl_swapfunc((a), (b), (size_t)(n), swaptype)
+ 
+ #define thistype_swap(a, b)								\
+ if (swaptype == 0) {									\
+ 		long t = *(long *)(void *)(a);					\
+ 		*(long *)(void *)(a) = *(long *)(void *)(b);	\
+ 		*(long *)(void *)(b) = t;						\
+ 	} else												\
+ 		inl_swapfunc(a, b, es, swaptype)
+ 
+ inline static void
+ inl_swapfunc(char *a, char *b, size_t n, int swaptype)
+ {
+ 	if (swaptype <= 1)
+ 		swapcode(long, a, b, n);
+ 	else
+ 		swapcode(char, a, b, n);
+ }
+ 
+ /*
+  * This macro manufactures a type-specific implementation of qsort_arg with
+  * the comparator, COMPAR, known at compile time. COMPAR is typically an
+  * inline function.
+  *
+  * COMPAR should take as its arguments two Datums, and return an int, in
+  * line with standard qsort convention.
+  *
+  * We have void* parameters for TYPE##comparetup_inline just to shut up the compiler.
+  * They could be SortTuple pointers instead, but that would make it more
+  * difficult to keep template_qsort_arg.h consistent with tuplesort.c.
+  */
+ 
+ #define DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, SPEC_VAR, REG_ADD_CODE, OUT_COMPAR)		\
+ void TYPE##_##SPEC_VAR##_qsort_arg(void *a, size_t n, size_t es, void *arg);		\
+ 																					\
+ inline static int32																	\
+ TYPE##SPEC_VAR##AppFunc(Datum datum1, bool isnull1, Datum datum2, bool isnull2,		\
+ 							SortSupport	sortKey)        							\
+ {																					\
+ 	int32		compare;															\
+ 	if (isnull1)																	\
+ 	{																				\
+ 		if (isnull2)																\
+ 			compare = 0;		/* NULL "=" NULL */									\
+ 		else if (sortKey->ssup_nulls_first)											\
+ 			compare = -1;		/* NULL "<" NOT_NULL */								\
+ 		else																		\
+ 			compare = 1;		/* NULL ">" NOT_NULL */								\
+ 	}																				\
+ 	else if (isnull2)																\
+ 	{																				\
+ 		if (sortKey->ssup_nulls_first)												\
+ 			compare = 1;		/* NOT_NULL ">" NULL */								\
+ 		else																		\
+ 			compare = -1;		/* NOT_NULL "<" NULL */								\
+ 	}																				\
+ 	else																			\
+ 	{																				\
+ 		compare = COMPAR(datum1, datum2);											\
+ 																					\
+ 		if (sortKey->ssup_reverse)													\
+ 			compare = -compare;														\
+ 	}																				\
+ 	return compare;																	\
+ }																					\
+ 																					\
+ /*																					\
+  * Note that this is heavily based on comparetup_inline; the two should be kept		\
+  * consistent.																		\
+  */																					\
+ pg_always_inline static int															\
+ TYPE##SPEC_VAR##comparetup_inline(const void *a, const void *b,						\
+ 		Tuplesortstate *state)														\
+ {																					\
+ 	const SortTuple* aT = a;														\
+ 	const SortTuple* bT = b;														\
+ 	int32		compare;															\
+ 	SortSupport	sortKey = state->sortKeys;											\
+ 																					\
+ 	/* Allow interrupting long sorts */												\
+ 	CHECK_FOR_INTERRUPTS();															\
+ 	compare = TYPE##SPEC_VAR##AppFunc(aT->datum1, aT->isnull1, bT->datum1,			\
+ 										bT->isnull1, sortKey);						\
+ 	if (compare != 0) 																\
+ 		return compare;																\
+ 	/* Additional code for variants with more than one sortkey */					\
+ 	REG_ADD_CODE 																	\
+ 	return 0;																		\
+ }																					\
+ 																					\
+ 																\
+ inline static char *											\
+ TYPE##SPEC_VAR##med3(char *a, char *b, char *c, void *arg)		\
+ {																\
+ 	return OUT_COMPAR(a, b, arg) < 0 ?							\
+ 		(OUT_COMPAR(b, c, arg) < 0 ? 							\
+ 					b : (OUT_COMPAR(a, c, arg) < 0 ? c : a))	\
+ 		: (OUT_COMPAR(b, c, arg) > 0 ? 							\
+ 					b : (OUT_COMPAR(a, c, arg) < 0 ? a : c));	\
+ }																\
+ 																\
+ void																		\
+ TYPE##_##SPEC_VAR##_qsort_arg(void *a, size_t n, size_t es, void *arg)		\
+ {																			\
+ 	char	   *pa,															\
+ 			   *pb,															\
+ 			   *pc,															\
+ 			   *pd,															\
+ 			   *pl,															\
+ 			   *pm,															\
+ 			   *pn;															\
+ 	int			d,															\
+ 				r,															\
+ 				swaptype,													\
+ 				presorted;													\
+ 																			\
+ loop:SWAPINIT(a, es); 														\
+ 	if (n < 7)																\
+ 	{																		\
+ 		for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)		\
+ 			for (pl = pm; pl > (char *) a && 								\
+ 					OUT_COMPAR(pl - es, pl, arg) > 0;						\
+ 				 pl -= es)													\
+ 				thistype_swap(pl, pl - es);									\
+ 		return;																\
+ 	}																		\
+ 	presorted = 1;															\
+ 	for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)			\
+ 	{																		\
+ 		if (OUT_COMPAR(pm - es, pm, arg) > 0)								\
+ 		{																	\
+ 			presorted = 0;													\
+ 			break;															\
+ 		}																	\
+ 	}																		\
+ 	if (presorted)															\
+ 		return;																\
+ 	pm = (char *) a + (n / 2) * es;											\
+ 	if (n > 7)																\
+ 	{																		\
+ 		pl = (char *) a;													\
+ 		pn = (char *) a + (n - 1) * es;										\
+ 		if (n > 40)															\
+ 		{																	\
+ 			d = (n / 8) * es;												\
+ 			pl = TYPE##SPEC_VAR##med3(pl, pl + d, pl + 2 * d, arg);			\
+ 			pm = TYPE##SPEC_VAR##med3(pm - d, pm, pm + d, arg);				\
+ 			pn = TYPE##SPEC_VAR##med3(pn - 2 * d, pn - d, pn, arg);			\
+ 		}																	\
+ 		pm = TYPE##SPEC_VAR##med3(pl, pm, pn, arg);							\
+ 	}																		\
+ 	thistype_swap(a, pm);													\
+ 	pa = pb = (char *) a + es;												\
+ 	pc = pd = (char *) a + (n - 1) * es;									\
+ 	for (;;)																\
+ 	{																		\
+ 		while (pb <= pc && 													\
+ 					(r = OUT_COMPAR(pb, a, arg)) <= 0)						\
+ 		{																	\
+ 			if (r == 0)														\
+ 			{																\
+ 				thistype_swap(pa, pb);										\
+ 				pa += es;													\
+ 			}																\
+ 			pb += es;														\
+ 		}																	\
+ 		while (pb <= pc && 													\
+ 				(r = OUT_COMPAR(pc, a, arg)) >= 0)							\
+ 		{																	\
+ 			if (r == 0)														\
+ 			{																\
+ 				thistype_swap(pc, pd);										\
+ 				pd -= es;													\
+ 			}																\
+ 			pc -= es;														\
+ 		}																	\
+ 		if (pb > pc)														\
+ 			break;															\
+ 		thistype_swap(pb, pc);												\
+ 		pb += es;															\
+ 		pc -= es;															\
+ 	}																		\
+ 	pn = (char *) a + n * es;												\
+ 	r = Min(pa - (char *) a, pb - pa);										\
+ 	thistype_vecswap(a, pb - r, r);											\
+ 	r = Min(pd - pc, pn - pd - es);											\
+ 	thistype_vecswap(pb, pn - r, r);										\
+ 	if ((r = pb - pa) > es)													\
+ 		TYPE##_##SPEC_VAR##_qsort_arg(a, r / es, es, arg);					\
+ 	if ((r = pd - pc) > es)													\
+ 	{																		\
+ 		/* Iterate rather than recurse to save stack space */				\
+ 		a = pn - r;															\
+ 		n = r / es;															\
+ 		goto loop;															\
+ 	}																		\
+ }
+ 
+ /*
+  * This code becomes part of the comparator meta-function for the "reg"
+  * specialization variant of each datatype-specific specialization.
+  *
+  * Note that this is heavily based on tuplesort_comparetup_heap; the two should
+  * be kept consistent.
+  *
+  * For "reg", we can handle multiple sortKeys, but the function generally
+  * will not be inlined directly when sorting. We'll try and use compile time
+  * knowledge of the comparator for later sortKeys, on the assumption that that
+  * will be a win frequently enough to justify the overhead.
+  *
+  * Modern compilers are not inclined to pay too much attention to the inline
+  * keyword, and indeed inline at the callsite granularity, rather than the
+  * function granularity. It should not be assumed that the second call to the
+  * "inline" comparator here will result in a second copy of the comparator.
+  */
+ 
+ #define REG_ADDITIONAL_HEAP_CODE(COMPAR)									\
+ {																			\
+ 	HeapTupleData ltup;														\
+ 	HeapTupleData rtup;														\
+ 	TupleDesc	tupDesc;													\
+ 	int			nkey;														\
+ 																			\
+ 	/* Compare additional sort keys */										\
+ 	ltup.t_len =															\
+ 		((MinimalTuple) aT->tuple)->t_len + MINIMAL_TUPLE_OFFSET;			\
+ 	ltup.t_data =															\
+ 		(HeapTupleHeader) ((char *) aT->tuple - MINIMAL_TUPLE_OFFSET);		\
+ 	rtup.t_len =															\
+ 		((MinimalTuple) bT->tuple)->t_len + MINIMAL_TUPLE_OFFSET;			\
+ 	rtup.t_data = 															\
+ 		(HeapTupleHeader) ((char *) bT->tuple - MINIMAL_TUPLE_OFFSET);		\
+ 	tupDesc = state->tupDesc;												\
+ 	sortKey++;																\
+ 	for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)					\
+ 	{																		\
+ 		AttrNumber	attno = sortKey->ssup_attno;							\
+ 		Datum		datum1,													\
+ 					datum2;													\
+ 		bool		isnull1,												\
+ 					isnull2;												\
+ 																			\
+ 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);				\
+ 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);				\
+ 																			\
+ 		if (sortKey->usable_compar == state->sortKeys->usable_compar)		\
+ 			compare = COMPAR(datum1, isnull1,								\
+ 								datum2, isnull2,							\
+ 								sortKey);									\
+ 		else																\
+ 			compare = ApplySortComparator(datum1, isnull1,					\
+ 										  datum2, isnull2,					\
+ 										  sortKey);							\
+ 		if (compare != 0)													\
+ 			return compare;													\
+ 	}																		\
+ }
+ 
+ /*
+  * REG_ADDITIONAL_BTREE_CODE() is analogous to REG_ADDITIONAL_HEAP_CODE(), and
+  * is present in btree multiple sortkey specializations.
+  *
+  * Note that this is heavily based on tuplesort_comparetup_btree; the two should
+  * be kept consistent.
+  */
+ #define REG_ADDITIONAL_BTREE_CODE(COMPAR, DEDUP_BTREE)						\
+ {																			\
+ 	/* Compare additional sort keys */										\
+ 	IndexTuple tuple1 = (IndexTuple) aT->tuple;								\
+ 	IndexTuple tuple2 = (IndexTuple) bT->tuple;								\
+ 	int keysz = state->nKeys;												\
+ 	int			nkey;														\
+ 	TupleDesc	tupDes = RelationGetDescr(state->indexRel);					\
+ 	bool		equal_hasnull = false;										\
+ 	sortKey++;																\
+ 	for (nkey = 2; nkey <= keysz; nkey++, sortKey++)						\
+ 	{																		\
+ 		Datum		datum1,													\
+ 					datum2;													\
+ 		bool		isnull1,												\
+ 					isnull2;												\
+ 																			\
+ 		datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);				\
+ 		datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);				\
+ 																			\
+ 		if (sortKey->usable_compar == state->sortKeys->usable_compar)		\
+ 			compare = COMPAR(datum1, isnull1,								\
+ 								datum2, isnull2,							\
+ 								sortKey);									\
+ 		else																\
+ 			compare = ApplySortComparator(datum1, isnull1,					\
+ 										  datum2, isnull2,					\
+ 										  sortKey);							\
+ 		if (compare != 0)													\
+ 			return compare;		/* done when we find unequal attributes */	\
+ 																			\
+ 		/* they are equal, so we only need to examine one null flag */		\
+ 		if (isnull1)														\
+ 			equal_hasnull = true;											\
+ 	}																		\
+ 	/*																		\
+ 	 * We may or may not have to enforce uniqueness in this					\
+ 	 * specialization														\
+ 	 */																		\
+ 	DEDUP_BTREE																\
+ }
+ 
+ #define INL_ADDITIONAL_CODE Assert(state->nKeys == 1);
+ 
+ #define DEDUP_BTREE															\
+ {																			\
+ 	/*																		\
+ 	 * btree has asked us to enforce uniqueness (otherwise this				\
+ 	 * specialization wouldn't be used), complain if two equal tuples		\
+ 	 * are detected (unless there was at least one NULL field).				\
+ 	 */																		\
+ 	if (!equal_hasnull)														\
+ 	{																		\
+ 		Datum		values[INDEX_MAX_KEYS];									\
+ 		bool		isnull[INDEX_MAX_KEYS];									\
+ 																			\
+ 		Assert(tuple1 != tuple2);											\
+ 																			\
+ 		index_deform_tuple(tuple1, tupDes, values, isnull);					\
+ 		ereport(ERROR,														\
+ 				(errcode(ERRCODE_UNIQUE_VIOLATION),							\
+ 				 errmsg("could not create unique index \"%s\"",				\
+ 						RelationGetRelationName(state->indexRel)),			\
+ 				 errdetail("Key %s is duplicated.",							\
+ 						   BuildIndexValueDescription(state->indexRel,		\
+ 													  values, isnull))));	\
+ 	}																		\
+ }
+ /* Suppress compiler warning */
+ #define NOOP_BTREE (void)equal_hasnull;
+ 
+ /*
+  * Manufacture inlining variant for nKeys=1 case, and non-inlining variant
+  * for nKeys > 1 case
+  */
+ #define TEMPLATE_QSORT_ARG_HEAP(TYPE, COMPAR)								\
+ DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, inlheap,								\
+ 		INL_ADDITIONAL_CODE, TYPE##inlheapcomparetup_inline)				\
+ DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, regheap,								\
+ 		REG_ADDITIONAL_HEAP_CODE(TYPE##regheapAppFunc),						\
+ 			TYPE##regheapcomparetup_inline)
+ 
+ /*
+  * Btree sorting can use heap single key specializations equally well, so
+  * there's no point in generating single-key btree variants
+  *
+  * However, generate specializations that enforce uniqueness and others that
+  * do not
+  */
+ #define TEMPLATE_QSORT_ARG_BTREE(TYPE, COMPAR)								\
+ DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, regbtree,								\
+ 		REG_ADDITIONAL_BTREE_CODE(TYPE##regbtreeAppFunc, DEDUP_BTREE),		\
+ 			TYPE##regbtreecomparetup_inline)								\
+ DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, regbtreeuniq,							\
+ 		REG_ADDITIONAL_BTREE_CODE(TYPE##regbtreeAppFunc, NOOP_BTREE ),		\
+ 			TYPE##regbtreecomparetup_inline)								\
+ 
diff --git a/src/port/qsort.c b/src/port/qsort.c
new file mode 100644
index 8e2c6d9..d1981d6
*** a/src/port/qsort.c
--- b/src/port/qsort.c
***************
*** 7,13 ****
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort_arg.c
   *
   *	src/port/qsort.c
   */
--- 7,14 ----
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort_arg.c and
!  *	template_qsort_arg.h
   *
   *	src/port/qsort.c
   */
diff --git a/src/port/qsort_arg.c b/src/port/qsort_arg.c
new file mode 100644
index 28d1894..0ab6198
*** a/src/port/qsort_arg.c
--- b/src/port/qsort_arg.c
***************
*** 7,13 ****
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort.c
   *
   *	src/port/qsort_arg.c
   */
--- 7,13 ----
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort.c and template_qsort_arg.h
   *
   *	src/port/qsort_arg.c
   */
