diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
new file mode 100644
index fedde93..128376d
*** a/src/backend/access/nbtree/nbtcompare.c
--- b/src/backend/access/nbtree/nbtcompare.c
***************
*** 49,55 ****
  #include "postgres.h"
  
  #include "utils/builtins.h"
- #include "utils/sortsupport.h"
  
  
  Datum
--- 49,54 ----
*************** btint4cmp(PG_FUNCTION_ARGS)
*** 102,108 ****
  		PG_RETURN_INT32(-1);
  }
  
! static int
  btint4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int32		a = DatumGetInt32(x);
--- 101,109 ----
  		PG_RETURN_INT32(-1);
  }
  
! #ifndef USE_INLINE
! 
! int
  btint4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int32		a = DatumGetInt32(x);
*************** btint4fastcmp(Datum x, Datum y, SortSupp
*** 116,121 ****
--- 117,124 ----
  		return -1;
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btint4sortsupport(PG_FUNCTION_ARGS)
  {
*************** btint8cmp(PG_FUNCTION_ARGS)
*** 139,145 ****
  		PG_RETURN_INT32(-1);
  }
  
! static int
  btint8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int64		a = DatumGetInt64(x);
--- 142,150 ----
  		PG_RETURN_INT32(-1);
  }
  
! #ifndef USE_INLINE
! 
! int
  btint8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	int64		a = DatumGetInt64(x);
*************** btint8fastcmp(Datum x, Datum y, SortSupp
*** 153,158 ****
--- 158,165 ----
  		return -1;
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btint8sortsupport(PG_FUNCTION_ARGS)
  {
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
new file mode 100644
index 08434dc..2265f3e
*** a/src/backend/utils/adt/float.c
--- b/src/backend/utils/adt/float.c
***************
*** 23,29 ****
  #include "libpq/pqformat.h"
  #include "utils/array.h"
  #include "utils/builtins.h"
- #include "utils/sortsupport.h"
  
  
  #ifndef M_PI
--- 23,28 ----
*************** do {															\
*** 67,76 ****
  /* Configurable GUC parameter */
  int			extra_float_digits = 0;		/* Added to DBL_DIG or FLT_DIG */
  
- 
- static int	float4_cmp_internal(float4 a, float4 b);
- static int	float8_cmp_internal(float8 a, float8 b);
- 
  #ifndef HAVE_CBRT
  /*
   * Some machines (in particular, some versions of AIX) have an extern
--- 66,71 ----
*************** float8div(PG_FUNCTION_ARGS)
*** 844,850 ****
  /*
   *		float4{eq,ne,lt,le,gt,ge}		- float4/float4 comparison operations
   */
! static int
  float4_cmp_internal(float4 a, float4 b)
  {
  	/*
--- 839,847 ----
  /*
   *		float4{eq,ne,lt,le,gt,ge}		- float4/float4 comparison operations
   */
! #ifndef USE_INLINE
! 
! int
  float4_cmp_internal(float4 a, float4 b)
  {
  	/*
*************** float4_cmp_internal(float4 a, float4 b)
*** 874,879 ****
--- 871,878 ----
  	}
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  float4eq(PG_FUNCTION_ARGS)
  {
*************** btfloat4cmp(PG_FUNCTION_ARGS)
*** 937,943 ****
  	PG_RETURN_INT32(float4_cmp_internal(arg1, arg2));
  }
  
! static int
  btfloat4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float4		arg1 = DatumGetFloat4(x);
--- 936,945 ----
  	PG_RETURN_INT32(float4_cmp_internal(arg1, arg2));
  }
  
! 
! #ifndef USE_INLINE
! 
! int
  btfloat4fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float4		arg1 = DatumGetFloat4(x);
*************** btfloat4fastcmp(Datum x, Datum y, SortSu
*** 946,951 ****
--- 948,955 ----
  	return float4_cmp_internal(arg1, arg2);
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btfloat4sortsupport(PG_FUNCTION_ARGS)
  {
*************** btfloat4sortsupport(PG_FUNCTION_ARGS)
*** 958,964 ****
  /*
   *		float8{eq,ne,lt,le,gt,ge}		- float8/float8 comparison operations
   */
! static int
  float8_cmp_internal(float8 a, float8 b)
  {
  	/*
--- 962,971 ----
  /*
   *		float8{eq,ne,lt,le,gt,ge}		- float8/float8 comparison operations
   */
! 
! #ifndef USE_INLINE
! 
! int
  float8_cmp_internal(float8 a, float8 b)
  {
  	/*
*************** float8_cmp_internal(float8 a, float8 b)
*** 988,993 ****
--- 995,1002 ----
  	}
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  float8eq(PG_FUNCTION_ARGS)
  {
*************** btfloat8cmp(PG_FUNCTION_ARGS)
*** 1051,1057 ****
  	PG_RETURN_INT32(float8_cmp_internal(arg1, arg2));
  }
  
! static int
  btfloat8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float8		arg1 = DatumGetFloat8(x);
--- 1060,1068 ----
  	PG_RETURN_INT32(float8_cmp_internal(arg1, arg2));
  }
  
! #ifndef USE_INLINE
! 
! int
  btfloat8fastcmp(Datum x, Datum y, SortSupport ssup)
  {
  	float8		arg1 = DatumGetFloat8(x);
*************** btfloat8fastcmp(Datum x, Datum y, SortSu
*** 1060,1065 ****
--- 1071,1078 ----
  	return float8_cmp_internal(arg1, arg2);
  }
  
+ #endif   /* ! USE_INLINE */
+ 
  Datum
  btfloat8sortsupport(PG_FUNCTION_ARGS)
  {
diff --git a/src/backend/utils/sort/sortsupport.c b/src/backend/utils/sort/sortsupport.c
new file mode 100644
index 7f388fd..39d41a2
*** a/src/backend/utils/sort/sortsupport.c
--- b/src/backend/utils/sort/sortsupport.c
***************
*** 16,21 ****
--- 16,22 ----
  #include "postgres.h"
  
  #include "fmgr.h"
+ #include "utils/fmgroids.h"
  #include "utils/lsyscache.h"
  #include "utils/sortsupport.h"
  
*************** PrepareSortSupportComparisonShim(Oid cmp
*** 127,132 ****
--- 128,181 ----
  }
  
  /*
+  * Given a pg_proc sort support function, see if it's safe to sort using a
+  * custom specialization with a hard-coded generic comparator for scalar types,
+  * to facilitate an optimization that may later be used when sorting.
+  *
+  * This is actually technically quite orthogonal to the user-exposed API for
+  * providing a directly callable comparator to elide the SQL function call
+  * machinery, though that API may separately improve performance for the same
+  * sort under certain circumstances, as when a sort must be performed with
+  * multiple keys of different types, only the first of which has a comparator
+  * representation that we consider here. However, it is reasonable to map
+  * built-in sort support functions to a corresponding hard coded comparator
+  * representation, because it is always expected that the types that this
+  * optimization will be used with will be a subset of types with a built-in sort
+  * support function - if it wasn't worth doing the former due to the fact that
+  * the innate cost of each comparison marginalized any benefit, it certainly
+  * won't be worth doing the latter.  It is also reasonable because the fast
+  * comparator variants are, as a matter of policy, directly used for this
+  * optimization, so ipso facto a corresponding sort support function must be
+  * available.
+  */
+ TypeCompar
+ ResolveComparatorProper(Oid Proc)
+ {
+ 	switch (Proc)
+ 	{
+ 		/* Some comparison that has an underlying int4 representation */
+ 		case F_BTINT4SORTSUPPORT:
+ 			return TYPE_COMP_INT4;
+ 		case F_DATE_SORTSUPPORT:
+ 			return TYPE_COMP_INT4;
+ 		/* Some comparison that has an underlying int8 representation */
+ 		case F_BTINT8SORTSUPPORT:
+ 			return TYPE_COMP_INT8;
+ #ifdef HAVE_INT64_TIMESTAMP
+ 		case F_TIMESTAMP_SORTSUPPORT:
+ 			return TYPE_COMP_INT8;
+ #endif
+ 		/* floating point types */
+ 		case F_BTFLOAT4SORTSUPPORT:
+ 			return TYPE_COMP_FLOAT4;
+ 		case F_BTFLOAT8SORTSUPPORT:
+ 			return TYPE_COMP_FLOAT8;
+ 		default:
+ 			return TYPE_COMP_OTHER;
+ 	}
+ }
+ 
+ /*
   * Fill in SortSupport given an ordering operator (btree "<" or ">" operator).
   *
   * Caller must previously have zeroed the SortSupportData structure and then
*************** PrepareSortSupportFromOrderingOp(Oid ord
*** 157,160 ****
--- 206,214 ----
  		/* We'll use a shim to call the old-style btree comparator */
  		PrepareSortSupportComparisonShim(sortFunction, ssup);
  	}
+ 	/*
+ 	 * We may later avail of a further optimization for a few built-in scalar
+ 	 * types: inlining of the comparator proper.
+ 	 */
+ 	ssup->usable_compar = ResolveComparatorProper(sortFunction);
  }
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
new file mode 100644
index 4c2fe69..ea0164c
*** a/src/backend/utils/sort/tuplesort.c
--- b/src/backend/utils/sort/tuplesort.c
***************
*** 106,118 ****
--- 106,121 ----
  #include "executor/executor.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
+ #include "utils/builtins.h"
  #include "utils/datum.h"
+ #include "utils/fmgroids.h"
  #include "utils/logtape.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
  #include "utils/pg_rusage.h"
  #include "utils/rel.h"
  #include "utils/sortsupport.h"
+ #include "utils/template_qsort_arg.h"
  #include "utils/tuplesort.h"
  
  
*************** static void tuplesort_heap_insert(Tuples
*** 456,461 ****
--- 459,465 ----
  static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
  static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
  static void markrunend(Tuplesortstate *state, int tapenum);
+ static inline int comparetup_inline(void *a, void *b, Tuplesortstate *state);
  static int comparetup_heap(const SortTuple *a, const SortTuple *b,
  				Tuplesortstate *state);
  static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
*************** static void readtup_datum(Tuplesortstate
*** 492,497 ****
--- 496,537 ----
  static void reversedirection_datum(Tuplesortstate *state);
  static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
  
+ /*
+  * Manufacture type specific sorting specialisations with inline comparators.
+  *
+  * Use fast comparator functions for "comparator proper" in each case. They're
+  * generally used via function pointers to elide SQL-function-call indirection
+  * when resolving the comparator of a btree index opclass for sorting
+  * (historically, tuplesort went through pg_proc in all cases) but that doesn't
+  * provide the additional benefits of inlining and other optimizations from
+  * compile-time comparator knowledge, through the generation of complete
+  * per-type sorting specializations.
+  */
+ #define int4_shim(x,y)			btint4fastcmp(x, y, NULL)
+ #define int8_shim(x,y)			btint8fastcmp(x, y, NULL)
+ #define float4_shim(x,y)		btfloat4fastcmp(x, y, NULL)
+ #define float8_shim(x,y)		btfloat8fastcmp(x, y, NULL)
+ 
+ #define noop_cmp_proper(nop1, nop2) 0
+ #define noop_code (void)0;
+ 
+ /* Instantiate sorting specializations (for heap tuples only) */
+ TEMPLATE_QSORT_ARG_HEAP(int4, int4_shim);
+ TEMPLATE_QSORT_ARG_HEAP(int8, int8_shim);
+ TEMPLATE_QSORT_ARG_HEAP(float4, float4_shim);
+ TEMPLATE_QSORT_ARG_HEAP(float8, float8_shim);
+ 
+ /*
+  * A type-neutral specialization, for single sort-key sorts not covered by other
+  * specializations. This specialization could be used for sorting both heap and
+  * index tuples that meet that criteria. However, it is currently only used for
+  * heap tuples.
+  *
+  * This could result in dead code, but since the redundant functions generated
+  * are all static, they will almost invariably be discarded by compiler
+  * optimizations performed by all modern compilers.
+  */
+ DO_TEMPLATE_QSORT_ARG(all, noop_cmp_proper, sing, noop_code, comparetup_inline);
  
  /*
   *		tuplesort_begin_xxx
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 631,636 ****
--- 671,704 ----
  		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
  	}
  
+ 	if (state->sortKeys)
+ 	{
+ 		switch(state->sortKeys->usable_compar)
+ 		{
+ 			case TYPE_COMP_INT4:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?int4_inlheap_qsort_arg:int4_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_INT8:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?int8_inlheap_qsort_arg:int8_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_FLOAT4:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?float4_inlheap_qsort_arg:float4_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_FLOAT8:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?float8_inlheap_qsort_arg:float8_regheap_qsort_arg;
+ 				break;
+ 			case TYPE_COMP_OTHER:
+ 				state->sortKeys->qsort_arg_spec =
+ 					nkeys==1?all_sing_qsort_arg:NULL;
+ 				break;
+ 			default:
+ 				elog(ERROR, "unrecognized comparator type for heap tuplesort");
+ 		}
+ 	}
  	MemoryContextSwitchTo(oldcontext);
  
  	return state;
*************** tuplesort_performsort(Tuplesortstate *st
*** 1221,1232 ****
  			 * We were able to accumulate all the tuples within the allowed
  			 * amount of memory.  Just qsort 'em and we're done.
  			 */
  			if (state->memtupcount > 1)
! 				qsort_arg((void *) state->memtuples,
! 						  state->memtupcount,
! 						  sizeof(SortTuple),
! 						  (qsort_arg_comparator) state->comparetup,
! 						  (void *) state);
  			state->current = 0;
  			state->eof_reached = false;
  			state->markpos_offset = 0;
--- 1289,1317 ----
  			 * We were able to accumulate all the tuples within the allowed
  			 * amount of memory.  Just qsort 'em and we're done.
  			 */
+ 
  			if (state->memtupcount > 1)
! 			{
! 				/* Use a sorting specialization if available */
! 				if (state->sortKeys && state->sortKeys->qsort_arg_spec)
! 					/* specialization available */
! 					state->sortKeys->qsort_arg_spec(
! 								(void *) state->memtuples,
! 								state->memtupcount,
! 								sizeof(SortTuple),
! 								(void *) state);
! 				else
! 					/*
! 					 * Fall back on regular qsort_arg, with function pointer
! 					 * comparator, making no compile-time assumptions about the
! 					 * number of sortkeys or the datatype(s) to be sorted.
! 					 */
! 					qsort_arg((void *) state->memtuples,
! 								  state->memtupcount,
! 								  sizeof(SortTuple),
! 								  (qsort_arg_comparator) state->comparetup,
! 								  (void *) state);
! 			}
  			state->current = 0;
  			state->eof_reached = false;
  			state->markpos_offset = 0;
*************** inlineApplySortFunction(FmgrInfo *sortFu
*** 2645,2653 ****
--- 2730,2766 ----
  	return compare;
  }
  
+ /*
+  * This is a cut-down duplicate of comparetup_heap that exists for the express
+  * purpose of generating a sorting specialization in which it is inlined, as a
+  * performance optimization. It does not make any assumptions about the
+  * representation of the tuple proper (it could equally well be a heap or index
+  * tuple sort, though only heap tuples are currently supported).
+  *
+  * This is only possible for sorts with a single sortKey. Note that the
+  * comparator proper is not inlined, so this is only a partial specialization.
+  */
+ static inline int
+ comparetup_inline(void *a, void *b, Tuplesortstate *state)
+ {
+ 	/* void* parameters are used to shut up the compiler */
+ 	const SortTuple *aT = (const SortTuple*) a, *bT = (const SortTuple*) b;
+ 	/* Allow interrupting long sorts */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	Assert(state->nKeys == 1);
+ 
+ 	/* Compare the one and only sort key with minimal indirection */
+ 	return ApplySortComparator(aT->datum1, aT->isnull1,
+ 								  bT->datum1, bT->isnull1,
+ 								  state->sortKeys);
+ }
  
  /*
   * Routines specialized for HeapTuple (actually MinimalTuple) case
+  *
+  * This code is partially duplicated within template_qsort_arg.h. Ensure that
+  * they are kept consistent.
   */
  
  static int
*************** comparetup_index_btree(const SortTuple *
*** 2981,2990 ****
  					   Tuplesortstate *state)
  {
  	/*
! 	 * This is similar to _bt_tuplecompare(), but we have already done the
! 	 * index_getattr calls for the first column, and we need to keep track of
! 	 * whether any null fields are present.  Also see the special treatment
! 	 * for equal keys at the end.
  	 */
  	ScanKey		scanKey = state->indexScanKey;
  	IndexTuple	tuple1;
--- 3094,3102 ----
  					   Tuplesortstate *state)
  {
  	/*
! 	 * We have already done the index_getattr calls for the first column, and we
! 	 * need to keep track of whether any null fields are present.  Also see the
! 	 * special treatment for equal keys at the end.
  	 */
  	ScanKey		scanKey = state->indexScanKey;
  	IndexTuple	tuple1;
*************** comparetup_index_btree(const SortTuple *
*** 3046,3063 ****
  	 * they *must* get compared at some stage of the sort --- otherwise the
  	 * sort algorithm wouldn't have checked whether one must appear before the
  	 * other.
- 	 *
- 	 * Some rather brain-dead implementations of qsort will sometimes call the
- 	 * comparison routine to compare a value to itself.  (At this writing only
- 	 * QNX 4 is known to do such silly things; we don't support QNX anymore,
- 	 * but perhaps the behavior still exists elsewhere.)  Don't raise a bogus
- 	 * error in that case.
  	 */
! 	if (state->enforceUnique && !equal_hasnull && tuple1 != tuple2)
  	{
  		Datum		values[INDEX_MAX_KEYS];
  		bool		isnull[INDEX_MAX_KEYS];
  
  		index_deform_tuple(tuple1, tupDes, values, isnull);
  		ereport(ERROR,
  				(errcode(ERRCODE_UNIQUE_VIOLATION),
--- 3158,3178 ----
  	 * they *must* get compared at some stage of the sort --- otherwise the
  	 * sort algorithm wouldn't have checked whether one must appear before the
  	 * other.
  	 */
! 	if (state->enforceUnique && !equal_hasnull)
  	{
  		Datum		values[INDEX_MAX_KEYS];
  		bool		isnull[INDEX_MAX_KEYS];
  
+ 		/*
+ 		 * In the past, it was conceivable that we'd have to protect against
+ 		 * a comparison of a tuple to itself, because we used the system
+ 		 * qsort(), and as such had to assume the worst about the implementation's
+ 		 * bogosity. This is no longer the case, though this assertion serves to
+ 		 * prevent that problem from re-emerging.
+ 		 */
+ 		Assert(tuple1 != tuple2);
+ 
  		index_deform_tuple(tuple1, tupDes, values, isnull);
  		ereport(ERROR,
  				(errcode(ERRCODE_UNIQUE_VIOLATION),
*************** comparetup_index_btree(const SortTuple *
*** 3070,3078 ****
  
  	/*
  	 * If key values are equal, we sort on ItemPointer.  This does not affect
! 	 * validity of the finished index, but it offers cheap insurance against
! 	 * performance problems with bad qsort implementations that have trouble
! 	 * with large numbers of equal keys.
  	 */
  	{
  		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
--- 3185,3192 ----
  
  	/*
  	 * If key values are equal, we sort on ItemPointer.  This does not affect
! 	 * validity of the finished index, but it may be useful to have index scans
! 	 * in physical order.
  	 */
  	{
  		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
*************** comparetup_index_hash(const SortTuple *a
*** 3120,3128 ****
  
  	/*
  	 * If hash values are equal, we sort on ItemPointer.  This does not affect
! 	 * validity of the finished index, but it offers cheap insurance against
! 	 * performance problems with bad qsort implementations that have trouble
! 	 * with large numbers of equal keys.
  	 */
  	tuple1 = (IndexTuple) a->tuple;
  	tuple2 = (IndexTuple) b->tuple;
--- 3234,3241 ----
  
  	/*
  	 * If hash values are equal, we sort on ItemPointer.  This does not affect
! 	 * validity of the finished index, but it may be useful to have index scans
! 	 * in physical order.
  	 */
  	tuple1 = (IndexTuple) a->tuple;
  	tuple2 = (IndexTuple) b->tuple;
diff --git a/src/include/c.h b/src/include/c.h
new file mode 100644
index 7396adb..939f49a
*** a/src/include/c.h
--- b/src/include/c.h
*************** extern int	fdatasync(int fildes);
*** 850,853 ****
--- 850,865 ----
  /* /port compatibility functions */
  #include "port.h"
  
+ /*
+  * Define a cross-platform "always-inline" macro. This is a very sharp tool that
+  * should be used judiciously.
+  */
+ #ifdef __always_inline
+ #define pg_always_inline __always_inline
+ #elif defined(__force_inline)
+ #define pg_always_inline __force_inline
+ #else
+ #define pg_always_inline inline
+ #endif
+ 
  #endif   /* C_H */
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
new file mode 100644
index 46b2f3b..c8c678c
*** a/src/include/utils/builtins.h
--- b/src/include/utils/builtins.h
***************
*** 14,21 ****
--- 14,24 ----
  #ifndef BUILTINS_H
  #define BUILTINS_H
  
+ #include <math.h>
+ 
  #include "fmgr.h"
  #include "nodes/parsenodes.h"
+ #include "utils/sortsupport.h"
  
  /*
   *		Defined in adt/
*************** extern Datum btfloat8sortsupport(PG_FUNC
*** 323,328 ****
--- 326,459 ----
  extern Datum btoidsortsupport(PG_FUNCTION_ARGS);
  extern Datum btnamesortsupport(PG_FUNCTION_ARGS);
  
+ /*
+  *		Expose some "fast" variants of per-opclass comparison functions that
+  *		will be returned by some of the above sort-support functions. Do so to
+  *		avail of optimizations from compile-time knowledge of comparators,
+  *		including but not limited to inlining, both for its innate value in
+  *		improving locality of reference, and its value as an enabling
+  *		transformation.
+  */
+ 
+ #ifdef USE_INLINE
+ 
+ static inline int
+ btint4fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	int32		a = DatumGetInt32(x);
+ 	int32		b = DatumGetInt32(y);
+ 
+ 	if (a > b)
+ 		return 1;
+ 	else if (a == b)
+ 		return 0;
+ 	else
+ 		return -1;
+ }
+ 
+ static inline int
+ btint8fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	int64		a = DatumGetInt64(x);
+ 	int64		b = DatumGetInt64(y);
+ 
+ 	if (a > b)
+ 		return 1;
+ 	else if (a == b)
+ 		return 0;
+ 	else
+ 		return -1;
+ }
+ 
+ static inline int
+ float4_cmp_internal(float4 a, float4 b)
+ {
+ 	/*
+ 	 * We consider all NANs to be equal and larger than any non-NAN. This is
+ 	 * somewhat arbitrary; the important thing is to have a consistent sort
+ 	 * order.
+ 	 */
+ 	if (isnan(a))
+ 	{
+ 		if (isnan(b))
+ 			return 0;			/* NAN = NAN */
+ 		else
+ 			return 1;			/* NAN > non-NAN */
+ 	}
+ 	else if (isnan(b))
+ 	{
+ 		return -1;				/* non-NAN < NAN */
+ 	}
+ 	else
+ 	{
+ 		if (a > b)
+ 			return 1;
+ 		else if (a < b)
+ 			return -1;
+ 		else
+ 			return 0;
+ 	}
+ }
+ 
+ static inline int
+ float8_cmp_internal(float8 a, float8 b)
+ {
+ 	/*
+ 	 * We consider all NANs to be equal and larger than any non-NAN. This is
+ 	 * somewhat arbitrary; the important thing is to have a consistent sort
+ 	 * order.
+ 	 */
+ 	if (isnan(a))
+ 	{
+ 		if (isnan(b))
+ 			return 0;			/* NAN = NAN */
+ 		else
+ 			return 1;			/* NAN > non-NAN */
+ 	}
+ 	else if (isnan(b))
+ 	{
+ 		return -1;				/* non-NAN < NAN */
+ 	}
+ 	else
+ 	{
+ 		if (a > b)
+ 			return 1;
+ 		else if (a < b)
+ 			return -1;
+ 		else
+ 			return 0;
+ 	}
+ }
+ 
+ static inline int
+ btfloat4fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	float4		arg1 = DatumGetFloat4(x);
+ 	float4		arg2 = DatumGetFloat4(y);
+ 
+ 	return float4_cmp_internal(arg1, arg2);
+ }
+ 
+ static inline int
+ btfloat8fastcmp(Datum x, Datum y, SortSupport ssup)
+ {
+ 	float8		arg1 = DatumGetFloat8(x);
+ 	float8		arg2 = DatumGetFloat8(y);
+ 
+ 	return float8_cmp_internal(arg1, arg2);
+ }
+ 
+ #else
+ 
+ extern int btint4fastcmp(Datum x, Datum y, SortSupport ssup);
+ extern int btint8fastcmp(Datum x, Datum y, SortSupport ssup);
+ extern int float4_cmp_internal(float4 a, float4 b);
+ extern int float8_cmp_internal(float8 a, float8 b);
+ extern int btfloat4fastcmp(Datum x, Datum y, SortSupport ssup);
+ extern int btfloat8fastcmp(Datum x, Datum y, SortSupport ssup);
+ 
+ #endif   /* USE_INLINE */
+ 
  /* float.c */
  extern PGDLLIMPORT int extra_float_digits;
  
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h
new file mode 100644
index ef8d853..822ee6e
*** a/src/include/utils/sortsupport.h
--- b/src/include/utils/sortsupport.h
***************
*** 31,36 ****
--- 31,42 ----
   * data can be stored using the ssup_extra field.  Any such data
   * should be allocated in the ssup_cxt memory context.
   *
+  * In addition to providing lower-overhead comparators, this infrastructure also
+  * resolves a generic comparator for the type, if one is available from a static
+  * set, each of which corresponds to one or more types.  These are available
+  * only for a subset of built-in sort support functions.  This enables more
+  * aggressive per-type optimizations at sort time.
+  *
   * Note: since pg_amproc functions are indexed by (lefttype, righttype)
   * it is possible to associate a BTSORTSUPPORT function with a cross-type
   * comparison.  This could sensibly be used to provide a fast comparator
***************
*** 49,54 ****
--- 55,87 ----
  
  #include "access/attnum.h"
  
+ /*
+  * Which comparator representation can be used for this type? It is acceptable
+  * to use an int4 comparator with the date datatype for example, because both
+  * types have the same underlying representation. In particular, their
+  * comparators are interchangeable.
+  *
+  * It is incorrect to do this with types that share a certain bitwise
+  * representation with some scalar type, represented by an enum constant here,
+  * when they do not have interchangeable comparators. For example, sortkeys of
+  * legacy float8 representation of timestamps will not be set to
+  * TYPE_COMP_FLOAT8, because its comparator has special handling of NaN values.
+  *
+  * To an even greater extent than with the optimization whereby a btree index
+  * opclass provides a "fast" comparator to elide SQL function call overhead,
+  * it's not useful to do this with types that have inherently high-overhead
+  * comparisons, of greater than a few instructions; the cost of the comparison
+  * itself is expected to dominate, marginalizing any benefit.
+  */
+ typedef enum TypeCompar
+ {
+ 	TYPE_COMP_INT4,
+ 	TYPE_COMP_INT8,
+ 	TYPE_COMP_FLOAT4,
+ 	TYPE_COMP_FLOAT8,
+ 	TYPE_COMP_OTHER
+ } TypeCompar;
+ 
  typedef struct SortSupportData *SortSupport;
  
  typedef struct SortSupportData
*************** typedef struct SortSupportData
*** 96,103 ****
  	int			(*comparator) (Datum x, Datum y, SortSupport ssup);
  
  	/*
! 	 * Additional sort-acceleration functions might be added here later.
  	 */
  } SortSupportData;
  
  
--- 129,157 ----
  	int			(*comparator) (Datum x, Datum y, SortSupport ssup);
  
  	/*
! 	 * This specialization function pointer is sometimes used as an alternative
! 	 * to the standard qsort_arg, when it has been determined that we can
! 	 * benefit from various per-type and per number of sort key performance
! 	 * optimizations.
! 	 *
! 	 * Often, this will simply entail using a variant that inlines the comparator
! 	 * that was previously used (through a pointer) by the highly generic
! 	 * qsort_arg, such as comparetup_heap, which encapsulate the details of
! 	 * performing a comparison on the class of tuple in question, and are where
! 	 * the call is made to MySortSupport->comparator.
! 	 *
! 	 * In some other cases, particularly with scalar datatypes that are assumed
! 	 * to be sorted far more frequently in practice, the specialization goes so
! 	 * far as to inline the comparator proper from within the tuple-class
! 	 * encapsulating comparator.
! 	 */
! 	void (*qsort_arg_spec)(void *a, size_t n, size_t es, void *arg);
! 
! 	/*
! 	 * Which type-specific comparator proper (which is part of a fully inlined
! 	 * specialization) can be safely used, if any?
  	 */
+ 	TypeCompar usable_compar;
  } SortSupportData;
  
  
*************** extern int	ApplySortComparator(Datum dat
*** 151,156 ****
--- 205,211 ----
  
  /* Other functions in utils/sort/sortsupport.c */
  extern void PrepareSortSupportComparisonShim(Oid cmpFunc, SortSupport ssup);
+ extern TypeCompar ResolveComparatorProper(Oid orderingOp);
  extern void PrepareSortSupportFromOrderingOp(Oid orderingOp, SortSupport ssup);
  
  #endif   /* SORTSUPPORT_H */
diff --git a/src/include/utils/template_qsort_arg.h b/src/include/utils/template_qsort_arg.h
new file mode 100644
index ...6e9ef83
*** a/src/include/utils/template_qsort_arg.h
--- b/src/include/utils/template_qsort_arg.h
***************
*** 0 ****
--- 1,316 ----
+ /*-------------------------------------------------------------------------
+  *	template_qsort_arg.h: "template" version of qsort_arg.c
+  *
+  *	This version of qsort_arg is exclusively used within tuplesort.c to more
+  *	efficiently sort common types such as integers and floats. In providing this
+  *	version, we seek to take advantage of compile-time optimizations, in
+  *	particular, the inlining of comparators. In some cases the "tuple class"
+  *	encapsulating comparator (that is, the particular comparator used directly
+  *	here) is just inlined, when a full specialization is unavailable. In other
+  *	cases, the comparator proper is also inlined, for full integration of the
+  *	sort routine. There are variant specializations for cases with only a single
+  *	sortkey, and cases with multiple sortkeys.
+  *
+  *	The TEMPLATE_QSORT_ARG() macro generates an inlining variant (for sorts with
+  *	a single sortKey) and non-inlining variant for sorts with multiple sortKeys.
+  *
+  *	We rely on various function declarations, and indeed partially duplicate
+  *	some code from tuplesort.c, so this file should be considered private to
+  *	that module, rather than a generic piece of infrastructure.
+  *
+  *	CAUTION: if you change this file, see also qsort_arg.c as well as qsort.c.
+  *	qsort_arg.c should be considered authoritative.
+  *
+  *	Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+  *	Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *	src/include/utils/template_qsort_arg.h
+  *-------------------------------------------------------------------------
+  */
+ #include "c.h"
+ 
+ #define swapcode(TYPE, parmi, parmj, n)		\
+ do {										\
+ 	size_t i = (n) / sizeof (TYPE);			\
+ 	TYPE *pi = (TYPE *)(void *)(parmi);		\
+ 	TYPE *pj = (TYPE *)(void *)(parmj);		\
+ 	do {									\
+ 		TYPE	t = *pi;					\
+ 		*pi++ = *pj;						\
+ 		*pj++ = t;							\
+ 		} while (--i > 0);					\
+ } while (0)
+ 
+ #define SWAPINIT(a, es) swaptype = ((char *)(a) - (char *)0) % sizeof(long) || \
+ 	(es) % sizeof(long) ? 2 : (es) == sizeof(long)? 0 : 1;
+ 
+ #define thistype_vecswap(a, b, n)								\
+ 	if ((n) > 0) inl_swapfunc((a), (b), (size_t)(n), swaptype)
+ 
+ #define thistype_swap(a, b)								\
+ if (swaptype == 0) {									\
+ 		long t = *(long *)(void *)(a);					\
+ 		*(long *)(void *)(a) = *(long *)(void *)(b);	\
+ 		*(long *)(void *)(b) = t;						\
+ 	} else												\
+ 		inl_swapfunc(a, b, es, swaptype)
+ 
+ inline static void
+ inl_swapfunc(char *a, char *b, size_t n, int swaptype)
+ {
+ 	if (swaptype <= 1)
+ 		swapcode(long, a, b, n);
+ 	else
+ 		swapcode(char, a, b, n);
+ }
+ 
+ /*
+  * This macro manufactures a type-specific implementation of qsort_arg with
+  * the comparator, COMPAR, known at compile time. COMPAR is typically an
+  * inline function.
+  *
+  * COMPAR should take as its arguments two Datums, and return an int, in
+  * line with standard qsort convention.
+  *
+  * We have void* parameters for TYPE##comparetup_inline just to shut up the compiler.
+  * They could be SortTuple pointers instead, but that would make it more
+  * difficult to keep template_qsort_arg.h consistent with tuplesort.c.
+  */
+ 
+ #define DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, SPEC_VAR, ADD_CODE, OUT_COMPAR)			\
+ void TYPE##_##SPEC_VAR##_qsort_arg(void *a, size_t n, size_t es, void *arg);		\
+ 																					\
+ /* NULL-aware wrapper: applies ssup_nulls_first/ssup_reverse around COMPAR. */		\
+ inline static int32																	\
+ TYPE##SPEC_VAR##AppFunc(Datum datum1, bool isnull1, Datum datum2, bool isnull2,		\
+ 							SortSupport	sortKey)									\
+ {																					\
+ 	int32		compare;															\
+ 	if (isnull1)																	\
+ 	{																				\
+ 		if (isnull2)																\
+ 			compare = 0;		/* NULL "=" NULL */									\
+ 		else if (sortKey->ssup_nulls_first)											\
+ 			compare = -1;		/* NULL "<" NOT_NULL */								\
+ 		else																		\
+ 			compare = 1;		/* NULL ">" NOT_NULL */								\
+ 	}																				\
+ 	else if (isnull2)																\
+ 	{																				\
+ 		if (sortKey->ssup_nulls_first)												\
+ 			compare = 1;		/* NOT_NULL ">" NULL */								\
+ 		else																		\
+ 			compare = -1;		/* NOT_NULL "<" NULL */								\
+ 	}																				\
+ 	else																			\
+ 	{																				\
+ 		compare = COMPAR(datum1, datum2);											\
+ 																					\
+ 		if (sortKey->ssup_reverse)													\
+ 			compare = -compare;														\
+ 	}																				\
+ 	return compare;																	\
+ }																					\
+ 																					\
+ /*																					\
+  * Leading-key tuple comparator.  Note that this is heavily based on				\
+  * tuplesort.c's comparetup_heap; the two should be kept consistent.				\
+  */																					\
+ pg_always_inline static int															\
+ TYPE##SPEC_VAR##comparetup_inline(const void *a, const void *b,						\
+ 		Tuplesortstate *state)														\
+ {																					\
+ 	const SortTuple* aT = a;														\
+ 	const SortTuple* bT = b;														\
+ 	int32		compare;															\
+ 	SortSupport	sortKey = state->sortKeys;											\
+ 																					\
+ 	/* Allow interrupting long sorts */												\
+ 	CHECK_FOR_INTERRUPTS();															\
+ 	compare = TYPE##SPEC_VAR##AppFunc(aT->datum1, aT->isnull1, bT->datum1,			\
+ 										bT->isnull1, sortKey);						\
+ 	if (compare != 0)																\
+ 		return compare;																\
+ 	/* Additional code for variants with more than one sortkey */					\
+ 	ADD_CODE																		\
+ 	return 0;																		\
+ }																					\
+ 																					\
+ /* Median-of-three via the specialized comparator, for pivot selection. */	\
+ inline static char *											\
+ TYPE##SPEC_VAR##med3(char *a, char *b, char *c, void *arg)		\
+ {																\
+ 	return OUT_COMPAR(a, b, arg) < 0 ?							\
+ 		(OUT_COMPAR(b, c, arg) < 0 ?							\
+ 					b : (OUT_COMPAR(a, c, arg) < 0 ? c : a))	\
+ 		: (OUT_COMPAR(b, c, arg) > 0 ?							\
+ 					b : (OUT_COMPAR(a, c, arg) < 0 ? a : c));	\
+ }																\
+ 																\
+ /* Type-specialized qsort_arg; mirrors src/port/qsort_arg.c. */				\
+ void																		\
+ TYPE##_##SPEC_VAR##_qsort_arg(void *a, size_t n, size_t es, void *arg)		\
+ {																			\
+ 	char	   *pa,															\
+ 			   *pb,															\
+ 			   *pc,															\
+ 			   *pd,															\
+ 			   *pl,															\
+ 			   *pm,															\
+ 			   *pn;															\
+ 	int			d,															\
+ 				r,															\
+ 				swaptype,													\
+ 				presorted;													\
+ 																			\
+ loop:SWAPINIT(a, es);														\
+ 	/* Insertion sort on very small subarrays */							\
+ 	if (n < 7)																\
+ 	{																		\
+ 		for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)		\
+ 			for (pl = pm; pl > (char *) a &&								\
+ 					OUT_COMPAR(pl - es, pl, arg) > 0;						\
+ 				 pl -= es)													\
+ 				thistype_swap(pl, pl - es);									\
+ 		return;																\
+ 	}																		\
+ 	/* One linear scan: quit early if input is already sorted */			\
+ 	presorted = 1;															\
+ 	for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)			\
+ 	{																		\
+ 		if (OUT_COMPAR(pm - es, pm, arg) > 0)								\
+ 		{																	\
+ 			presorted = 0;													\
+ 			break;															\
+ 		}																	\
+ 	}																		\
+ 	if (presorted)															\
+ 		return;																\
+ 	/* Pivot: middle element, med-3, or pseudo-median of 9 as n grows */	\
+ 	pm = (char *) a + (n / 2) * es;											\
+ 	if (n > 7)																\
+ 	{																		\
+ 		pl = (char *) a;													\
+ 		pn = (char *) a + (n - 1) * es;										\
+ 		if (n > 40)															\
+ 		{																	\
+ 			d = (n / 8) * es;												\
+ 			pl = TYPE##SPEC_VAR##med3(pl, pl + d, pl + 2 * d, arg);			\
+ 			pm = TYPE##SPEC_VAR##med3(pm - d, pm, pm + d, arg);				\
+ 			pn = TYPE##SPEC_VAR##med3(pn - 2 * d, pn - d, pn, arg);			\
+ 		}																	\
+ 		pm = TYPE##SPEC_VAR##med3(pl, pm, pn, arg);							\
+ 	}																		\
+ 	thistype_swap(a, pm);													\
+ 	/* Three-way partition: equal keys collect at both ends */				\
+ 	pa = pb = (char *) a + es;												\
+ 	pc = pd = (char *) a + (n - 1) * es;									\
+ 	for (;;)																\
+ 	{																		\
+ 		while (pb <= pc &&													\
+ 					(r = OUT_COMPAR(pb, a, arg)) <= 0)						\
+ 		{																	\
+ 			if (r == 0)														\
+ 			{																\
+ 				thistype_swap(pa, pb);										\
+ 				pa += es;													\
+ 			}																\
+ 			pb += es;														\
+ 		}																	\
+ 		while (pb <= pc &&													\
+ 				(r = OUT_COMPAR(pc, a, arg)) >= 0)							\
+ 		{																	\
+ 			if (r == 0)														\
+ 			{																\
+ 				thistype_swap(pc, pd);										\
+ 				pd -= es;													\
+ 			}																\
+ 			pc -= es;														\
+ 		}																	\
+ 		if (pb > pc)														\
+ 			break;															\
+ 		thistype_swap(pb, pc);												\
+ 		pb += es;															\
+ 		pc -= es;															\
+ 	}																		\
+ 	/* Move the equal-to-pivot runs back into the middle */					\
+ 	pn = (char *) a + n * es;												\
+ 	r = Min(pa - (char *) a, pb - pa);										\
+ 	thistype_vecswap(a, pb - r, r);											\
+ 	r = Min(pd - pc, pn - pd - es);											\
+ 	thistype_vecswap(pb, pn - r, r);										\
+ 	if ((r = pb - pa) > es)													\
+ 		TYPE##_##SPEC_VAR##_qsort_arg(a, r / es, es, arg);					\
+ 	if ((r = pd - pc) > es)													\
+ 	{																		\
+ 		/* Iterate rather than recurse to save stack space */				\
+ 		a = pn - r;															\
+ 		n = r / es;															\
+ 		goto loop;															\
+ 	}																		\
+ }
+ 
+ /*
+  * This code becomes part of the comparator meta-function for the "reg"
+  * specialization variant of each datatype-specific specialization.
+  *
+  * Note that this is heavily based on tuplesort_comparetup_heap; the two should
+  * be kept consistent.
+  *
+  * We can handle multiple sortKeys, but the function generally will not be
+  * inlined directly when sorting. We'll try to use compile-time knowledge of
+  * the comparator for later sortKeys, on the basis that that has been determined
+  * to be a win frequently enough to justify the overhead.
+  *
+  * Modern compilers are not inclined to pay too much attention to the inline
+  * keyword, and indeed inline at the callsite granularity, rather than the
+  * function granularity. It should not be assumed that the second call to the
+  * "inline" comparator here will result in a second copy of the comparator.
+  */
+ 
+ #define MULT_ADDITIONAL_CODE(COMPAR)										\
+ {																			\
+ 	/* Relies on aT, bT, compare, sortKey and state being in scope in		\
+ 	 * the comparetup_inline body this macro is expanded into. */			\
+ 	HeapTupleData ltup;														\
+ 	HeapTupleData rtup;														\
+ 	TupleDesc	tupDesc;													\
+ 	int			nkey;														\
+ 																			\
+ 	/* Compare additional sort keys */										\
+ 	ltup.t_len =															\
+ 		((MinimalTuple) aT->tuple)->t_len + MINIMAL_TUPLE_OFFSET;			\
+ 	ltup.t_data =															\
+ 		(HeapTupleHeader) ((char *) aT->tuple - MINIMAL_TUPLE_OFFSET);		\
+ 	rtup.t_len =															\
+ 		((MinimalTuple) bT->tuple)->t_len + MINIMAL_TUPLE_OFFSET;			\
+ 	rtup.t_data =															\
+ 		(HeapTupleHeader) ((char *) bT->tuple - MINIMAL_TUPLE_OFFSET);		\
+ 	tupDesc = state->tupDesc;												\
+ 	sortKey++;																\
+ 	for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)					\
+ 	{																		\
+ 		AttrNumber	attno = sortKey->ssup_attno;							\
+ 		Datum		datum1,													\
+ 					datum2;													\
+ 		bool		isnull1,												\
+ 					isnull2;												\
+ 																			\
+ 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);				\
+ 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);				\
+ 																			\
+ 		/* Use the inlined comparator when this key's comparator matches	\
+ 		 * the leading key's; otherwise take the generic path. */			\
+ 		if (sortKey->usable_compar == state->sortKeys->usable_compar)		\
+ 			compare = COMPAR(datum1, isnull1,								\
+ 								datum2, isnull2,							\
+ 								sortKey);									\
+ 		else																\
+ 			compare = ApplySortComparator(datum1, isnull1,					\
+ 										  datum2, isnull2,					\
+ 										  sortKey);							\
+ 		if (compare != 0)													\
+ 			return compare;													\
+ 	}																		\
+ }
+ 
+ /* Single-key variant: no further keys to compare, just sanity-check nKeys */
+ #define SING_ADDITIONAL_CODE Assert(state->nKeys == 1);
+ 
+ /*
+  * Manufacture inlining variant for nKeys=1 case, and non-inlining variant
+  * for nKeys > 1 case.  Each invocation emits TYPE_inlheap_qsort_arg and
+  * TYPE_regheap_qsort_arg, both specialized over the compile-time COMPAR.
+  */
+ #define TEMPLATE_QSORT_ARG_HEAP(TYPE, COMPAR)								\
+ DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, inlheap,								\
+ 		SING_ADDITIONAL_CODE, TYPE##inlheapcomparetup_inline)				\
+ DO_TEMPLATE_QSORT_ARG(TYPE, COMPAR, regheap,								\
+ 		MULT_ADDITIONAL_CODE(TYPE##regheapAppFunc),							\
+ 			TYPE##regheapcomparetup_inline)
diff --git a/src/port/qsort.c b/src/port/qsort.c
new file mode 100644
index 8e2c6d9..d1981d6
*** a/src/port/qsort.c
--- b/src/port/qsort.c
***************
*** 7,13 ****
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort_arg.c
   *
   *	src/port/qsort.c
   */
--- 7,14 ----
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort_arg.c and
!  *	template_qsort_arg.h
   *
   *	src/port/qsort.c
   */
diff --git a/src/port/qsort_arg.c b/src/port/qsort_arg.c
new file mode 100644
index 28d1894..0ab6198
*** a/src/port/qsort_arg.c
--- b/src/port/qsort_arg.c
***************
*** 7,13 ****
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort.c
   *
   *	src/port/qsort_arg.c
   */
--- 7,13 ----
   *	  Remove ill-considered "swap_cnt" switch to insertion sort,
   *	  in favor of a simple check for presorted input.
   *
!  *	CAUTION: if you change this file, see also qsort.c and template_qsort_arg.h
   *
   *	src/port/qsort_arg.c
   */
