From ed1cf0675470c3c198ff140f71f3a40adcd4d02a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Sun, 12 Jul 2015 13:14:01 -0700
Subject: [PATCH] Prefetch from memtuples array in tuplesort

Testing shows that prefetching the "tuple proper" of a slightly later
SortTuple in the memtuples array during each of many sequential,
in-logical-order SortTuple fetches speeds up various sorting intense
operations considerably.  For example, B-Tree index builds are
accelerated as leaf pages are created from the memtuples array.
(i.e.  The operation following actually "performing" the sort, but
before a tuplesort_end() call is made as a B-Tree spool is destroyed.)

Similarly, ordered set aggregates (all cases except the datumsort case
with a pass-by-value type), and regular heap tuplesorts benefit to about
the same degree.  The optimization is only used when sorts fit in
memory, though.

Also, prefetch a few places ahead within the analogous "fetching" point
in tuplestore.c.  This appears to offer similar benefits in certain
cases.  For example, queries involving large common table expressions
significantly benefit.
---
 config/c-compiler.m4                | 17 +++++++++++++++++
 configure                           | 31 +++++++++++++++++++++++++++++++
 configure.in                        |  1 +
 src/backend/utils/sort/tuplesort.c  | 21 +++++++++++++++++++++
 src/backend/utils/sort/tuplestore.c | 13 +++++++++++++
 src/include/c.h                     | 14 ++++++++++++++
 src/include/pg_config.h.in          |  3 +++
 src/include/pg_config.h.win32       |  3 +++
 src/include/pg_config_manual.h      | 10 ++++++++++
 9 files changed, 113 insertions(+)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 550d034..8be2122 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -271,6 +271,23 @@ fi])# PGAC_C_BUILTIN_UNREACHABLE
 
 
 
+# PGAC_C_BUILTIN_PREFETCH
+# -------------------------
+# Check if the C compiler understands __builtin_prefetch(),
+# and define HAVE__BUILTIN_PREFETCH if so.
+AC_DEFUN([PGAC_C_BUILTIN_PREFETCH],
+[AC_CACHE_CHECK(for __builtin_prefetch, pgac_cv__builtin_prefetch,
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+[int i = 0;__builtin_prefetch(&i, 0, 3);])],
+[pgac_cv__builtin_prefetch=yes],
+[pgac_cv__builtin_prefetch=no])])
+if test x"$pgac_cv__builtin_prefetch" = xyes ; then
+AC_DEFINE(HAVE__BUILTIN_PREFETCH, 1,
+          [Define to 1 if your compiler understands __builtin_prefetch.])
+fi])# PGAC_C_BUILTIN_PREFETCH
+
+
+
 # PGAC_C_VA_ARGS
 # --------------
 # Check if the C compiler understands C99-style variadic macros,
diff --git a/configure b/configure
index b771a83..420eb2c 100755
--- a/configure
+++ b/configure
@@ -11338,6 +11338,37 @@ if test x"$pgac_cv__builtin_unreachable" = xyes ; then
 $as_echo "#define HAVE__BUILTIN_UNREACHABLE 1" >>confdefs.h
 
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_prefetch" >&5
+$as_echo_n "checking for __builtin_prefetch... " >&6; }
+if ${pgac_cv__builtin_prefetch+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+int i = 0;__builtin_prefetch(&i, 0, 3);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv__builtin_prefetch=yes
+else
+  pgac_cv__builtin_prefetch=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_prefetch" >&5
+$as_echo "$pgac_cv__builtin_prefetch" >&6; }
+if test x"$pgac_cv__builtin_prefetch" = xyes ; then
+
+$as_echo "#define HAVE__BUILTIN_PREFETCH 1" >>confdefs.h
+
+fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __VA_ARGS__" >&5
 $as_echo_n "checking for __VA_ARGS__... " >&6; }
 if ${pgac_cv__va_args+:} false; then :
diff --git a/configure.in b/configure.in
index b5868b0..0e25b89 100644
--- a/configure.in
+++ b/configure.in
@@ -1320,6 +1320,7 @@ PGAC_C_BUILTIN_BSWAP32
 PGAC_C_BUILTIN_BSWAP64
 PGAC_C_BUILTIN_CONSTANT_P
 PGAC_C_BUILTIN_UNREACHABLE
+PGAC_C_BUILTIN_PREFETCH
 PGAC_C_VA_ARGS
 PGAC_STRUCT_TIMEZONE
 PGAC_UNION_SEMUN
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index d532e87..7ed0d1f 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -1670,6 +1670,27 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				if (state->current < state->memtupcount)
 				{
 					*stup = state->memtuples[state->current++];
+
+					/*
+					 * Perform memory prefetch of "tuple proper" of the
+					 * SortTuple that's three places ahead of current
+					 * (which is returned to caller).  Testing shows that
+					 * this significantly boosts the performance for
+					 * TSS_SORTEDINMEM "forward" callers by hiding memory
+					 * latency behind their processing of returned tuples.
+					 *
+					 * Don't do this for pass-by-value datum sorts;  even
+					 * though hinting a NULL address does not affect
+					 * correctness, it would have a noticeable overhead
+					 * here.
+					 */
+#ifdef USE_MEM_PREFETCH
+					if (stup->tuple != NULL &&
+						state->current + 2 < state->memtupcount)
+						pg_read_prefetch(
+							state->memtuples[state->current + 2].tuple);
+#endif
+
 					return true;
 				}
 				state->eof_reached = true;
diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c
index 51f474d..e9cb599 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -902,6 +902,19 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
 					return NULL;
 				if (readptr->current < state->memtupcount)
 				{
+					/*
+					 * Perform memory prefetch of tuple that's three places
+					 * ahead of current (which is returned to caller).
+					 * Testing shows that this significantly boosts the
+					 * performance for TSS_INMEM "forward" callers by
+					 * hiding memory latency behind their processing of
+					 * returned tuples.
+					 */
+#ifdef USE_MEM_PREFETCH
+					if (readptr->current + 3 < state->memtupcount)
+						pg_read_prefetch(state->memtuples[readptr->current + 3]);
+#endif
+
 					/* We have another tuple, so return it */
 					return state->memtuples[readptr->current++];
 				}
diff --git a/src/include/c.h b/src/include/c.h
index 8163b00..3d05ff8 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -927,6 +927,20 @@ typedef NameData *Name;
 #define pg_unreachable() abort()
 #endif
 
+/*
+ * Prefetch support -- Support memory prefetching hints on some platforms.
+ *
+ * pg_read_prefetch() is specialized for the case where an array is accessed
+ * sequentially, and we can prefetch a pointer within the next element (or an
+ * even later element) in order to hide memory latency.  This case involves
+ * prefetching addresses with low temporal locality.  Note that it's rather
+ * difficult to get any kind of speedup with this;  any use of the intrinsic
+ * should be carefully tested.  It's okay to pass it an invalid or NULL
+ * address, although it's best avoided.
+ */
+#if defined(USE_MEM_PREFETCH)
+#define pg_read_prefetch(addr)		__builtin_prefetch((addr), 0, 0)
+#endif
 
 /* ----------------------------------------------------------------
  *				Section 8:	random stuff
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index a20e0cd..ad1d17a 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -666,6 +666,9 @@
 /* Define to 1 if your compiler understands __builtin_constant_p. */
 #undef HAVE__BUILTIN_CONSTANT_P
 
+/* Define to 1 if your compiler understands __builtin_prefetch. */
+#undef HAVE__BUILTIN_PREFETCH
+
 /* Define to 1 if your compiler understands __builtin_types_compatible_p. */
 #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P
 
diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32
index 8566065..990c3c3 100644
--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@@ -514,6 +514,9 @@
 /* Define to 1 if your compiler understands __builtin_constant_p. */
 /* #undef HAVE__BUILTIN_CONSTANT_P */
 
+/* Define to 1 if your compiler understands __builtin_prefetch. */
+#undef HAVE__BUILTIN_PREFETCH
+
 /* Define to 1 if your compiler understands __builtin_types_compatible_p. */
 /* #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P */
 
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index e278fa0..4c7b1d5 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -153,6 +153,16 @@
 #endif
 
 /*
+ * USE_MEM_PREFETCH controls whether Postgres will attempt to use memory
+ * prefetching.  Usually the automatic configure tests are sufficient, but
+ * it's conceivable that using prefetching is counter-productive on some
+ * platforms.  If necessary you can remove the #define here.
+ */
+#ifdef HAVE__BUILTIN_PREFETCH
+#define USE_MEM_PREFETCH
+#endif
+
+/*
  * USE_SSL code should be compiled only when compiling with an SSL
  * implementation.  (Currently, only OpenSSL is supported, but we might add
  * more implementations in the future.)
-- 
1.9.1

