From 221a0be949feeea8357fb1e2272325a6b3d97edd Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Sun, 12 Jul 2015 13:14:01 -0700
Subject: [PATCH 4/5] Prefetch from memtuples array in tuplesort

This patch is almost the same as a canonical version, which appears
here:  https://commitfest.postgresql.org/6/305/

This version adds some additional tricks specific to new cases for
external sorts.  Of particular interest here is the prefetching of each
"tuple proper" during writing of batches of tuples.  This is not
intended to be reviewed as part of the external sorting work, and is
provided only as a convenience to reviewers who would like to see where
prefetching can help with external sorts, too.

Original canonical version details:

Testing shows that prefetching the "tuple proper" of a slightly later
SortTuple in the memtuples array during each of many sequential,
in-logical-order SortTuple fetches speeds up various sorting intense
operations considerably.  For example, B-Tree index builds are
accelerated as leaf pages are created from the memtuples array.
(i.e.  The operation following actually "performing" the sort, but
before a tuplesort_end() call is made as a B-Tree spool is destroyed.)

Similarly, ordered set aggregates (all cases except the datumsort case
with a pass-by-value type), and regular heap tuplesorts benefit to about
the same degree.  The optimization is only used when sorts fit in
memory, though.

Also, prefetch a few places ahead within the analogous "fetching" point
in tuplestore.c.  This appears to offer similar benefits in certain
cases.  For example, queries involving large common table expressions
significantly benefit.
---
 config/c-compiler.m4                | 17 +++++++++++++++++
 configure                           | 31 ++++++++++++++++++++++++++++++
 configure.in                        |  1 +
 src/backend/utils/sort/tuplesort.c  | 38 +++++++++++++++++++++++++++++++++++++
 src/backend/utils/sort/tuplestore.c | 13 +++++++++++++
 src/include/c.h                     | 14 ++++++++++++++
 src/include/pg_config.h.in          |  3 +++
 src/include/pg_config.h.win32       |  3 +++
 src/include/pg_config_manual.h      | 10 ++++++++++
 9 files changed, 130 insertions(+)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 9feec0c..3516314 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -253,6 +253,23 @@ fi])# PGAC_C_BUILTIN_UNREACHABLE
 
 
 
+# PGAC_C_BUILTIN_PREFETCH
+# -------------------------
+# Check if the C compiler understands __builtin_prefetch(),
+# and define HAVE__BUILTIN_PREFETCH if so.
+AC_DEFUN([PGAC_C_BUILTIN_PREFETCH],
+[AC_CACHE_CHECK(for __builtin_prefetch, pgac_cv__builtin_prefetch,
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+[int i = 0;__builtin_prefetch(&i, 0, 3);])],
+[pgac_cv__builtin_prefetch=yes],
+[pgac_cv__builtin_prefetch=no])])
+if test x"$pgac_cv__builtin_prefetch" = xyes ; then
+AC_DEFINE(HAVE__BUILTIN_PREFETCH, 1,
+          [Define to 1 if your compiler understands __builtin_prefetch.])
+fi])# PGAC_C_BUILTIN_PREFETCH
+
+
+
 # PGAC_C_VA_ARGS
 # --------------
 # Check if the C compiler understands C99-style variadic macros,
diff --git a/configure b/configure
index 0bed81c..88a7a6d 100755
--- a/configure
+++ b/configure
@@ -11315,6 +11315,37 @@ if test x"$pgac_cv__builtin_unreachable" = xyes ; then
 $as_echo "#define HAVE__BUILTIN_UNREACHABLE 1" >>confdefs.h
 
 fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_prefetch" >&5
+$as_echo_n "checking for __builtin_prefetch... " >&6; }
+if ${pgac_cv__builtin_prefetch+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+int i = 0;__builtin_prefetch(&i, 0, 3);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv__builtin_prefetch=yes
+else
+  pgac_cv__builtin_prefetch=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_prefetch" >&5
+$as_echo "$pgac_cv__builtin_prefetch" >&6; }
+if test x"$pgac_cv__builtin_prefetch" = xyes ; then
+
+$as_echo "#define HAVE__BUILTIN_PREFETCH 1" >>confdefs.h
+
+fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __VA_ARGS__" >&5
 $as_echo_n "checking for __VA_ARGS__... " >&6; }
 if ${pgac_cv__va_args+:} false; then :
diff --git a/configure.in b/configure.in
index a28f9dd..778dd61 100644
--- a/configure.in
+++ b/configure.in
@@ -1319,6 +1319,7 @@ PGAC_C_TYPES_COMPATIBLE
 PGAC_C_BUILTIN_BSWAP32
 PGAC_C_BUILTIN_CONSTANT_P
 PGAC_C_BUILTIN_UNREACHABLE
+PGAC_C_BUILTIN_PREFETCH
 PGAC_C_VA_ARGS
 PGAC_STRUCT_TIMEZONE
 PGAC_UNION_SEMUN
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index abbef6c..d828723 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -1797,6 +1797,27 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				if (state->current < state->memtupcount)
 				{
 					*stup = state->memtuples[state->current++];
+
+					/*
+					 * Perform memory prefetch of "tuple proper" of the
+					 * SortTuple that's three places ahead of current
+					 * (which is returned to caller).  Testing shows that
+					 * this significantly boosts the performance for
+					 * TSS_SORTEDINMEM "forward" callers by hiding memory
+					 * latency behind their processing of returned tuples.
+					 *
+					 * Don't do this for pass-by-value datum sorts;  even
+					 * though hinting a NULL address does not affect
+					 * correctness, it would have a noticeable overhead
+					 * here.
+					 */
+#ifdef USE_MEM_PREFETCH
+					if (stup->tuple != NULL &&
+						state->current + 2 < state->memtupcount)
+						pg_read_prefetch(
+							state->memtuples[state->current + 2].tuple);
+#endif
+
 					return true;
 				}
 				state->eof_reached = true;
@@ -2025,6 +2046,18 @@ just_memtuples:
 			if (state->current < state->memtupcount)
 			{
 				*stup = state->memtuples[state->current++];
+
+				/*
+				 * Once this point is reached, rationale for memory
+				 * prefetching is identical to TSS_SORTEDINMEM case.
+				 */
+#ifdef USE_MEM_PREFETCH
+				if (stup->tuple != NULL &&
+					state->current + 2 < state->memtupcount)
+					pg_read_prefetch(
+						state->memtuples[state->current + 2].tuple);
+#endif
+
 				return true;
 			}
 			state->eof_reached = true;
@@ -3162,6 +3195,11 @@ dumpbatch(Tuplesortstate *state, bool alltuples)
 		WRITETUP(state, state->tp_tapenum[state->destTape],
 				 &state->memtuples[i]);
 		state->memtupcount--;
+
+#ifdef USE_MEM_PREFETCH
+		if (state->memtuples[i].tuple != NULL && i + 2 < memtupwrite)
+			pg_read_prefetch(state->memtuples[i + 2].tuple);
+#endif
 	}
 	markrunend(state, state->tp_tapenum[state->destTape]);
 	state->tp_runs[state->destTape]++;
diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c
index 51f474d..e9cb599 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -902,6 +902,19 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
 					return NULL;
 				if (readptr->current < state->memtupcount)
 				{
+					/*
+					 * Perform memory prefetch of tuple that's three places
+					 * ahead of current (which is returned to caller).
+					 * Testing shows that this significantly boosts the
+					 * performance for TSS_INMEM "forward" callers by
+					 * hiding memory latency behind their processing of
+					 * returned tuples.
+					 */
+#ifdef USE_MEM_PREFETCH
+					if (readptr->current + 3 < state->memtupcount)
+						pg_read_prefetch(state->memtuples[readptr->current + 3]);
+#endif
+
 					/* We have another tuple, so return it */
 					return state->memtuples[readptr->current++];
 				}
diff --git a/src/include/c.h b/src/include/c.h
index b719eb9..6dd6d44 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -932,6 +932,20 @@ typedef NameData *Name;
 #define pg_unreachable() abort()
 #endif
 
+/*
+ * Prefetch support -- Support memory prefetching hints on some platforms.
+ *
+ * pg_read_prefetch() is specialized for the case where an array is accessed
+ * sequentially, and we can prefetch a pointer within the next element (or an
+ * even later element) in order to hide memory latency.  This case involves
+ * prefetching addresses with low temporal locality.  Note that it's rather
+ * difficult to get any kind of speedup with this;  any use of the intrinsic
+ * should be carefully tested.  It's okay to pass it an invalid or NULL
+ * address, although it's best avoided.
+ */
+#if defined(USE_MEM_PREFETCH)
+#define pg_read_prefetch(addr)		__builtin_prefetch((addr), 0, 0)
+#endif
 
 /* ----------------------------------------------------------------
  *				Section 8:	random stuff
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 8873dcc..d9eda4b 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -675,6 +675,9 @@
 /* Define to 1 if your compiler understands __builtin_constant_p. */
 #undef HAVE__BUILTIN_CONSTANT_P
 
+/* Define to 1 if your compiler understands __builtin_prefetch. */
+#undef HAVE__BUILTIN_PREFETCH
+
 /* Define to 1 if your compiler understands __builtin_types_compatible_p. */
 #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P
 
diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32
index ad61392..a2f6eb3 100644
--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@@ -523,6 +523,9 @@
 /* Define to 1 if your compiler understands __builtin_constant_p. */
 /* #undef HAVE__BUILTIN_CONSTANT_P */
 
+/* Define to 1 if your compiler understands __builtin_prefetch. */
+#undef HAVE__BUILTIN_PREFETCH
+
 /* Define to 1 if your compiler understands __builtin_types_compatible_p. */
 /* #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P */
 
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index e278fa0..4c7b1d5 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -153,6 +153,16 @@
 #endif
 
 /*
+ * USE_MEM_PREFETCH controls whether Postgres will attempt to use memory
+ * prefetching.  Usually the automatic configure tests are sufficient, but
+ * it's conceivable that using prefetching is counter-productive on some
+ * platforms.  If necessary you can remove the #define here.
+ */
+#ifdef HAVE__BUILTIN_PREFETCH
+#define USE_MEM_PREFETCH
+#endif
+
+/*
  * USE_SSL code should be compiled only when compiling with an SSL
  * implementation.  (Currently, only OpenSSL is supported, but we might add
  * more implementations in the future.)
-- 
1.9.1

