From 5bb63d463c17b68b022d5375b3e3a4ac643e086c Mon Sep 17 00:00:00 2001
From: Abhijit Menon-Sen <ams@2ndQuadrant.com>
Date: Tue, 30 Dec 2014 12:55:53 +0530
Subject: Use the SSE4.2 CRC instructions where available

We execute cpuid at startup to determine whether the processor supports
SSE4.2 and, if it does, use the crc32* instructions instead of the
default slice-by-8 code (both produce identical results).

Reviewers: Andres Freund, Heikki Linnakangas

Author: Abhijit Menon-Sen
---
 configure                     |  2 +-
 configure.in                  |  2 +-
 src/backend/main/main.c       |  7 ++++
 src/include/pg_config.h.in    |  3 ++
 src/include/pg_config.h.win32 |  3 ++
 src/include/utils/pg_crc.h    |  3 +-
 src/port/pg_crc.c             | 97 ++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/configure b/configure
index 27092ec..f76cf4b 100755
--- a/configure
+++ b/configure
@@ -9195,7 +9195,7 @@ fi
 done
 
 
-for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.in b/configure.in
index 0206836..6005edc 100644
--- a/configure.in
+++ b/configure.in
@@ -1023,7 +1023,7 @@ AC_SUBST(UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
diff --git a/src/backend/main/main.c b/src/backend/main/main.c
index 582198f..153a97d 100644
--- a/src/backend/main/main.c
+++ b/src/backend/main/main.c
@@ -37,6 +37,7 @@
 #include "utils/memutils.h"
 #include "utils/pg_locale.h"
 #include "utils/ps_status.h"
+#include "utils/pg_crc.h"
 
 
 const char *progname;
@@ -77,6 +78,12 @@ main(int argc, char *argv[])
 	argv = save_ps_display_args(argc, argv);
 
 	/*
+	 * Select the fastest available CRC32 implementation for the
+	 * platform.
+	 */
+	pg_choose_crc_impl();
+
+	/*
 	 * If supported on the current platform, set up a handler to be called if
 	 * the backend/postmaster crashes with a fatal signal or exception.
 	 */
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 7962757..e12c4c9 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -99,6 +99,9 @@
 /* Define to 1 if you have the `class' function. */
 #undef HAVE_CLASS
 
+/* Define to 1 if you have the <cpuid.h> header file. */
+#undef HAVE_CPUID_H
+
 /* Define to 1 if you have the <crtdefs.h> header file. */
 #undef HAVE_CRTDEFS_H
 
diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32
index 18da922..e2a5e21 100644
--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@@ -78,6 +78,9 @@
 /* Define to 1 if you have the `class' function. */
 /* #undef HAVE_CLASS */
 
+/* Define to 1 if you have the <cpuid.h> header file. */
+/* #undef HAVE_CPUID_H */
+
 /* Define to 1 if you have the `crypt' function. */
 /* #undef HAVE_CRYPT */
 
diff --git a/src/include/utils/pg_crc.h b/src/include/utils/pg_crc.h
index 4d35601..e5d9668 100644
--- a/src/include/utils/pg_crc.h
+++ b/src/include/utils/pg_crc.h
@@ -42,7 +42,8 @@
 typedef uint32 pg_crc32;
 
 extern uint32 bswap32(uint32 x);
-extern pg_crc32 pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len);
+extern void pg_choose_crc_impl(void);
+extern pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len);
 
 /*
  * CRC calculation using the CRC-32C (Castagnoli) polynomial.
diff --git a/src/port/pg_crc.c b/src/port/pg_crc.c
index b1bdd1e..e57945b 100644
--- a/src/port/pg_crc.c
+++ b/src/port/pg_crc.c
@@ -20,6 +20,13 @@
 #include "utils/pg_crc.h"
 #include "utils/pg_crc_tables.h"
 
+#if defined(HAVE_CPUID_H)
+#include <cpuid.h>
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#include <nmmintrin.h>
+#endif
+
 #ifndef WORDS_BIGENDIAN
 #define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8)
 #else
@@ -51,8 +58,8 @@ bswap32(uint32 x)
  * pp. 1550-1560, November 2008, doi:10.1109/TC.2008.85
  */
 
-pg_crc32
-pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
+static pg_crc32
+pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const uint32 *p8;
@@ -120,3 +127,89 @@ pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
 
 	return crc;
 }
+
+static inline pg_crc32
+pg_asm_crc32b(pg_crc32 crc, unsigned char data)
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+	__asm__ ("crc32b %[data], %[crc]\n" : [crc] "+r" (crc) : [data] "rm" (data));
+	return crc;
+#elif defined(_MSC_VER)
+	return _mm_crc32_u8(crc, data);
+#else
+	/* Can't generate crc32b in this build: return a placeholder 0 (not a CRC) to keep the compiler quiet. */
+	return 0;
+#endif
+}
+
+static inline pg_crc32
+pg_asm_crc32q(uint64 crc, unsigned long long data)
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+	__asm__ ("crc32q %[data], %[crc]\n" : [crc] "+r" (crc) : [data] "rm" (data));
+	return crc;
+#elif defined(_MSC_VER) && defined(_M_X64)
+	return _mm_crc32_u64(crc, data);
+#elif defined(_MSC_VER)		/* 32-bit: no _mm_crc32_u64, so fold the low half, then the high half */
+	return _mm_crc32_u32(_mm_crc32_u32((unsigned int) crc, (unsigned int) data), (unsigned int) (data >> 32));
+#endif
+	return 0;	/* only reached when no CRC32 instruction could be generated above */
+}
+
+/*
+ * pg_comp_crc32c_sse
+ *		Update crc over len bytes at data using pg_asm_crc32q for each whole
+ *		8-byte group and pg_asm_crc32b for the leftover bytes.
+ */
+
+static pg_crc32
+pg_comp_crc32c_sse(pg_crc32 crc, const void *data, size_t len)
+{
+	const uint64 *words = (const uint64 *) data;
+	const unsigned char *tail;
+	size_t		nwords = len / 8;
+	size_t		ntail = len % 8;
+
+	/*
+	 * Bulk phase: feed the input to CRC32Q one 8-byte word at a time.  The
+	 * words are read straight through a uint64 pointer cast from data.
+	 */
+	for (; nwords > 0; nwords--)
+	{
+		crc = pg_asm_crc32q(crc, *words++);
+	}
+
+	/*
+	 * Tail phase: whatever is left (fewer than eight bytes) goes through
+	 * CRC32B a byte at a time, starting where the bulk phase stopped.
+	 */
+	tail = (const unsigned char *) words;
+	for (; ntail > 0; ntail--)
+	{
+		crc = pg_asm_crc32b(crc, *tail++);
+	}
+
+	return crc;
+}
+
+/*
+ * Use the SSE4.2 CRC instructions only when CPUID leaf 1 sets ECX bit 20 and
+ * this build can emit them (the pg_asm_crc32* fallback branches return 0);
+ * otherwise leave pg_comp_crc32c at its slice-by-8 default.
+ */
+void
+pg_choose_crc_impl(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(__GNUC__) && defined(__x86_64__) && defined(HAVE_CPUID_H)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(_MSC_VER)
+	__cpuid((int *) exx, 1);
+#endif
+
+	if (exx[2] & (1 << 20))
+		pg_comp_crc32c = pg_comp_crc32c_sse;
+}
+
+pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len) = pg_comp_crc32c_sb8;
-- 
1.9.1

