From acb63cddd8c8220db97ae0b012bf4f2fb5174e8a Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 17:07:49 +0700
Subject: [PATCH v4 5/5] Improve CRC32C performance on x86_64

The current SSE4.2 implementation of CRC32C relies on the native
CRC32 instruction, which operates on 8 bytes at a time. We can get a
substantial speedup on longer inputs by using carryless multiplication
on SIMD registers, processing 64 bytes per loop iteration.

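The PCLMULQDQ instruction has been widely available since 2011 (almost
as old as SSE 4.2), so this commit now requires that, as well as SSE
4.2, to build pg_crc32c_sse42.c. Likewise, the runtime check now selects
pg_comp_crc32c_sse42 only if the CPU reports both SSE 4.2 and PCLMULQDQ;
otherwise it falls back to pg_comp_crc32c_sb8.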

The MIT-licensed implementation was generated with the "generate"
program from

https://github.com/corsix/fast-crc32/

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009

Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Author: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
---
 config/c-compiler.m4              | 7 ++++++-
 configure                         | 7 ++++++-
 meson.build                       | 7 +++++--
 src/port/pg_crc32c_sse42.c        | 4 ++++
 src/port/pg_crc32c_sse42_choose.c | 9 ++++++---
 5 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 8534cc54c1..8b255b5cc8 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -557,14 +557,19 @@ AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
 [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
 AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar],
 [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+    #include <wmmintrin.h>
     #if defined(__has_attribute) && __has_attribute (target)
-    __attribute__((target("sse4.2")))
+    __attribute__((target("sse4.2,pclmul")))
     #endif
     static int crc32_sse42_test(void)
+
     {
+      __m128i x1 = _mm_set1_epi32(1);
       unsigned int crc = 0;
       crc = _mm_crc32_u8(crc, 0);
       crc = _mm_crc32_u32(crc, 0);
+      x1 = _mm_clmulepi64_si128(x1, x1, 0x00);
+      crc = crc + _mm_extract_epi32(x1, 1);
       /* return computed value, to prevent the above being optimized away */
       return crc == 0;
     }],
diff --git a/configure b/configure
index 0ffcaeb436..3f2a2a515e 100755
--- a/configure
+++ b/configure
@@ -17059,14 +17059,19 @@ else
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <nmmintrin.h>
+    #include <wmmintrin.h>
     #if defined(__has_attribute) && __has_attribute (target)
-    __attribute__((target("sse4.2")))
+    __attribute__((target("sse4.2,pclmul")))
     #endif
     static int crc32_sse42_test(void)
+
     {
+      __m128i x1 = _mm_set1_epi32(1);
       unsigned int crc = 0;
       crc = _mm_crc32_u8(crc, 0);
       crc = _mm_crc32_u32(crc, 0);
+      x1 = _mm_clmulepi64_si128(x1, x1, 0x00);
+      crc = crc + _mm_extract_epi32(x1, 1);
       /* return computed value, to prevent the above being optimized away */
       return crc == 0;
     }
diff --git a/meson.build b/meson.build
index 1ceadb9a83..456c3fafc3 100644
--- a/meson.build
+++ b/meson.build
@@ -2227,15 +2227,18 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
 
     prog = '''
 #include <nmmintrin.h>
-
+#include <wmmintrin.h>
 #if defined(__has_attribute) && __has_attribute (target)
-__attribute__((target("sse4.2")))
+__attribute__((target("sse4.2,pclmul")))
 #endif
 int main(void)
 {
+    __m128i x1 = _mm_set1_epi32(1);
     unsigned int crc = 0;
     crc = _mm_crc32_u8(crc, 0);
     crc = _mm_crc32_u32(crc, 0);
+    x1 = _mm_clmulepi64_si128(x1, x1, 0x00); /* pclmul */
+    crc = crc + _mm_extract_epi32(x1, 1);
     /* return computed value, to prevent the above being optimized away */
     return crc == 0;
 }
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7250eccf6b..05b11b47cb 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -3,6 +3,10 @@
  * pg_crc32c_sse42.c
  *	  Compute CRC-32C checksum using Intel SSE 4.2 instructions.
  *
+ *	  For longer inputs, we use carryless multiplication on SIMD registers,
+ *	  based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
+ *	  Instruction" V. Gopal, E. Ozturk, et al., 2009
+ *
  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 65dbc4d424..95cfe63493 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -31,7 +31,7 @@
 #include "port/pg_crc32c.h"
 
 static bool
-pg_crc32c_sse42_available(void)
+pg_crc32c_sse42_pclmul_available(void)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
@@ -43,7 +43,10 @@ pg_crc32c_sse42_available(void)
 #error cpuid instruction not available
 #endif
 
-	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+	/* the SSE 4.2 implementation also uses PCLMULQDQ, so require both */
+	if ((exx[2] & (1 << 20)) == 0)	/* SSE 4.2 */
+		return false;
+	return (exx[2] & (1 << 1)) != 0;	/* PCLMULQDQ */
 }
 
 /*
@@ -53,7 +56,7 @@ pg_crc32c_sse42_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
-	if (pg_crc32c_sse42_available())
+	if (pg_crc32c_sse42_pclmul_available())
 		pg_comp_crc32c = pg_comp_crc32c_sse42;
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
-- 
2.48.1

