From 8bf013c5d1bf52eab2c877a443324d3e46d18052 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Tue, 4 Feb 2025 15:20:13 -0800
Subject: [PATCH v3 3/4] Improve CRC32C performance on SSE4.2

The current SSE4.2 implementation of CRC32C relies on the native crc32
instruction and processes 8 bytes at a time in a loop. The technique in
this paper uses the pclmulqdq instruction and processing 64 bytes at
time.

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
V. Gopal, E. Ozturk, et al., 2009

The algorithm is based on crc32_sse42_simd from chromimum's copy of zlib. See:
from https://chromium.googlesource.com/chromium/src/+/refs/heads/main/third_party/zlib/crc32_simd.c

Microbenchmarks generated from standalone benchmarks:
see https://github.com/r-devulap/crc32c

Comparing scalar_crc32c (master) to sse42_crc32c (this branch):
|----------------------------------+------------------+---------------+---------------|
| Benchmark                        | buf size (bytes) | Time Old (ns) | Time New (ns) |
|----------------------------------+------------------+---------------+---------------|
| [scalar_crc32c vs. sse42_crc32c] | 64               | 5             | 4             |
| [scalar_crc32c vs. sse42_crc32c] | 128              | 8             | 6             |
| [scalar_crc32c vs. sse42_crc32c] | 256              | 19            | 10            |
| [scalar_crc32c vs. sse42_crc32c] | 512              | 50            | 18            |
| [scalar_crc32c vs. sse42_crc32c] | 1024             | 121           | 34            |
| [scalar_crc32c vs. sse42_crc32c] | 2048             | 259           | 70            |
|----------------------------------+------------------+---------------+---------------|
---
 config/c-compiler.m4              |   7 +-
 configure                         |   7 +-
 meson.build                       |   7 +-
 src/port/pg_crc32c_sse42.c        | 127 +++++++++++++++++++++++++++++-
 src/port/pg_crc32c_sse42_choose.c |   8 +-
 5 files changed, 146 insertions(+), 10 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 8534cc54c1..8b255b5cc8 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -557,14 +557,19 @@ AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
 [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
 AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar],
 [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+    #include <wmmintrin.h>
     #if defined(__has_attribute) && __has_attribute (target)
-    __attribute__((target("sse4.2")))
+    __attribute__((target("sse4.2,pclmul")))
     #endif
     static int crc32_sse42_test(void)
+
     {
+      __m128i x1 = _mm_set1_epi32(1);
       unsigned int crc = 0;
       crc = _mm_crc32_u8(crc, 0);
       crc = _mm_crc32_u32(crc, 0);
+      x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul
+      crc = crc + _mm_extract_epi32(x1, 1);
       /* return computed value, to prevent the above being optimized away */
       return crc == 0;
     }],
diff --git a/configure b/configure
index 0ffcaeb436..3f2a2a515e 100755
--- a/configure
+++ b/configure
@@ -17059,14 +17059,19 @@ else
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <nmmintrin.h>
+    #include <wmmintrin.h>
     #if defined(__has_attribute) && __has_attribute (target)
-    __attribute__((target("sse4.2")))
+    __attribute__((target("sse4.2,pclmul")))
     #endif
     static int crc32_sse42_test(void)
+
     {
+      __m128i x1 = _mm_set1_epi32(1);
       unsigned int crc = 0;
       crc = _mm_crc32_u8(crc, 0);
       crc = _mm_crc32_u32(crc, 0);
+      x1 = _mm_clmulepi64_si128(x1, x1, 0x00);
+      crc = crc + _mm_extract_epi32(x1, 1);
       /* return computed value, to prevent the above being optimized away */
       return crc == 0;
     }
diff --git a/meson.build b/meson.build
index 1ceadb9a83..456c3fafc3 100644
--- a/meson.build
+++ b/meson.build
@@ -2227,15 +2227,18 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
 
     prog = '''
 #include <nmmintrin.h>
-
+#include <wmmintrin.h>
 #if defined(__has_attribute) && __has_attribute (target)
-__attribute__((target("sse4.2")))
+__attribute__((target("sse4.2,pclmul")))
 #endif
 int main(void)
 {
+    __m128i x1 = _mm_set1_epi32(1);
     unsigned int crc = 0;
     crc = _mm_crc32_u8(crc, 0);
     crc = _mm_crc32_u32(crc, 0);
+    x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul
+    crc = crc + _mm_extract_epi32(x1, 1);
     /* return computed value, to prevent the above being optimized away */
     return crc == 0;
 }
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df3..69f8917c7d 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,13 +15,13 @@
 #include "c.h"
 
 #include <nmmintrin.h>
-
+#include <wmmintrin.h>
 #include "port/pg_crc32c.h"
 
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
-pg_crc32c
-pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
+static pg_crc32c
+pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
@@ -68,3 +68,124 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+/*
+ * Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
+ * Instruction" V. Gopal, E. Ozturk, et al., 2009
+ *
+ * The algorithm is based on crc32_sse42_simd from chromimum's copy of zlib.
+ * See:
+ * https://chromium.googlesource.com/chromium/src/+/refs/heads/main/third_party/zlib/crc32_simd.c
+ */
+
+pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2,pclmul")
+pg_crc32c
+pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length)
+{
+    ssize_t len = (ssize_t) length;
+    const unsigned char *buf = data;
+    /*
+     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+     * the CRC32+Barrett polynomials given at the end of the paper.
+     */
+    static const uint64_t pg_attribute_aligned(16) k1k2[] = { 0x740eef02, 0x9e4addf8 };
+    static const uint64_t pg_attribute_aligned(16) k3k4[] = { 0xf20c0dfe, 0x14cd00bd6 };
+    static const uint64_t pg_attribute_aligned(16) k5k0[] = { 0xdd45aab8, 0x000000000 };
+    static const uint64_t pg_attribute_aligned(16) poly[] = { 0x105ec76f1, 0xdea713f1 };
+    if (len >= 64) {
+        __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+        /*
+         * There's at least one block of 64.
+         */
+        x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+        x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+        x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+        x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+        x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+        x0 = _mm_load_si128((__m128i *)k1k2);
+        buf += 64;
+        len -= 64;
+        /*
+         * Parallel fold blocks of 64, if any.
+         */
+        while (len >= 64)
+        {
+            x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+            x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+            x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+            x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+            x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+            x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+            x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+            x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+            y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+            y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+            y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+            y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+            x1 = _mm_xor_si128(x1, x5);
+            x2 = _mm_xor_si128(x2, x6);
+            x3 = _mm_xor_si128(x3, x7);
+            x4 = _mm_xor_si128(x4, x8);
+            x1 = _mm_xor_si128(x1, y5);
+            x2 = _mm_xor_si128(x2, y6);
+            x3 = _mm_xor_si128(x3, y7);
+            x4 = _mm_xor_si128(x4, y8);
+            buf += 64;
+            len -= 64;
+        }
+        /*
+         * Fold into 128-bits.
+         */
+        x0 = _mm_load_si128((__m128i *)k3k4);
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x1 = _mm_xor_si128(x1, x2);
+        x1 = _mm_xor_si128(x1, x5);
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x1 = _mm_xor_si128(x1, x3);
+        x1 = _mm_xor_si128(x1, x5);
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x1 = _mm_xor_si128(x1, x4);
+        x1 = _mm_xor_si128(x1, x5);
+        /*
+         * Single fold blocks of 16, if any.
+         */
+        while (len >= 16)
+        {
+            x2 = _mm_loadu_si128((__m128i *)buf);
+            x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+            x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+            x1 = _mm_xor_si128(x1, x2);
+            x1 = _mm_xor_si128(x1, x5);
+            buf += 16;
+            len -= 16;
+        }
+        /*
+         * Fold 128-bits to 64-bits.
+         */
+        x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+        x3 = _mm_setr_epi32(~0, 0, ~0, 0);
+        x1 = _mm_srli_si128(x1, 8);
+        x1 = _mm_xor_si128(x1, x2);
+        x0 = _mm_loadl_epi64((__m128i*)k5k0);
+        x2 = _mm_srli_si128(x1, 4);
+        x1 = _mm_and_si128(x1, x3);
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x1 = _mm_xor_si128(x1, x2);
+        /*
+         * Barret reduce to 32-bits.
+         */
+        x0 = _mm_load_si128((__m128i*)poly);
+        x2 = _mm_and_si128(x1, x3);
+        x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+        x2 = _mm_and_si128(x2, x3);
+        x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+        x1 = _mm_xor_si128(x1, x2);
+        crc = _mm_extract_epi32(x1, 1);
+    }
+
+    return pg_comp_crc32c_sse42_tail(crc, buf, len);
+}
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 65dbc4d424..57e4ad5dfd 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -31,7 +31,7 @@
 #include "port/pg_crc32c.h"
 
 static bool
-pg_crc32c_sse42_available(void)
+pg_crc32c_sse42_pclmul_available(void)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
@@ -43,7 +43,9 @@ pg_crc32c_sse42_available(void)
 #error cpuid instruction not available
 #endif
 
-	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+    bool sse42 = (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+    bool pclmul = (exx[2] & (1 << 1)) != 0;	/* PCLMULQDQ */
+    return sse42 && pclmul;
 }
 
 /*
@@ -53,7 +55,7 @@ pg_crc32c_sse42_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
-	if (pg_crc32c_sse42_available())
+	if (pg_crc32c_sse42_pclmul_available())
 		pg_comp_crc32c = pg_comp_crc32c_sse42;
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
-- 
2.43.0