From d3ee691a067fb1f41d3e8e4377df1a67962cf5c7 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:16 +0700
Subject: [PATCH v6 3/3] Improve CRC32C performance on x86_64

The current SSE4.2 implementation of CRC32C relies on the native
CRC32 instruction, which operates on 8 bytes at a time. We can get a
substantial speedup on longer inputs by using carryless multiplication
on SIMD registers, processing 64 bytes per loop iteration.

The PCLMULQDQ instruction has been widely available since 2011 (almost
as old as SSE 4.2), so this commit now requires that, as well as SSE
4.2, to build pg_crc32c_sse42.c.

The MIT-licensed implementation was generated with the "generate"
program from

https://github.com/corsix/fast-crc32/

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009

Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Author: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
---
 src/include/port/pg_cpu.h             |  1 +
 src/include/port/pg_crc32c.h          |  1 +
 src/port/pg_cpu.c                     |  3 +
 src/port/pg_crc32c_sse42.c            | 95 +++++++++++++++++++++++++++
 src/port/pg_crc32c_sse42_choose.c     | 16 +++++
 src/test/regress/expected/strings.out | 24 +++++++
 src/test/regress/sql/strings.sql      |  4 ++
 7 files changed, 144 insertions(+)

diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index 45ce9d3c50..0d8137ebb3 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -16,6 +16,7 @@
 
 #define PGCPUCAP_INIT           (1 << 0)
 #define PGCPUCAP_CRC32C         (1 << 1)
+#define PGCPUCAP_CLMUL          (1 << 2)
 
 extern uint32 pg_cpucap;
 extern void pg_cpucap_initialize(void);
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index db155d690e..068a653605 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,7 @@ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len)
 #endif
 
 extern bool pg_crc32c_sse42_available(void);
+extern bool pg_crc32c_pclmul_available(void);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
 #elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
diff --git a/src/port/pg_cpu.c b/src/port/pg_cpu.c
index c948335743..52944b2d4e 100644
--- a/src/port/pg_cpu.c
+++ b/src/port/pg_cpu.c
@@ -31,6 +31,9 @@ pg_cpucap_crc32c(void)
 	if (pg_crc32c_sse42_available())
 		pg_cpucap |= PGCPUCAP_CRC32C;
 
+	if (pg_crc32c_pclmul_available())
+		pg_cpucap |= PGCPUCAP_CLMUL;
+
 #elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 	if (pg_crc32c_armv8_available())
 		pg_cpucap |= PGCPUCAP_CRC32C;
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df3..66ddb7ec87 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,9 +15,19 @@
 #include "c.h"
 
 #include <nmmintrin.h>
+#include <wmmintrin.h>
 
 #include "port/pg_crc32c.h"
 
+static pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length);
+
+/* WIP: configure checks */
+#ifdef __x86_64__
+#define HAVE_PCLMUL_RUNTIME
+#endif
+
+#define PCLMUL_THRESHOLD 128
+
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
@@ -25,6 +35,17 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
+	const pg_crc32c orig_crc = crc; /* XXX not for commit */
+	const size_t orig_len = len;
+
+#ifdef HAVE_PCLMUL_RUNTIME
+	if (len >= PCLMUL_THRESHOLD && (pg_cpucap & PGCPUCAP_CLMUL))
+	{
+		crc = pg_comp_crc32c_pclmul(crc, data, len);
+		len %= 64;
+		p = pend - len;
+	}
+#endif
 
 	/*
 	 * Process eight bytes of data at a time.
@@ -66,5 +87,79 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 		p++;
 	}
 
+	 /* XXX not for commit */
+	Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, orig_len));
+
 	return crc;
 }
+
+#ifdef HAVE_PCLMUL_RUNTIME
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i sse -p crc32c -a v4 */
+/* MIT licensed */
+
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+
+pg_attribute_target("sse4.2,pclmul")
+static pg_crc32c
+pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	size_t		len = length;
+	const unsigned char *buf = data;
+
+	if (len >= 64)
+	{
+		/* First vector chunk. */
+		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+					y0;
+		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
+					y1;
+		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
+					y2;
+		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
+					y3;
+		__m128i		k;
+
+		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		buf += 64;
+		len -= 64;
+
+		/* Main loop. */
+		while (len >= 64)
+		{
+			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
+			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
+			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
+			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			buf += 64;
+			len -= 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+	}
+
+	return crc0;
+}
+
+#endif
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index f4d3215bc5..59a5a31c00 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -40,3 +40,19 @@ pg_crc32c_sse42_available(void)
 
 	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
 }
+
+bool
+pg_crc32c_pclmul_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (exx[2] & (1 << 1)) != 0;	/* PCLMUL */
+}
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index b65bb2d536..662bd37ace 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2282,6 +2282,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.');
  419469235
 (1 row)
 
+SELECT crc32c(repeat('A', 80)::bytea);
+   crc32c   
+------------
+ 3799127650
+(1 row)
+
+SELECT crc32c(repeat('A', 127)::bytea);
+  crc32c   
+-----------
+ 291820082
+(1 row)
+
+SELECT crc32c(repeat('A', 128)::bytea);
+  crc32c   
+-----------
+ 816091258
+(1 row)
+
+SELECT crc32c(repeat('A', 129)::bytea);
+   crc32c   
+------------
+ 4213642571
+(1 row)
+
 --
 -- encode/decode
 --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 8e0f3a0e75..26f86dc92e 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -727,6 +727,10 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.');
 
 SELECT crc32c('');
 SELECT crc32c('The quick brown fox jumps over the lazy dog.');
+SELECT crc32c(repeat('A', 80)::bytea);
+SELECT crc32c(repeat('A', 127)::bytea);
+SELECT crc32c(repeat('A', 128)::bytea);
+SELECT crc32c(repeat('A', 129)::bytea);
 
 --
 -- encode/decode
-- 
2.48.1

