From 309d122c770c967445bc9a0f5527ab6794bd5ba3 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:16 +0700
Subject: [PATCH v7 3/3] Improve CRC32C performance on x86_64

The current SSE4.2 implementation of CRC32C relies on the native
CRC32 instruction, which operates on 8 bytes at a time. We can get a
substantial speedup on longer inputs by using carryless multiplication
on SIMD registers, processing 64 bytes per loop iteration.

The PCLMULQDQ instruction has been widely available since 2011 (almost
as old as SSE 4.2), so this commit now requires that, as well as SSE
4.2, to build pg_crc32c_sse42.c.

The MIT-licensed implementation was generated with the "generate"
program from

https://github.com/corsix/fast-crc32/

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009

Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Author: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
---
 src/include/port/pg_cpu.h             |   3 +-
 src/include/port/pg_crc32c.h          |   1 +
 src/port/pg_cpu.c                     |   3 +
 src/port/pg_crc32c_sse42.c            | 101 ++++++++++++++++++++++++++
 src/port/pg_crc32c_sse42_choose.c     |  16 ++++
 src/test/regress/expected/strings.out |  24 ++++++
 src/test/regress/sql/strings.sql      |   4 +
 7 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index 45ce9d3c505..445f0031eb2 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -16,8 +16,9 @@
 
 #define PGCPUCAP_INIT           (1 << 0)
 #define PGCPUCAP_CRC32C         (1 << 1)
+#define PGCPUCAP_CLMUL          (1 << 2)
 
-extern uint32 pg_cpucap;
+extern PGDLLIMPORT uint32 pg_cpucap;
 extern void pg_cpucap_initialize(void);
 
 #endif							/* PG_CPU_H */
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index db155d690e3..068a6536059 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,7 @@ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len)
 #endif
 
 extern bool pg_crc32c_sse42_available(void);
+extern bool pg_crc32c_pclmul_available(void);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
 #elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
diff --git a/src/port/pg_cpu.c b/src/port/pg_cpu.c
index c9483357436..52944b2d4ed 100644
--- a/src/port/pg_cpu.c
+++ b/src/port/pg_cpu.c
@@ -31,6 +31,9 @@ pg_cpucap_crc32c(void)
 	if (pg_crc32c_sse42_available())
 		pg_cpucap |= PGCPUCAP_CRC32C;
 
+	if (pg_crc32c_pclmul_available())
+		pg_cpucap |= PGCPUCAP_CLMUL;
+
 #elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 	if (pg_crc32c_armv8_available())
 		pg_cpucap |= PGCPUCAP_CRC32C;
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df31..ce2acf1eec1 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,9 +15,94 @@
 #include "c.h"
 
 #include <nmmintrin.h>
+#include <wmmintrin.h>
 
 #include "port/pg_crc32c.h"
 
+/* WIP: configure checks */
+#ifdef __x86_64__
+#define HAVE_PCLMUL_RUNTIME
+#endif
+
+ /*
+  * WIP: Testing has shown that on Kaby Lake (2016) this algorithm needs two
+  * iterations of the main loop to be faster than using regular CRC
+  * instrutions, but Tiger Lake (2020) is fine with a single iteration. Could
+  * use more testing between those years (on AMD as well).
+  */
+#define PCLMUL_THRESHOLD 128
+
+#ifdef HAVE_PCLMUL_RUNTIME
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i sse -p crc32c -a v4 */
+/* MIT licensed */
+
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+
+pg_attribute_target("sse4.2,pclmul")
+static pg_crc32c
+pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	size_t		len = length;
+	const unsigned char *buf = data;
+
+	if (len >= 64)
+	{
+		/* First vector chunk. */
+		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+					y0;
+		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
+					y1;
+		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
+					y2;
+		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
+					y3;
+		__m128i		k;
+
+		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		buf += 64;
+		len -= 64;
+
+		/* Main loop. */
+		while (len >= 64)
+		{
+			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
+			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
+			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
+			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			buf += 64;
+			len -= 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+	}
+
+	return crc0;
+}
+
+#endif
+
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
@@ -26,6 +111,19 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
 
+	/* XXX not for commit */
+	const pg_crc32c orig_crc PG_USED_FOR_ASSERTS_ONLY = crc;
+	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+
+#ifdef HAVE_PCLMUL_RUNTIME
+	if (len >= PCLMUL_THRESHOLD && (pg_cpucap & PGCPUCAP_CLMUL))
+	{
+		crc = pg_comp_crc32c_pclmul(crc, data, len);
+		len %= 64;
+		p = pend - len;
+	}
+#endif
+
 	/*
 	 * Process eight bytes of data at a time.
 	 *
@@ -66,5 +164,8 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 		p++;
 	}
 
+	/* XXX not for commit */
+	Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, orig_len));
+
 	return crc;
 }
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index f4d3215bc55..59a5a31c006 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -40,3 +40,19 @@ pg_crc32c_sse42_available(void)
 
 	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
 }
+
+bool
+pg_crc32c_pclmul_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (exx[2] & (1 << 1)) != 0;	/* PCLMUL */
+}
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index b65bb2d5368..662bd37ace6 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2282,6 +2282,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.');
  419469235
 (1 row)
 
+SELECT crc32c(repeat('A', 80)::bytea);
+   crc32c   
+------------
+ 3799127650
+(1 row)
+
+SELECT crc32c(repeat('A', 127)::bytea);
+  crc32c   
+-----------
+ 291820082
+(1 row)
+
+SELECT crc32c(repeat('A', 128)::bytea);
+  crc32c   
+-----------
+ 816091258
+(1 row)
+
+SELECT crc32c(repeat('A', 129)::bytea);
+   crc32c   
+------------
+ 4213642571
+(1 row)
+
 --
 -- encode/decode
 --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 8e0f3a0e75f..26f86dc92e0 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -727,6 +727,10 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.');
 
 SELECT crc32c('');
 SELECT crc32c('The quick brown fox jumps over the lazy dog.');
+SELECT crc32c(repeat('A', 80)::bytea);
+SELECT crc32c(repeat('A', 127)::bytea);
+SELECT crc32c(repeat('A', 128)::bytea);
+SELECT crc32c(repeat('A', 129)::bytea);
 
 --
 -- encode/decode
-- 
2.48.1

