From 57952d1f89f0c3a4a2d28399344e9335f8bee72b Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:16 +0700
Subject: [PATCH v4 2/5] Vendor SSE implementation from
 https://github.com/corsix/fast-crc32/

---
 src/port/pg_crc32c_sse42.c | 77 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df3..6cc39de175 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -68,3 +68,80 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i sse -p crc32c -a v4 */
+/* MIT licensed */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <nmmintrin.h>
+#include <wmmintrin.h>
+
+#if defined(_MSC_VER) /* MSVC spellings of the inline/alignment helpers */
+#define CRC_AINLINE static __forceinline
+#define CRC_ALIGN(n) __declspec(align(n))
+#else /* everything else: GCC-style __attribute__ spellings */
+#define CRC_AINLINE static __inline __attribute__((always_inline))
+#define CRC_ALIGN(n) __attribute__((aligned(n)))
+#endif /* CRC_AINLINE and CRC_ALIGN are not referenced in the code visible in this patch */
+#define CRC_EXPORT extern /* linkage keyword placed in front of crc32_impl */
+
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) /* carry-less multiply: low 64 bits of a by low 64 bits of b */
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) /* 17 == 0x11: high 64 bits of a by high 64 bits of b */
+
+CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) { /* Extends checksum crc0 over buf[0..len) and returns the new value (generated with "-p crc32c"). */
+  crc0 = ~crc0; /* work on the inverted value; it is inverted back at the return */
+  for (; len && ((uintptr_t)buf & 7); --len) { /* single bytes until buf is 8-byte aligned or the input ends */
+    crc0 = _mm_crc32_u8(crc0, *buf++);
+  }
+  if (((uintptr_t)buf & 8) && len >= 8) { /* one 8-byte step when that brings buf to a 16-byte boundary */
+    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); /* buf is 8-byte aligned here; NOTE(review): char data is read through a uint64_t pointer */
+    buf += 8;
+    len -= 8;
+  }
+  if (len >= 64) { /* the vector path needs at least one full 64-byte chunk */
+    /* First vector chunk. */
+    __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0; /* x0..x3: four 16-byte accumulators; y0..y3: scratch */
+    __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
+    __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
+    __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
+    __m128i k;
+    k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); /* main-loop fold constants: low half feeds clmul_lo, high half feeds clmul_hi */
+    x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); /* mix the running CRC into the low 32 bits of the first 16 bytes */
+    buf += 64;
+    len -= 64;
+    /* Main loop. */
+    while (len >= 64) { /* each iteration folds the next 64 input bytes into x0..x3 */
+      y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); /* per accumulator: y = lo(x)*lo(k), x = hi(x)*hi(k) */
+      y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+      y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+      y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+      y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0); /* per accumulator: x = x ^ y ^ next 16 input bytes */
+      y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
+      y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
+      y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
+      buf += 64;
+      len -= 64;
+    }
+    /* Reduce x0 ... x3 to just x0. */
+    k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); /* constants for folding x0 onto x1 and x2 onto x3 */
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+    y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); /* x0 = folded x0 ^ x1 */
+    y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); /* x2 = folded x2 ^ x3 */
+    k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); /* constants for the last fold of x0 onto x2 */
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); /* x0 = folded x0 ^ x2 */
+    /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+    crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); /* starts from 0 because crc0 was already mixed into x0 above */
+    crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); /* then the high 64 bits of x0 */
+  }
+  for (; len >= 8; buf += 8, len -= 8) { /* remaining whole 8-byte words; len < 64 here, so at most 7 */
+    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
+  }
+  for (; len; --len) { /* trailing bytes, one at a time */
+    crc0 = _mm_crc32_u8(crc0, *buf++);
+  }
+  return ~crc0; /* undo the inversion done on entry */
+}
-- 
2.48.1

