From 8bf013c5d1bf52eab2c877a443324d3e46d18052 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 4 Feb 2025 15:20:13 -0800 Subject: [PATCH v3 3/4] Improve CRC32C performance on SSE4.2 The current SSE4.2 implementation of CRC32C relies on the native crc32 instruction and processes 8 bytes at a time in a loop. The technique in this paper uses the pclmulqdq instruction and processing 64 bytes at time. Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 The algorithm is based on crc32_sse42_simd from chromimum's copy of zlib. See: from https://chromium.googlesource.com/chromium/src/+/refs/heads/main/third_party/zlib/crc32_simd.c Microbenchmarks generated from standalone benchmarks: see https://github.com/r-devulap/crc32c Comparing scalar_crc32c (master) to sse42_crc32c (this branch): |----------------------------------+------------------+---------------+---------------| | Benchmark | buf size (bytes) | Time Old (ns) | Time New (ns) | |----------------------------------+------------------+---------------+---------------| | [scalar_crc32c vs. sse42_crc32c] | 64 | 5 | 4 | | [scalar_crc32c vs. sse42_crc32c] | 128 | 8 | 6 | | [scalar_crc32c vs. sse42_crc32c] | 256 | 19 | 10 | | [scalar_crc32c vs. sse42_crc32c] | 512 | 50 | 18 | | [scalar_crc32c vs. sse42_crc32c] | 1024 | 121 | 34 | | [scalar_crc32c vs. sse42_crc32c] | 2048 | 259 | 70 | |----------------------------------+------------------+---------------+---------------| --- config/c-compiler.m4 | 7 +- configure | 7 +- meson.build | 7 +- src/port/pg_crc32c_sse42.c | 127 +++++++++++++++++++++++++++++- src/port/pg_crc32c_sse42_choose.c | 8 +- 5 files changed, 146 insertions(+), 10 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c1..8b255b5cc8 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -557,14 +557,19 @@ AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS], [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + #include #if defined(__has_attribute) && __has_attribute (target) - __attribute__((target("sse4.2"))) + __attribute__((target("sse4.2,pclmul"))) #endif static int crc32_sse42_test(void) + { + __m128i x1 = _mm_set1_epi32(1); unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul + crc = crc + _mm_extract_epi32(x1, 1); /* return computed value, to prevent the above being optimized away */ return crc == 0; }], diff --git a/configure b/configure index 0ffcaeb436..3f2a2a515e 100755 --- a/configure +++ b/configure @@ -17059,14 +17059,19 @@ else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include + #include #if defined(__has_attribute) && __has_attribute (target) - __attribute__((target("sse4.2"))) + __attribute__((target("sse4.2,pclmul"))) #endif static int crc32_sse42_test(void) + { + __m128i x1 = _mm_set1_epi32(1); unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + x1 = _mm_clmulepi64_si128(x1, x1, 0x00); + crc = crc + _mm_extract_epi32(x1, 1); /* return computed value, to prevent the above being optimized away */ return crc == 0; } diff --git a/meson.build b/meson.build index 1ceadb9a83..456c3fafc3 100644 --- a/meson.build +++ b/meson.build @@ -2227,15 +2227,18 @@ if host_cpu == 'x86' or host_cpu == 'x86_64' prog = ''' #include - +#include #if defined(__has_attribute) && __has_attribute (target) -__attribute__((target("sse4.2"))) +__attribute__((target("sse4.2,pclmul"))) #endif int main(void) { + __m128i x1 = _mm_set1_epi32(1); unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul + crc = crc + _mm_extract_epi32(x1, 1); /* return computed value, to prevent the above being optimized away */ return crc == 0; } diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df3..69f8917c7d 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -15,13 +15,13 @@ #include "c.h" #include - +#include #include "port/pg_crc32c.h" pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") -pg_crc32c -pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) +static pg_crc32c +pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; @@ -68,3 +68,124 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) return crc; } + +/* + * Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ + * Instruction" V. Gopal, E. Ozturk, et al., 2009 + * + * The algorithm is based on crc32_sse42_simd from chromimum's copy of zlib. + * See: + * https://chromium.googlesource.com/chromium/src/+/refs/heads/main/third_party/zlib/crc32_simd.c + */ + +pg_attribute_no_sanitize_alignment() +pg_attribute_target("sse4.2,pclmul") +pg_crc32c +pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) +{ + ssize_t len = (ssize_t) length; + const unsigned char *buf = data; + /* + * Definitions of the bit-reflected domain constants k1,k2,k3, etc and + * the CRC32+Barrett polynomials given at the end of the paper. + */ + static const uint64_t pg_attribute_aligned(16) k1k2[] = { 0x740eef02, 0x9e4addf8 }; + static const uint64_t pg_attribute_aligned(16) k3k4[] = { 0xf20c0dfe, 0x14cd00bd6 }; + static const uint64_t pg_attribute_aligned(16) k5k0[] = { 0xdd45aab8, 0x000000000 }; + static const uint64_t pg_attribute_aligned(16) poly[] = { 0x105ec76f1, 0xdea713f1 }; + if (len >= 64) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + /* + * There's at least one block of 64. + */ + x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); + x0 = _mm_load_si128((__m128i *)k1k2); + buf += 64; + len -= 64; + /* + * Parallel fold blocks of 64, if any. + */ + while (len >= 64) + { + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x6 = _mm_clmulepi64_si128(x2, x0, 0x00); + x7 = _mm_clmulepi64_si128(x3, x0, 0x00); + x8 = _mm_clmulepi64_si128(x4, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x2 = _mm_clmulepi64_si128(x2, x0, 0x11); + x3 = _mm_clmulepi64_si128(x3, x0, 0x11); + x4 = _mm_clmulepi64_si128(x4, x0, 0x11); + y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_xor_si128(x2, x6); + x3 = _mm_xor_si128(x3, x7); + x4 = _mm_xor_si128(x4, x8); + x1 = _mm_xor_si128(x1, y5); + x2 = _mm_xor_si128(x2, y6); + x3 = _mm_xor_si128(x3, y7); + x4 = _mm_xor_si128(x4, y8); + buf += 64; + len -= 64; + } + /* + * Fold into 128-bits. + */ + x0 = _mm_load_si128((__m128i *)k3k4); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x3); + x1 = _mm_xor_si128(x1, x5); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x4); + x1 = _mm_xor_si128(x1, x5); + /* + * Single fold blocks of 16, if any. + */ + while (len >= 16) + { + x2 = _mm_loadu_si128((__m128i *)buf); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + buf += 16; + len -= 16; + } + /* + * Fold 128-bits to 64-bits. + */ + x2 = _mm_clmulepi64_si128(x1, x0, 0x10); + x3 = _mm_setr_epi32(~0, 0, ~0, 0); + x1 = _mm_srli_si128(x1, 8); + x1 = _mm_xor_si128(x1, x2); + x0 = _mm_loadl_epi64((__m128i*)k5k0); + x2 = _mm_srli_si128(x1, 4); + x1 = _mm_and_si128(x1, x3); + x1 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + /* + * Barret reduce to 32-bits. + */ + x0 = _mm_load_si128((__m128i*)poly); + x2 = _mm_and_si128(x1, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x10); + x2 = _mm_and_si128(x2, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + crc = _mm_extract_epi32(x1, 1); + } + + return pg_comp_crc32c_sse42_tail(crc, buf, len); +} diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 65dbc4d424..57e4ad5dfd 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -31,7 +31,7 @@ #include "port/pg_crc32c.h" static bool -pg_crc32c_sse42_available(void) +pg_crc32c_sse42_pclmul_available(void) { unsigned int exx[4] = {0, 0, 0, 0}; @@ -43,7 +43,9 @@ pg_crc32c_sse42_available(void) #error cpuid instruction not available #endif - return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ + bool sse42 = (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ + bool pclmul = (exx[2] & (1 << 1)) != 0; /* PCLMULQDQ */ + return sse42 && pclmul; } /* @@ -53,7 +55,7 @@ pg_crc32c_sse42_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { - if (pg_crc32c_sse42_available()) + if (pg_crc32c_sse42_pclmul_available()) pg_comp_crc32c = pg_comp_crc32c_sse42; else pg_comp_crc32c = pg_comp_crc32c_sb8; -- 2.43.0