From f8d60abb5851ea713561da1ccababc8f94206ee3 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 9 Feb 2025 12:25:56 +0700 Subject: [PATCH v3 4/4] Shorter version from corsix --- src/port/pg_crc32c_sse42.c | 165 ++++++++++++++----------------------- 1 file changed, 62 insertions(+), 103 deletions(-) diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 69f8917c7d..dec685d139 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -78,114 +78,73 @@ pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len) * https://chromium.googlesource.com/chromium/src/+/refs/heads/main/third_party/zlib/crc32_simd.c */ +#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) /* imm8 0: carry-less multiply of the low 64-bit halves of a and b */ +#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) /* imm8 17 == 0x11: carry-less multiply of the high 64-bit halves of a and b */ + pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2,pclmul") pg_crc32c -pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) +pg_comp_crc32c_sse42(pg_crc32c crc0, const void *data, size_t length) { - ssize_t len = (ssize_t) length; + size_t len = length; const unsigned char *buf = data; - /* - * Definitions of the bit-reflected domain constants k1,k2,k3, etc and - * the CRC32+Barrett polynomials given at the end of the paper. - */ - static const uint64_t pg_attribute_aligned(16) k1k2[] = { 0x740eef02, 0x9e4addf8 }; - static const uint64_t pg_attribute_aligned(16) k3k4[] = { 0xf20c0dfe, 0x14cd00bd6 }; - static const uint64_t pg_attribute_aligned(16) k5k0[] = { 0xdd45aab8, 0x000000000 }; - static const uint64_t pg_attribute_aligned(16) poly[] = { 0x105ec76f1, 0xdea713f1 }; - if (len >= 64) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; - /* - * There's at least one block of 64. 
- */ - x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); - x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); - x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); - x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); - x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); - x0 = _mm_load_si128((__m128i *)k1k2); - buf += 64; - len -= 64; - /* - * Parallel fold blocks of 64, if any. - */ - while (len >= 64) - { - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x6 = _mm_clmulepi64_si128(x2, x0, 0x00); - x7 = _mm_clmulepi64_si128(x3, x0, 0x00); - x8 = _mm_clmulepi64_si128(x4, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x2 = _mm_clmulepi64_si128(x2, x0, 0x11); - x3 = _mm_clmulepi64_si128(x3, x0, 0x11); - x4 = _mm_clmulepi64_si128(x4, x0, 0x11); - y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); - y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); - y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); - y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_xor_si128(x2, x6); - x3 = _mm_xor_si128(x3, x7); - x4 = _mm_xor_si128(x4, x8); - x1 = _mm_xor_si128(x1, y5); - x2 = _mm_xor_si128(x2, y6); - x3 = _mm_xor_si128(x3, y7); - x4 = _mm_xor_si128(x4, y8); - buf += 64; - len -= 64; - } - /* - * Fold into 128-bits. - */ - x0 = _mm_load_si128((__m128i *)k3k4); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x2); - x1 = _mm_xor_si128(x1, x5); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x3); - x1 = _mm_xor_si128(x1, x5); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x4); - x1 = _mm_xor_si128(x1, x5); - /* - * Single fold blocks of 16, if any. 
- */ - while (len >= 16) - { - x2 = _mm_loadu_si128((__m128i *)buf); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x2); - x1 = _mm_xor_si128(x1, x5); - buf += 16; - len -= 16; - } - /* - * Fold 128-bits to 64-bits. - */ - x2 = _mm_clmulepi64_si128(x1, x0, 0x10); - x3 = _mm_setr_epi32(~0, 0, ~0, 0); - x1 = _mm_srli_si128(x1, 8); - x1 = _mm_xor_si128(x1, x2); - x0 = _mm_loadl_epi64((__m128i*)k5k0); - x2 = _mm_srli_si128(x1, 4); - x1 = _mm_and_si128(x1, x3); - x1 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_xor_si128(x1, x2); - /* - * Barret reduce to 32-bits. - */ - x0 = _mm_load_si128((__m128i*)poly); - x2 = _mm_and_si128(x1, x3); - x2 = _mm_clmulepi64_si128(x2, x0, 0x10); - x2 = _mm_and_si128(x2, x3); - x2 = _mm_clmulepi64_si128(x2, x0, 0x00); - x1 = _mm_xor_si128(x1, x2); - crc = _mm_extract_epi32(x1, 1); + /* Overview: update crc0 with the length bytes at data. Whole 64-byte blocks, then whole 16-byte blocks, are folded with carry-less multiplies and reduced back into crc0; the remaining (fewer than 16) bytes are handed to pg_comp_crc32c_sse42_tail(). */ + if (len >= 64) { + /* First vector chunk: load 64 input bytes into the four 128-bit accumulators x0..x3; crc0 is XORed into the low 32 bits of x0 below. */ + __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0; + __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1; + __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2; + __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3; + __m128i k; + k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + buf += 64; + len -= 64; + /* Main loop: for each accumulator, multiply its low and high 64-bit halves by the matching constant in k, then XOR both products with the next 16 input bytes; consumes 64 bytes per iteration. */ + while (len >= 64) { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3); + buf += 64; + len -= 64; + } + /* Reduce x0 ... 
x3 to just x0: x0 absorbs x1 and x2 absorbs x3 using the first constant pair, then x0 absorbs x2 using the second constant pair. */ + k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); + k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); + /* Reduce 128 bits to 32 bits, and multiply by x^32: the two 64-bit halves of x0 are fed through the CRC32 instruction starting from 0. NOTE(review): _mm_crc32_u64 and _mm_extract_epi64 are documented as 64-bit-only intrinsics; confirm this function is only compiled for x86-64 targets. */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + } + if (len >= 16) { + /* First vector chunk of the 16-byte path: load one 16-byte block and XOR crc0 (already updated by the 64-byte path if that ran) into its low 32 bits. */ + __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0; + __m128i k; + k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + buf += 16; + len -= 16; + /* Main loop: same multiply-and-XOR step as the 64-byte path, with a single accumulator and 16 input bytes per iteration. */ + while (len >= 16) { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0); + buf += 16; + len -= 16; } + /* Reduce 128 bits to 32 bits, and multiply by x^32 (same two-step CRC32-instruction reduction as in the 64-byte path). */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + } - return pg_comp_crc32c_sse42_tail(crc, buf, len); + return pg_comp_crc32c_sse42_tail(crc0, buf, len); } -- 2.43.0