From 0b8f9851f38444dbe29009120d98cd38e93efe7f Mon Sep 17 00:00:00 2001 From: Paul Amonson Date: Tue, 21 May 2024 13:23:39 -0700 Subject: [PATCH] [Feat] Add-AVX512 crc32c algorithm to postgres Signed-off-by: Paul Amonson --- config/c-compiler.m4 | 48 +++++++ configure | 223 +++++++++++++++++++++++------ configure.ac | 106 +++++++++----- meson.build | 41 +++++- src/include/pg_config.h.in | 3 + src/include/port/pg_crc32c.h | 24 +++- src/port/Makefile | 10 ++ src/port/meson.build | 4 + src/port/pg_crc32c_avx512.c | 222 ++++++++++++++++++++++++++++ src/port/pg_crc32c_avx512_choose.c | 202 ++++++++++++++++++++++++++ 10 files changed, 797 insertions(+), 86 deletions(-) create mode 100644 src/port/pg_crc32c_avx512.c create mode 100644 src/port/pg_crc32c_avx512_choose.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 10f8c7bd0a..1d33932cb5 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -628,6 +628,54 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX512_CRC32_INTRINSICS +# --------------------------- +# Check if the compiler supports the AVX-512 and carry-less multiplication +# intrinsics used for CRC-32C, such as _mm512_clmulepi64_epi128. + +# (The test program also calls the 8-byte SSE 4.2 variant, _mm_crc32_u64, +# so that intrinsic must be usable with the same flags as well.) +# +# Optional compiler flags can be passed as the argument (e.g. -msse4.2 +# -mavx512vl -mvpclmulqdq). If the intrinsics are supported, sets +# pgac_avx512_crc32_intrinsics, and CFLAGS_CRC. +AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... 
with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [const unsigned long k1k2[[8]] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + unsigned char buffer[[512]]; + unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L); + unsigned long val; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a1, a2; + unsigned int crc = 0xffffffff; + y8 = _mm512_load_si512((__m512i *)aligned); + x0 = _mm512_loadu_si512((__m512i *)k1k2); + x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00)); + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + a1 = _mm512_extracti32x4_epi32(x1, 3); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + return crc != 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_CRC="$1" + pgac_avx512_crc32_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_CRC32_INTRINSICS + # PGAC_ARMV8_CRC32C_INTRINSICS # ---------------------------- diff --git a/configure b/configure index 7b03db56a6..45cd755867 100755 --- a/configure +++ b/configure @@ -14898,7 +14898,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 
1 : -1]; @@ -14944,7 +14944,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -14968,7 +14968,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15013,7 +15013,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15037,7 +15037,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -17774,6 +17774,123 @@ fi fi +# Check for Intel AVX-512 intrinsics to do CRC calculations. 
+# +# First check if the _mm512_clmulepi64_epi128 and more intrinsics can +# be used with the default compiler flags. If not, check if adding +# the -msse4.2, -mavx512vl and -mvpclmulqdq flags helps. CFLAGS_CRC +# is set to -msse4.2, -mavx512vl and -mvpclmulqdq if that's required. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=" >&5 +$as_echo_n "checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=... " >&6; } +if ${pgac_cv_avx512_crc32_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +const unsigned long k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + unsigned char buffer[512]; + unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L); + unsigned long val; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a1, a2; + unsigned int crc = 0xffffffff; + y8 = _mm512_load_si512((__m512i *)aligned); + x0 = _mm512_loadu_si512((__m512i *)k1k2); + x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00)); + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + a1 = _mm512_extracti32x4_epi32(x1, 3); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + return crc != 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_crc32_intrinsics_=yes +else + pgac_cv_avx512_crc32_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext 
+CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics_" >&5 +$as_echo "$pgac_cv_avx512_crc32_intrinsics_" >&6; } +if test x"$pgac_cv_avx512_crc32_intrinsics_" = x"yes"; then + CFLAGS_CRC="" + pgac_avx512_crc32_intrinsics=yes +fi + +if test x"$pgac_avx512_crc32_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=-msse4.2 -mavx512vl -mvpclmulqdq" >&5 +$as_echo_n "checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=-msse4.2 -mavx512vl -mvpclmulqdq... " >&6; } +if ${pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -msse4.2 -mavx512vl -mvpclmulqdq" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +const unsigned long k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + unsigned char buffer[512]; + unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L); + unsigned long val; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a1, a2; + unsigned int crc = 0xffffffff; + y8 = _mm512_load_si512((__m512i *)aligned); + x0 = _mm512_loadu_si512((__m512i *)k1k2); + x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00)); + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + a1 = _mm512_extracti32x4_epi32(x1, 3); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + return crc != 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then 
: + pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq=yes +else + pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" >&5 +$as_echo "$pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" >&6; } +if test x"$pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" = x"yes"; then + CFLAGS_CRC="-msse4.2 -mavx512vl -mvpclmulqdq" + pgac_avx512_crc32_intrinsics=yes +fi + +fi + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -17946,31 +18063,42 @@ fi # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then + # Use Intel AVX 512 if available. + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && test x"$AVX512_TARGETED" = x"1" ; then + USE_AVX512_CRC32C=1 else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. 
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 + # Use Intel SSE 4.2 if available. + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + USE_SSE42_CRC32C=1 else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 + # Intel AVX 512, with runtime check? The CPUID instruction is needed for + # the runtime check. + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + USE_AVX512_CRC32C_WITH_RUNTIME_CHECK=1 else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 + # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for + # the runtime check. + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 + # Use ARM CRC Extension if available. + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then + USE_ARMV8_CRC32C=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 + # ARM CRC Extension, with runtime check? + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then + USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 + else + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. 
+ USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -17989,44 +18117,53 @@ $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + if test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + +$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_avx512.o pg_crc32c_sb8.o pg_crc32c_avx512_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX 512 with runtime check" >&5 +$as_echo "AVX 512 with runtime check" >&6; } + else + if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 $as_echo "SSE 4.2 with runtime check" >&6; } - else - if test x"$USE_ARMV8_CRC32C" = x"1"; then + else + if test x"$USE_ARMV8_CRC32C" = x"1"; then $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 $as_echo "ARMv8 CRC instructions" >&6; } - else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + else + if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o 
pg_crc32c_armv8_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } - else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + else + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 $as_echo "LoongArch CRCC instructions" >&6; } - else + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } + fi fi fi fi diff --git a/configure.ac b/configure.ac index 63e7be3847..73ea4d95dd 100644 --- a/configure.ac +++ b/configure.ac @@ -2124,6 +2124,17 @@ if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then PGAC_SSE42_CRC32_INTRINSICS([-msse4.2]) fi +# Check for Intel AVX-512 intrinsics to do CRC calculations. +# +# First check if the _mm512_clmulepi64_epi128 and more intrinsics can +# be used with the default compiler flags. If not, check if adding +# the -msse4.2, -mavx512vl and -mvpclmulqdq flags helps. CFLAGS_CRC +# is set to -msse4.2, -mavx512vl and -mvpclmulqdq if that's required. +PGAC_AVX512_CRC32_INTRINSICS([]) +if test x"$pgac_avx512_crc32_intrinsics" != x"yes"; then + PGAC_AVX512_CRC32_INTRINSICS([-msse4.2 -mavx512vl -mvpclmulqdq]) +fi + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. 
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [ @@ -2169,31 +2180,42 @@ AC_SUBST(CFLAGS_CRC) # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then + # Use Intel AVX 512 if available. + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && test x"$AVX512_TARGETED" = x"1" ; then + USE_AVX512_CRC32C=1 else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 + # Use Intel SSE 4.2 if available. + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + USE_SSE42_CRC32C=1 else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 + # Intel AVX 512, with runtime check? The CPUID instruction is needed for + # the runtime check. 
+ if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + USE_AVX512_CRC32C_WITH_RUNTIME_CHECK=1 else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 + # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for + # the runtime check. + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 + # Use ARM CRC Extension if available. + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then + USE_ARMV8_CRC32C=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 + # ARM CRC Extension, with runtime check? + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then + USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 + else + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. 
+ USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -2208,29 +2230,35 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then PG_CRC32C_OBJS="pg_crc32c_sse42.o" AC_MSG_RESULT(SSE 4.2) else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - AC_MSG_RESULT(SSE 4.2 with runtime check) + if test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel AVX 512 CRC instructions with a runtime check.]) + PG_CRC32C_OBJS="pg_crc32c_avx512.o pg_crc32c_sb8.o pg_crc32c_avx512_choose.o" + AC_MSG_RESULT(AVX 512 with runtime check) else - if test x"$USE_ARMV8_CRC32C" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - AC_MSG_RESULT(ARMv8 CRC instructions) + if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + AC_MSG_RESULT(SSE 4.2 with runtime check) else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) + if test x"$USE_ARMV8_CRC32C" = x"1"; then + AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + AC_MSG_RESULT(ARMv8 CRC instructions) else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then - AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) - 
PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - AC_MSG_RESULT(LoongArch CRCC instructions) + if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) + else + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) + fi fi fi fi diff --git a/meson.build b/meson.build index f9279c837d..a2b087d561 100644 --- a/meson.build +++ b/meson.build @@ -2144,6 +2144,34 @@ if host_cpu == 'x86' or host_cpu == 'x86_64' cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true else + avx_prog = ''' +#include + +int main(void) +{ + const unsigned long k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + unsigned char buffer[512]; + unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L); + unsigned long val; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a1, a2; + unsigned int crc = 0xffffffff; + y8 = _mm512_load_si512((__m512i *)aligned); + x0 = _mm512_loadu_si512((__m512i *)k1k2); + x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00)); + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_ternarylogic_epi64(x1, 
x5, y5, 0x96); + a1 = _mm512_extracti32x4_epi32(x1, 3); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + return crc != 0; +} +''' prog = ''' #include @@ -2157,13 +2185,20 @@ int main(void) return crc == 0; } ''' - - if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2', + if cc.links(avx_prog, + name: '_mm512_clmulepi64_epi128 ... with -msse4.2 -mavx512vl -mvpclmulqdq', + args: test_c_args + ['-msse4.2', '-mavx512vl', '-mvpclmulqdq']) + cflags_crc += ['-msse4.2','-mavx512vl','-mvpclmulqdq'] + cdata.set('USE_AVX512_CRC32C', false) + cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1) + have_optimized_crc = true + endif + if have_optimized_crc == false and cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2', args: test_c_args) # Use Intel SSE 4.2 unconditionally. cdata.set('USE_SSE42_CRC32C', 1) have_optimized_crc = true - elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2', + elif have_optimized_crc == false and cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2', args: test_c_args + ['-msse4.2']) # Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for # the runtime check. diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index f8d3e3b6b8..6e08f1c6c7 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -738,6 +738,9 @@ /* Define to 1 use Intel SSE 4.2 CRC instructions. */ #undef USE_SSE42_CRC32C +/* Define to 1 to use Intel AVX 512 CRC instructions with a runtime check. */ +#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. 
*/ #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 63c8e3a00b..b632ac7d59 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -49,6 +49,14 @@ typedef uint32 pg_crc32c; extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#elif defined(USE_AVX512_CRC32C) +/* Use Intel AVX512 instructions. */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c_avx512((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len); + #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. */ @@ -67,6 +75,21 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); +#elif defined(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK) + +/* + * Use Intel AVX-512 instructions, but perform a runtime check first to check that + * they are available. + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c)(pg_crc32c crc, const void *data, size_t len); + +extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len); + #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* @@ -86,7 +109,6 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le #ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); #endif - #else /* * Use slicing-by-8 algorithm. 
diff --git a/src/port/Makefile b/src/port/Makefile index db7c02117b..7ae632c6fc 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -88,11 +88,21 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) +# all versions of pg_crc32c_avx512.o need CFLAGS_CRC +pg_crc32c_avx512.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_avx512_shlib.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_avx512_srv.o: CFLAGS+=$(CFLAGS_CRC) + # all versions of pg_crc32c_armv8.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +# all versions of pg_crc32c_avx512_choose.o need CFLAGS_XSAVE +pg_crc32c_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_crc32c_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_crc32c_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE) + # all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE) pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE) diff --git a/src/port/meson.build b/src/port/meson.build index fd9ee199d1..d635913e9b 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -83,6 +83,10 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_avx512', 'USE_AVX512_CRC32C'], + ['pg_crc32c_avx512', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_avx512_choose', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 'xsave'], + ['pg_crc32c_sb8', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'], ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'], diff --git a/src/port/pg_crc32c_avx512.c b/src/port/pg_crc32c_avx512.c new file mode 100644 
index 0000000000..085c8d99a8 --- /dev/null +++ b/src/port/pg_crc32c_avx512.c @@ -0,0 +1,222 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_avx512.c + * Compute CRC-32C checksum using Intel AVX-512 instructions. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2024, Intel(r) Corporation + * + * IDENTIFICATION + * src/port/pg_crc32c_avx512.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#include + +#include "port/pg_crc32c.h" + +/* + * crc32c_fallback: fold 'length' bytes starting at 'p' into 'crc' using the + * _mm_crc32_u64/_u32/_u8 intrinsics, and return the updated value. + * + * 'p' is not aligned first; the 8- and 4-byte steps read through casted + * pointers, hence the pg_attribute_no_sanitize_alignment() marker below. + */ +pg_attribute_no_sanitize_alignment() +inline +static +pg_crc32c +crc32c_fallback(pg_crc32c crc, const uint8 *p, size_t length) +{ + const unsigned char *pend = p + length; + + /* + * Process eight bytes of data at a time. + * + * NB: We do unaligned accesses here. The Intel architecture allows that, + * and performance testing didn't show any performance gain from aligning + * the begin address. + */ + while (p + 8 <= pend) + { + crc = (uint32)_mm_crc32_u64(crc, *((const uint64 *)p)); + p += 8; + } + + /* Process remaining full four bytes if any */ + if (p + 4 <= pend) + { + crc = _mm_crc32_u32(crc, *((const unsigned int *)p)); + p += 4; + } + + /* Process any remaining bytes one at a time. */ + while (p < pend) + { + crc = _mm_crc32_u8(crc, *p); + p++; + } + + return crc; +} + +/******************************************************************* + * pg_crc32c_avx512(): compute the crc32c of the buffer, where the + * buffer length must be at least 256, and a multiple of 64. Based + * on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ + * Instruction" + * V. 
Gopal, E. Ozturk, et al., 2009, + * https://www.researchgate.net/publication/263424619_Fast_CRC_computation#full-text + * + * This Function: + * Copyright 2017 The Chromium Authors + * Copyright (c) 2024, Intel(r) Corporation + * + * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium source repository LICENSE file. + * https://chromium.googlesource.com/chromium/src/+/refs/heads/main/LICENSE + */ +pg_attribute_no_sanitize_alignment() +inline +pg_crc32c +pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length) +{ + static const uint64 k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, + 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + static const uint64 k3k4[8] = { + 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, + 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static const uint64 k9k10[8] = { + 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, + 0x0d3b6092, 0x6992cea2, 0x0d3b6092}; + static const uint64 k1k4[8] = { + 0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe, + 0x493c7d27, 0x00000000, 0x00000000}; + + const uint8 *input = (const uint8 *)data; + if (length >= 256) + { + uint64 val; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a1, a2; + + /* + * AVX-512 Optimized crc32c algorithm with mimimum of 256 bytes aligned + * to 32 bytes. + * >>> BEGIN + */ + /* + * There's at least one block of 256. + */ + x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + + x0 = _mm512_load_si512((__m512i *)k1k2); + + input += 256; + length -= 256; + + /* + * Parallel fold blocks of 256, if any. 
+ */ + while (length >= 256) + { + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); + + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); + + y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96); + x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96); + x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96); + + input += 256; + length -= 256; + } + + /* + * Fold 256 bytes into 64 bytes. + */ + x0 = _mm512_load_si512((__m512i *)k9k10); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96); + + x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96); + + x0 = _mm512_load_si512((__m512i *)k3k4); + y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96); + + /* + * Single fold blocks of 64, if any. + */ + while (length >= 64) + { + x2 = _mm512_loadu_si512((__m512i *)input); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96); + + input += 64; + length -= 64; + } + + /* + * Fold 512-bits to 128-bits. 
+ */ + x0 = _mm512_loadu_si512((__m512i *)k1k4); + + a2 = _mm512_extracti32x4_epi32(x1, 3); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96); + + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + x0 = _mm512_xor_epi64(x1, x0); + a1 = _mm512_extracti32x4_epi32(x0, 1); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); + + /* + * Fold 128-bits to 32-bits. + */ + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + crc = (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + /* + * AVX-512 Optimized crc32c algorithm with mimimum of 256 bytes aligned + * to 32 bytes. + * <<< END + ******************************************************************/ + } + + /* + * Finish any remaining bytes. + */ + return crc32c_fallback(crc, input, length); +} diff --git a/src/port/pg_crc32c_avx512_choose.c b/src/port/pg_crc32c_avx512_choose.c new file mode 100644 index 0000000000..d5ccb69d10 --- /dev/null +++ b/src/port/pg_crc32c_avx512_choose.c @@ -0,0 +1,202 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_avx512_choose.c + * Choose between Intel AVX-512 and software CRC-32C implementation. + * + * On first call, checks if the CPU we're running on supports Intel AVX- + * 512. If it does, use the special AVX-512 instructions for CRC-32C + * computation. Otherwise, fall back to the pure software implementation + * (slicing-by-8). + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2024, Intel(r) Corp. 
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE_XSAVE_INTRINSICS
+#include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+#include "port/pg_crc32c.h"
+
+/* Type of one CPUID output register (EAX, EBX, ECX or EDX). */
+typedef unsigned int exx_t;
+
+/*
+ * Helper function.
+ * Test whether bit number "bit" (0..31) is set in a CPUID register value.
+ *
+ * The mask is built from an unsigned one because callers test bit 31, and
+ * shifting a signed 1 that far is undefined behavior.
+ */
+static inline bool
+is_bit_set(exx_t reg, int bit)
+{
+	return (reg & ((exx_t) 1 << bit)) != 0;
+}
+
+/*
+ * Run CPUID for "leaf" and store EAX, EBX, ECX and EDX in exx[0..3].
+ * Intel Platform CPUID check for Linux and Visual Studio platforms.
+ *
+ * The result of __get_cpuid() is not checked; callers pre-zero exx.
+ */
+static inline void
+pg_getcpuid(unsigned int leaf, exx_t *exx)
+{
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(leaf, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	/* __cpuid() wants an int array; query the requested leaf, not leaf 1 */
+	__cpuid((int *) exx, leaf);
+#else
+#error cpuid instruction not available
+#endif
+}
+
+/*
+ * Run CPUID for "leaf"/"subleaf" and store EAX, EBX, ECX and EDX in
+ * exx[0..3].  Intel Platform CPUIDEX check for Linux and Visual Studio
+ * platforms.
+ */
+static inline void
+pg_getcpuidex(unsigned int leaf, unsigned int subleaf, exx_t *exx)
+{
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(leaf, subleaf, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	/* query the requested leaf and subleaf rather than hard-coded 7/0 */
+	__cpuidex((int *) exx, leaf, subleaf);
+#else
+#error cpuid instruction not available
+#endif
+}
+
+/*
+ * Check for CPU support for CPUID: sse4.2 (leaf 1, ECX bit 20)
+ */
+static inline bool
+sse42_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuid(1, exx);
+	return is_bit_set(exx[2], 20); /* sse4.2 */
+}
+
+/*
+ * Check for CPU support for CPUID: osxsave (leaf 1, ECX bit 27)
+ */
+static inline bool
+osxsave_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuid(1, exx);
+	return is_bit_set(exx[2], 27); /* osxsave */
+}
+
+/*
+ * Check for CPU support for CPUIDEX: avx512-f (leaf 7, EBX bit 16)
+ */
+static inline bool
+avx512f_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set(exx[1], 16); /* avx512-f */
+}
+
+/*
+ * Check for CPU support for CPUIDEX: vpclmulqdq (leaf 7, ECX bit 10)
+ *
+ * Unlike the AVX-512 F/VL flags this one is reported in ECX, i.e. exx[2];
+ * bit 10 of EBX is a different feature (INVPCID).
+ */
+static inline bool
+vpclmulqdq_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set(exx[2], 10); /* vpclmulqdq */
+}
+
+/*
+ * Check for CPU support for CPUIDEX: avx512-vl (leaf 7, EBX bit 31)
+ */
+static inline bool
+avx512vl_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set(exx[1], 31); /* avx512-vl */
+}
+
+/*
+ * Does XGETBV say the ZMM registers are enabled?  Mask 0xe6 covers XCR0
+ * bits 1, 2, 5, 6 and 7 (SSE, AVX, opmask, ZMM_Hi256 and Hi16_ZMM state).
+ *
+ * NB: Caller is responsible for verifying that osxsave_available() returns
+ * true before calling this.
+ */
+static inline bool
+zmm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+	return (_xgetbv(0) & 0xe6) == 0xe6;
+#else
+	return false;
+#endif
+}
+
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_crc32c implementation.
+ */
+static inline bool
+pg_crc32c_avx512_available(void)
+{
+	return sse42_available() && osxsave_available() &&
+		avx512f_available() && vpclmulqdq_available() &&
+		avx512vl_available() && zmm_regs_available();
+}
+
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ *
+ * NOTE(review): without AVX-512 this selects slicing-by-8 even on CPUs that
+ * have the SSE 4.2 CRC instructions -- confirm that is intended.
+ */
+static pg_crc32c
+pg_comp_avx512_choose(pg_crc32c crc, const void *data, size_t len)
+{
+	if (pg_crc32c_avx512_available())
+		pg_comp_crc32c = pg_comp_crc32c_avx512;
+	else
+		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+/* Starts at the chooser; the first call rebinds it to an implementation. */
+pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_avx512_choose;
-- 
2.34.1