From 1bcf659ae6ea6b864dfc791e516dff0a313837f4 Mon Sep 17 00:00:00 2001 From: "xiang.gao" Date: Wed, 13 Sep 2023 15:13:37 +0800 Subject: [PATCH] PostgreSQL: CRC32C optimization Crc32c Parallel computation optimization Algorithm comes from Intel whitepaper: crc-iscsi-polynomial-crc32-instruction-paper Input data is divided into three equal-sized blocks. Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes. One Block: 42(BLK_LEN) * 8 bytes Crc32c unit test: https://gist.github.com/gaoxyt/138fd53ca1eead8102eeb9204067f7e4 Crc32c benchmark: https://gist.github.com/gaoxyt/4506c10fc06b3501445e32c4257113e9 It gets ~2x speedup compared to linear Arm crc32c instructions. Signed-off-by: xiang.gao Change-Id: If876bbca5bbc3940946a7d72e14fe9fdf54682c1 --- config/c-compiler.m4 | 26 ++++- configure | 155 +++++++++++++++--------------- configure.ac | 77 ++++++++------- meson.build | 35 +++++-- src/include/pg_config.h.in | 8 +- src/include/port/pg_crc32c.h | 17 ++-- src/port/Makefile | 5 - src/port/meson.build | 5 +- src/port/pg_crc32c_armv8.c | 57 ++++++++++- src/port/pg_crc32c_armv8_choose.c | 50 +++++++++- 10 files changed, 288 insertions(+), 147 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5db02b2ab7..c3731cabd6 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -656,12 +656,36 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [Ac_cachevar=no]) CFLAGS="$pgac_save_CFLAGS"]) if test x"$Ac_cachevar" = x"yes"; then - CFLAGS_CRC="$1" pgac_armv8_crc32c_intrinsics=yes fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS +# PGAC_ARMV8_VMULL_INTRINSICS +# ---------------------------- +# Check if the compiler supports the vmull_p64 +# intrinsic functions. These instructions +# were first introduced in ARMv8 crypto Extension. +# +# An optional compiler flag can be passed as argument (e.g. +# -march=armv8-a+crypto). If the intrinsics are supported, sets +# pgac_armv8_vmull_intrinsics. 
+AC_DEFUN([PGAC_ARMV8_VMULL_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_armv8_vmull_intrinsics_$1])])dnl +AC_CACHE_CHECK([for vmull_p64 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_armv8_vmull_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_ARMV8_VMULL_INTRINSICS + # PGAC_LOONGARCH_CRC32C_INTRINSICS # --------------------------- # Check if the compiler supports the LoongArch CRCC instructions, using diff --git a/configure b/configure index cfd968235f..42f0c160ad 100755 --- a/configure +++ b/configure @@ -17909,51 +17909,9 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext # Check for ARMv8 CRC Extension intrinsics to do CRC calculations. # -# First check if __crc32c* intrinsics can be used with the default compiler -# flags. If not, check if adding -march=armv8-a+crc flag helps. -# CFLAGS_CRC is set if the extra flag is required. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __crc32cb, __crc32ch, __crc32cw, and __crc32cd with CFLAGS=" >&5 -$as_echo_n "checking for __crc32cb, __crc32ch, __crc32cw, and __crc32cd with CFLAGS=... " >&6; } -if ${pgac_cv_armv8_crc32c_intrinsics_+:} false; then : - $as_echo_n "(cached) " >&6 -else - pgac_save_CFLAGS=$CFLAGS -CFLAGS="$pgac_save_CFLAGS " -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ -#include -int -main () -{ -unsigned int crc = 0; - crc = __crc32cb(crc, 0); - crc = __crc32ch(crc, 0); - crc = __crc32cw(crc, 0); - crc = __crc32cd(crc, 0); - /* return computed value, to prevent the above being optimized away */ - return crc == 0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_armv8_crc32c_intrinsics_=yes -else - pgac_cv_armv8_crc32c_intrinsics_=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -CFLAGS="$pgac_save_CFLAGS" -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_armv8_crc32c_intrinsics_" >&5 -$as_echo "$pgac_cv_armv8_crc32c_intrinsics_" >&6; } -if test x"$pgac_cv_armv8_crc32c_intrinsics_" = x"yes"; then - CFLAGS_CRC="" - pgac_armv8_crc32c_intrinsics=yes -fi - -if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __crc32cb, __crc32ch, __crc32cw, and __crc32cd with CFLAGS=-march=armv8-a+crc" >&5 +# check if __crc32c* intrinsics can be used with the compiler +# flags -march=armv8-a+crc +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __crc32cb, __crc32ch, __crc32cw, and __crc32cd with CFLAGS=-march=armv8-a+crc" >&5 $as_echo_n "checking for __crc32cb, __crc32ch, __crc32cw, and __crc32cd with CFLAGS=-march=armv8-a+crc... " >&6; } if ${pgac_cv_armv8_crc32c_intrinsics__march_armv8_apcrc+:} false; then : $as_echo_n "(cached) " >&6 @@ -17989,11 +17947,9 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_armv8_crc32c_intrinsics__march_armv8_apcrc" >&5 $as_echo "$pgac_cv_armv8_crc32c_intrinsics__march_armv8_apcrc" >&6; } if test x"$pgac_cv_armv8_crc32c_intrinsics__march_armv8_apcrc" = x"yes"; then - CFLAGS_CRC="-march=armv8-a+crc" pgac_armv8_crc32c_intrinsics=yes fi -fi # Check for LoongArch CRC intrinsics to do CRC calculations. 
# @@ -18038,6 +17994,44 @@ fi +# Check for ARMv8 VMULL intrinsics to do polynomial multiplication +# +# Check if vmull_p64 intrinsics can be used with the compiler +# flag -march=armv8-a+crypto. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vmull_p64 with CFLAGS=-march=armv8-a+crypto" >&5 +$as_echo_n "checking for vmull_p64 with CFLAGS=-march=armv8-a+crypto... " >&6; } +if ${pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -march=armv8-a+crypto" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto=yes +else + pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" >&5 +$as_echo "$pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" >&6; } +if test x"$pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" = x"yes"; then + pgac_armv8_vmull_intrinsics=yes +fi + + # Select CRC-32C implementation. # # If we are targeting a processor that has Intel SSE 4.2 instructions, we can @@ -18057,7 +18051,7 @@ fi # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. 
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -18068,27 +18062,29 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 else # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C=1 else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + USE_LOONGARCH_CRC32C=1 else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 fi fi fi fi fi +# Use ARM VMULL if available and ARM CRC32C intrinsic is available too. +if test x"$USE_ARMV8_VMULL" = x"" && test x"$USE_ARMV8_CRC32C" = x"1"; then + if test x"$pgac_armv8_vmull_intrinsics" = x"yes"; then + USE_ARMV8_VMULL=1 + fi +fi + # Set PG_CRC32C_OBJS appropriately depending on the selected implementation. 
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking which CRC-32C implementation to use" >&5 $as_echo_n "checking which CRC-32C implementation to use... " >&6; } @@ -18112,39 +18108,42 @@ $as_echo "SSE 4.2 with runtime check" >&6; } $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 -$as_echo "ARMv8 CRC instructions" >&6; } + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with a runtime check" >&5 +$as_echo "ARMv8 CRC instructions with a runtime check" >&6; } else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - -$as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 -$as_echo "ARMv8 CRC instructions with runtime check" >&6; } - else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 $as_echo "LoongArch CRCC instructions" >&6; } - else + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } - fi fi fi fi fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use ARM VMULL intrinsic with a runtime check" >&5 +$as_echo_n "checking whether to use ARM 
VMULL intrinsic with a runtime check... " >&6; } +if test x"$USE_ARMV8_VMULL" = x"1"; then + +$as_echo "#define USE_ARMV8_VMULL 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/configure.ac b/configure.ac index f220b379b3..8e83f86554 100644 --- a/configure.ac +++ b/configure.ac @@ -2091,13 +2091,9 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [ # Check for ARMv8 CRC Extension intrinsics to do CRC calculations. # -# First check if __crc32c* intrinsics can be used with the default compiler -# flags. If not, check if adding -march=armv8-a+crc flag helps. -# CFLAGS_CRC is set if the extra flag is required. -PGAC_ARMV8_CRC32C_INTRINSICS([]) -if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then - PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc]) -fi +# check if __crc32c* intrinsics can be used with the compiler +# flags -march=armv8-a+crc +PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc]) # Check for LoongArch CRC intrinsics to do CRC calculations. # @@ -2107,6 +2103,12 @@ PGAC_LOONGARCH_CRC32C_INTRINSICS() AC_SUBST(CFLAGS_CRC) +# Check for ARMv8 VMULL intrinsics to do polynomial multiplication +# +# Check if vmull_p64 intrinsics can be used with the compiler +# flag -march=armv8-a+crypto. +PGAC_ARMV8_VMULL_INTRINSICS([-march=armv8-a+crypto]) + # Select CRC-32C implementation. # # If we are targeting a processor that has Intel SSE 4.2 instructions, we can @@ -2126,7 +2128,7 @@ AC_SUBST(CFLAGS_CRC) # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. 
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -2137,27 +2139,29 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 else # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C=1 else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + USE_LOONGARCH_CRC32C=1 else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 fi fi fi fi fi +# Use ARM VMULL if available and ARM CRC32C intrinsic is available too. +if test x"$USE_ARMV8_VMULL" = x"" && test x"$USE_ARMV8_CRC32C" = x"1"; then + if test x"$pgac_armv8_vmull_intrinsics" = x"yes"; then + USE_ARMV8_VMULL=1 + fi +fi + # Set PG_CRC32C_OBJS appropriately depending on the selected implementation. 
AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then @@ -2171,30 +2175,31 @@ else AC_MSG_RESULT(SSE 4.2 with runtime check) else if test x"$USE_ARMV8_CRC32C" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - AC_MSG_RESULT(ARMv8 CRC instructions) + AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + AC_MSG_RESULT(ARMv8 CRC instructions with a runtime check) else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then - AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - AC_MSG_RESULT(LoongArch CRCC instructions) - else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) - fi + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) fi fi fi fi AC_SUBST(PG_CRC32C_OBJS) +AC_MSG_CHECKING([whether to use ARM VMULL intrinsic with a runtime check]) +if test x"$USE_ARMV8_VMULL" = x"1"; then + AC_DEFINE(USE_ARMV8_VMULL, 1, [Define to 1 to use ARMv8 VMULL Extension with a runtime check.]) + 
AC_MSG_RESULT(yes) +else + AC_MSG_RESULT(no) +fi # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/meson.build b/meson.build index 2d516c8f37..fcc7e401a3 100644 --- a/meson.build +++ b/meson.build @@ -2054,17 +2054,10 @@ int main(void) } ''' - if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc', - args: test_c_args) - # Use ARM CRC Extension unconditionally - cdata.set('USE_ARMV8_CRC32C', 1) - have_optimized_crc = true - elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc', + if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc', args: test_c_args + ['-march=armv8-a+crc']) # Use ARM CRC Extension, with runtime check - cflags_crc += '-march=armv8-a+crc' - cdata.set('USE_ARMV8_CRC32C', false) - cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1) + cdata.set('USE_ARMV8_CRC32C', 1) have_optimized_crc = true endif @@ -2101,6 +2094,30 @@ endif +############################################################### +# Check for ARMv8 VMULL intrinsics to do polynomial multiplication +############################################################### + +if (host_cpu == 'arm' or host_cpu == 'aarch64') + + prog = ''' +#include + +int main(void) +{ + return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678); +} +''' + + if cc.links(prog, name: 'vmull_p64 with -march=armv8-a+crypto', + args: test_c_args + ['-march=armv8-a+crypto']) + # Use ARM VMULL Extension, with runtime check + cdata.set('USE_ARMV8_VMULL', 1) + endif +endif + + + ############################################################### # Other CPU specific stuff ############################################################### diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d8a2985567..6ae160551d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -683,11 +683,11 @@ /* Define to 1 if 
strerror_r() returns int. */ #undef STRERROR_R_INT -/* Define to 1 to use ARMv8 CRC Extension. */ -#undef USE_ARMV8_CRC32C - /* Define to 1 to use ARMv8 CRC Extension with a runtime check. */ -#undef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK +#undef USE_ARMV8_CRC32C + +/* Define to 1 to use ARMv8 VMULL Extension with a runtime check. */ +#undef USE_ARMV8_VMULL /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index d085f1dc00..c1fe4dc7bd 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -51,12 +51,18 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. */ - #define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c_armv8((crc), (data), (len))) + ((crc) = pg_comp_crc32c((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); + +#if defined(USE_ARMV8_VMULL) +#include +extern pg_crc32c pg_comp_crc32c_with_vmull_armv8(pg_crc32c crc, const void *data, size_t len); +#endif #elif defined(USE_LOONGARCH_CRC32C) /* Use LoongArch CRCC instructions. */ @@ -67,10 +73,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); -#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) +#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) /* - * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first + * Use Intel SSE 4.2 instructions, but perform a runtime check first * to check that they are available. 
*/ #define COMP_CRC32C(crc, data, len) \ @@ -83,9 +89,6 @@ extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) #ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); #endif -#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); -#endif #else /* diff --git a/src/port/Makefile b/src/port/Makefile index f205c2c9c5..e59e097e03 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -89,11 +89,6 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) -# all versions of pg_crc32c_armv8.o need CFLAGS_CRC -pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) -pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) -pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) - # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index a0d0a9583a..0dd794f28c 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -88,9 +88,8 @@ replace_funcs_pos = [ # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], - ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], - ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'], + ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C'], # loongarch ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index d8fae510cf..5963f112d7 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -2,6 +2,7 @@ * * pg_crc32c_armv8.c * Compute CRC-32C checksum using ARMv8 CRC Extension instructions + * with or without ARMv8 VMULL Extension instructions * * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of 
California @@ -15,11 +16,13 @@ #include "c.h" #include +#include #include "port/pg_crc32c.h" -pg_crc32c -pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) +__attribute__((target("+crc+crypto"))) +static inline pg_crc32c +pg_comp_crc32c_helper(pg_crc32c crc, const void *data, size_t len, bool use_vmull) { const unsigned char *p = data; const unsigned char *pend = p + len; @@ -48,6 +51,42 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) p += 4; } + if (use_vmull) + { +/* + * Crc32c parallel computation Input data is divided into three + * equal-sized blocks. Block length : 42 words(42 * 8 bytes). + * CRC0: 0 ~ 41 * 8, + * CRC1: 42 * 8 ~ (42 * 2 - 1) * 8, + * CRC2: 42 * 2 * 8 ~ (42 * 3 - 1) * 8. + */ + while (p + 1024 <= pend) + { +#define BLOCK_LEN 42 + const uint64_t *in64 = (const uint64_t *) (p); + uint32_t crc0 = crc, + crc1 = 0, + crc2 = 0; + + for (int i = 0; i < BLOCK_LEN; i++, in64++) + { + crc0 = __crc32cd(crc0, *(in64)); + crc1 = __crc32cd(crc1, *(in64 + BLOCK_LEN)); + crc2 = __crc32cd(crc2, *(in64 + BLOCK_LEN * 2)); + } + in64 += BLOCK_LEN * 2; + crc0 = __crc32cd(0, vmull_p64(crc0, 0xcec3662e)); + crc1 = __crc32cd(0, vmull_p64(crc1, 0xa60ce07b)); + crc = crc0 ^ crc1 ^ crc2; + + crc = __crc32cd(crc, *in64++); + crc = __crc32cd(crc, *in64++); + + p += 1024; +#undef BLOCK_LEN + } + } + /* Process eight bytes at a time, as far as we can. 
*/ while (p + 8 <= pend) { @@ -73,3 +112,17 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) return crc; } + +#if defined(USE_ARMV8_VMULL) +pg_crc32c +pg_comp_crc32c_with_vmull_armv8(pg_crc32c crc, const void *data, size_t len) +{ + return pg_comp_crc32c_helper(crc, data, len, true); +} +#endif + +pg_crc32c +pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) +{ + return pg_comp_crc32c_helper(crc, data, len, false); +} diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index 0fdddccaf7..07f1eb4bfe 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -4,8 +4,8 @@ * Choose between ARMv8 and software CRC-32C implementation. * * On first call, checks if the CPU we're running on supports the ARMv8 - * CRC Extension. If it does, use the special instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation + * CRC Extension and VMULL Extension. If it does, use the special instructions + * for CRC-32C computation. Otherwise, fall back to the pure software implementation * (slicing-by-8). 
* * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group @@ -26,6 +26,7 @@ #include #include +#include #include "port/pg_crc32c.h" @@ -77,6 +78,36 @@ pg_crc32c_armv8_available(void) return (result > 0); } +#if defined(USE_ARMV8_VMULL) +__attribute__((target("+crypto"))) +static bool +pg_vmull_armv8_available(void) +{ + int result; + + pqsignal(SIGILL, illegal_instruction_handler); + if (sigsetjmp(illegal_instruction_jump, 1) == 0) + { + result = ((uint64_t) vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678); + } + else + { + /* We got the SIGILL trap */ + result = -1; + } + pqsignal(SIGILL, SIG_DFL); + +#ifndef FRONTEND + /* We don't expect this case, so complain loudly */ + if (result == 0) + elog(ERROR, "vmull_p64 hardware results error"); + + elog(DEBUG1, "using armv8 vmull_p64 hardware = %d", (result > 0)); +#endif + return (result > 0); +} +#endif + /* * This gets called on the first call. It replaces the function pointer * so that subsequent calls are routed directly to the chosen implementation. @@ -85,9 +116,24 @@ static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { if (pg_crc32c_armv8_available()) + { +#if defined(USE_ARMV8_VMULL) + if (pg_vmull_armv8_available()) + { + pg_comp_crc32c = pg_comp_crc32c_with_vmull_armv8; + } + else + { + pg_comp_crc32c = pg_comp_crc32c_armv8; + } +#else pg_comp_crc32c = pg_comp_crc32c_armv8; +#endif + } else + { pg_comp_crc32c = pg_comp_crc32c_sb8; + } return pg_comp_crc32c(crc, data, len); } -- 2.34.1