From d41ac63a6f4dc71df5894a31a9d0b0b5572816ae Mon Sep 17 00:00:00 2001
From: Yuqi Gu
Date: Mon, 8 Jan 2018 03:03:31 +0000
Subject: [PATCH] Optimize Arm64 crc32c implementation in Postgresql

Providing the ARM64v8 crc32 Interfaces to optimize the performance on ARM64 Platform.

Change-Id: I3af7e7e6a9f36936e7c16c5863a7c3e87e911cbf
Signed-off-by: Yuqi Gu
---
 config/c-compiler.m4         | 20 +++++++++++
 configure                    | 82 ++++++++++++++++++++++++++++++++++++++------
 configure.in                 | 24 +++++++++----
 src/include/pg_config.h.in   |  3 ++
 src/include/port/pg_crc32c.h | 10 +++++-
 src/port/pg_crc32c_choose.c  | 20 +++++++++++
 src/port/pg_crc32c_sb8.c     | 47 +++++++++++++++++++++++++
 7 files changed, 188 insertions(+), 18 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 076656c..9cd6270 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -595,3 +595,23 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
+
+# PGAC_ARM64CE_CRC32_INTRINSICS
+# -----------------------------
+# Check whether the ARMv8 CRC32C code can be built for the target: it must
+# be AArch64 and <sys/auxv.h> must provide getauxval() for the runtime check.
+# On success, set pgac_arm64ce_crc32_intrinsics=yes.
+AC_DEFUN([PGAC_ARM64CE_CRC32_INTRINSICS],
+[AC_CACHE_CHECK([for Arm64ce CRC32], [pgac_cv_arm64ce_crc32_intrinsics],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <sys/auxv.h>],
+  [unsigned long hwcap = getauxval(AT_HWCAP);
+#ifndef __aarch64__
+#error ARMv8 CRC32C instructions require an AArch64 target
+#endif
+  return hwcap == 0;])],
+  [pgac_cv_arm64ce_crc32_intrinsics="yes"],
+  [pgac_cv_arm64ce_crc32_intrinsics="no"])])
+if test x"$pgac_cv_arm64ce_crc32_intrinsics" = x"yes"; then
+  pgac_arm64ce_crc32_intrinsics=yes
+fi
+])# PGAC_ARM64CE_CRC32_INTRINSICS
diff --git a/configure b/configure
index 45221e1..1c7f0b3 100755
--- a/configure
+++ b/configure
@@ -777,6 +777,7 @@ infodir
 docdir
 oldincludedir
 includedir
+runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -904,6 +905,7 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1156,6 +1158,15 @@ do
   | -silent | --silent | --silen | --sile | --sil)
     silent=yes ;;
 
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+    ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+    runstatedir=$ac_optarg ;;
+
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
     ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1293,7 +1304,7 @@ fi
 for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
 		datadir sysconfdir sharedstatedir localstatedir includedir \
 		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-		libdir localedir mandir
+		libdir localedir mandir runstatedir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1446,6 +1457,7 @@ Fine tuning of the installation directories:
   --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIR            object code libraries [EPREFIX/lib]
   --includedir=DIR        C header files [PREFIX/include]
   --oldincludedir=DIR     C header files for non-gcc [/usr/include]
@@ -12655,7 +12667,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12701,7 +12713,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12725,7 +12737,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12770,7 +12782,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12794,7 +12806,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -15449,6 +15461,41 @@ if ac_fn_c_try_compile "$LINENO"; then :
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Arm64ce CRC32" >&5
+$as_echo_n "checking for Arm64ce CRC32... " >&6; }
+if ${pgac_cv_arm64ce_crc32_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/auxv.h>
+int
+main ()
+{
+unsigned long hwcap = getauxval(AT_HWCAP);
+#ifndef __aarch64__
+#error ARMv8 CRC32C instructions require an AArch64 target
+#endif
+  return hwcap == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm64ce_crc32_intrinsics="yes"
+else
+  pgac_cv_arm64ce_crc32_intrinsics="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm64ce_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_arm64ce_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_arm64ce_crc32_intrinsics" = x"yes"; then
+  pgac_arm64ce_crc32_intrinsics=yes
+fi
+
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has SSE 4.2 instructions, we can use the
@@ -15468,9 +15515,13 @@ if test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHEC
     if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
       USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
     else
-      # fall back to slicing-by-8 algorithm which doesn't require any special
-      # CPU support.
-      USE_SLICING_BY_8_CRC32C=1
+      if test x"$pgac_arm64ce_crc32_intrinsics" = x"yes"; then
+        USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK=1
+      else
+        # fall back to slicing-by-8 algorithm which doesn't require any special
+        # CPU support.
+        USE_SLICING_BY_8_CRC32C=1
+      fi
     fi
   fi
 fi
@@ -15494,12 +15545,21 @@ $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
+    if test x"$USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+
+$as_echo "#define USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_choose.o"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARM64 CE with runtime check" >&5
+$as_echo "ARM64 CE with runtime check" >&6; }
+    else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-    PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+    fi
   fi
 fi
 
diff --git a/configure.in b/configure.in
index 4d26034..84ebf53 100644
--- a/configure.in
+++ b/configure.in
@@ -1900,6 +1900,8 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [
 #endif
 ])], [SSE4_2_TARGETED=1])
 
+PGAC_ARM64CE_CRC32_INTRINSICS
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has SSE 4.2 instructions, we can use the
@@ -1919,9 +1921,13 @@ if test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHEC
     if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
       USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
     else
-      # fall back to slicing-by-8 algorithm which doesn't require any special
-      # CPU support.
-      USE_SLICING_BY_8_CRC32C=1
+      if test x"$pgac_arm64ce_crc32_intrinsics" = x"yes"; then
+        USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK=1
+      else
+        # fall back to slicing-by-8 algorithm which doesn't require any special
+        # CPU support.
+        USE_SLICING_BY_8_CRC32C=1
+      fi
     fi
   fi
 fi
@@ -1938,9 +1944,15 @@ else
     PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_choose.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
-    AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
-    PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-    AC_MSG_RESULT(slicing-by-8)
+    if test x"$USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+      AC_DEFINE(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARM64 CRC instructions with a runtime check.])
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_choose.o"
+      AC_MSG_RESULT(ARM64 CE with runtime check)
+    else
+      AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+      AC_MSG_RESULT(slicing-by-8)
+    fi
   fi
 fi
 AC_SUBST(PG_CRC32C_OBJS)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index f98f773..ae2cdf1 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -871,6 +871,9 @@
 /* Define to 1 to use Intel SSSE 4.2 CRC instructions with a runtime check. */
 #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use ARM64 CRC instructions with a runtime check. */
+#undef USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to build with systemd support. (--with-systemd) */
 #undef USE_SYSTEMD
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index ae2701e..50405a5 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -49,7 +49,8 @@ typedef uint32 pg_crc32c;
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK)
+
 /*
  * Use SSE4.2 instructions, but perform a runtime check first to check that
  * they are available.
@@ -62,6 +63,13 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 
+/* ARMv8 CRC32C instructions (Castagnoli polynomial 0x1EDC6F41); takes the
+ * same arguments as pg_comp_crc32c_sb8.
+ */
+#ifdef USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_arm64(pg_crc32c crc, const void *data, size_t len);
+#endif
+
 #else
 /*
  * Use slicing-by-8 algorithm.
diff --git a/src/port/pg_crc32c_choose.c b/src/port/pg_crc32c_choose.c
index 40bee67..d3682ad 100644
--- a/src/port/pg_crc32c_choose.c
+++ b/src/port/pg_crc32c_choose.c
@@ -29,6 +29,20 @@
 
 #include "port/pg_crc32c.h"
 
+#ifdef USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK
+#include <sys/auxv.h>		/* getauxval(), AT_HWCAP */
+#include <asm/hwcap.h>		/* HWCAP_CRC32 */
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+
+static bool
+pg_crc32c_arm64ce_available(void) {
+	unsigned long auxv = getauxval(AT_HWCAP);	/* hardware capability bits */
+	return (auxv & HWCAP_CRC32) != 0;	/* true if the CRC32 bit is set */
+}
+
+#else
 static bool
 pg_crc32c_sse42_available(void)
 {
@@ -44,6 +58,7 @@ pg_crc32c_sse42_available(void)
 
 	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
 }
+#endif
 
 /*
  * This gets called on the first call. It replaces the function pointer
@@ -52,8 +67,13 @@ pg_crc32c_sse42_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+#if defined(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK)
+	if (pg_crc32c_arm64ce_available())
+		pg_comp_crc32c = pg_comp_crc32c_arm64;
+#else
 	if (pg_crc32c_sse42_available())
 		pg_comp_crc32c = pg_comp_crc32c_sse42;
+#endif
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
 
diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c
index 5205ba9..fd9dd93 100644
--- a/src/port/pg_crc32c_sb8.c
+++ b/src/port/pg_crc32c_sb8.c
@@ -22,6 +22,53 @@
 
 #include "port/pg_crc32c.h"
 
+#if defined(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK)
+asm(".arch_extension crc");
+#define LDP(x,y,p) asm("ldp %x[a], %x[b], [%x[c]], #16" : [a]"=r"(x),[b]"=r"(y),[c]"+r"(p))
+/* CRC32C: Castagnoli polynomial 0x1EDC6F41 */
+#define CRC32CX(crc,value) asm("crc32cx %w[c], %w[c], %x[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+#define CRC32CW(crc,value) asm("crc32cw %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+#define CRC32CH(crc,value) asm("crc32ch %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+#define CRC32CB(crc,value) asm("crc32cb %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+/* Fold len bytes at data into crc using the crc32c* instructions above. */
+pg_crc32c
+pg_comp_crc32c_arm64(pg_crc32c crc, const void *data, size_t len) {
+	uint64 p0, p1;
+	pg_crc32c crc32_c = crc;
+	long length = len;
+	const unsigned char *p_buf = data;
+
+	/* Allow crc instructions in asm */
+	asm(".cpu generic+crc");
+	while ((length -= 2*sizeof(uint64)) >= 0) {	/* 16 bytes per iteration */
+		LDP(p0, p1, p_buf);	/* loads two uint64 and advances p_buf by 16 */
+		CRC32CX(crc32_c,p0);
+		CRC32CX(crc32_c,p1);
+	}
+	/* length is now negative, but its low 4 bits still equal len % 16 */
+	if (length & sizeof(uint64)) {
+		CRC32CX(crc32_c, *(const uint64 *) p_buf);
+		p_buf += sizeof(uint64);
+	}
+
+	if (length & sizeof(uint32)) {
+		CRC32CW(crc32_c, *(const uint32 *) p_buf);	/* load exactly 4 bytes */
+		p_buf += sizeof(uint32);
+	}
+
+	if (length & sizeof(uint16)) {
+		CRC32CH(crc32_c, *(const uint16 *) p_buf);
+		p_buf += sizeof(uint16);
+	}
+
+	if (length & sizeof(uint8)) {
+		CRC32CB(crc32_c, *p_buf);
+	}
+
+	return crc32_c;
+}
+#endif
+
 static const uint32 pg_crc32c_table[8][256];
 
 /* Accumulate one input byte */
-- 
2.7.4