From 952412a0be1d9b39f12c86f3882cbdac04e9602a Mon Sep 17 00:00:00 2001 From: Chiranmoy Bhattacharya Date: Tue, 4 Feb 2025 14:03:28 +0530 Subject: [PATCH v4] SVE support for popcount and popcount masked --- config/c-compiler.m4 | 36 ++++++++ configure | 56 ++++++++++++ configure.ac | 9 ++ meson.build | 33 +++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_bitutils.h | 14 +++ src/port/Makefile | 1 + src/port/meson.build | 1 + src/port/pg_bitutils.c | 10 ++- src/port/pg_popcount_sve.c | 160 +++++++++++++++++++++++++++++++++ 10 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 src/port/pg_popcount_sve.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c1..c3c2d6fe29 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -704,3 +704,39 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_AVX512_POPCNT_INTRINSICS + +# PGAC_ARM_SVE_POPCNT_INTRINSICS +# ------------------------------ +# Check if the compiler supports the ARM SVE popcount instructions using the +# svdup_u64, svwhilelt_b8, svcntb, svaddv, svadd_x, svcnt_x, svld1, +# svptrue_b64 and svand_x intrinsic functions. +# +# If the intrinsics are supported, sets pgac_arm_sve_popcnt_intrinsics. 
+AC_DEFUN([PGAC_ARM_SVE_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_sve_popcnt_intrinsics])])dnl +AC_CACHE_CHECK([for svcnt_x and other intrinsics], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h> + #if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + static int sve_popcount_test(void) + { + int popcnt = 0; + const char buf@<:@sizeof(uint64_t)@:>@; + svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64(); + svuint64_t accum = svdup_u64(0), vec; + if (svcntb() > 0) + popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf))); + vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0); + accum = svadd_x(pred64, accum, svcnt_x(pred64, vec)); + popcnt += svaddv(pred64, accum); + return popcnt; + }], + [return sve_popcount_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_arm_sve_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_ARM_SVE_POPCNT_INTRINSICS diff --git a/configure b/configure index ceeef9b091..4faf7def28 100755 --- a/configure +++ b/configure @@ -17168,6 +17168,62 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi +# Check for ARM SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x and other intrinsics" >&5 +$as_echo_n "checking for svcnt_x and other intrinsics... " >&6; } +if ${pgac_cv_arm_sve_popcnt_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ #include <arm_sve.h> + #if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + static int sve_popcount_test(void) + { + int popcnt = 0; + const char buf[sizeof(uint64_t)]; + svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64(); + svuint64_t accum = svdup_u64(0), vec; + if (svcntb() > 0) + popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf))); + vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0); + accum = svadd_x(pred64, accum, svcnt_x(pred64, vec)); + popcnt += svaddv(pred64, accum); + return popcnt; + } +int +main () +{ +return sve_popcount_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_arm_sve_popcnt_intrinsics=yes +else + pgac_cv_arm_sve_popcnt_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_popcnt_intrinsics" >&5 +$as_echo "$pgac_cv_arm_sve_popcnt_intrinsics" >&6; } +if test x"$pgac_cv_arm_sve_popcnt_intrinsics" = x"yes"; then + pgac_arm_sve_popcnt_intrinsics=yes +fi + + if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then + +$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 diff --git a/configure.ac b/configure.ac index d713360f34..ba069ebb29 100644 --- a/configure.ac +++ b/configure.ac @@ -2021,6 +2021,15 @@ if test x"$host_cpu" = x"x86_64"; then fi fi +# Check for ARM SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + PGAC_ARM_SVE_POPCNT_INTRINSICS() + if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then + AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARM popcount instructions.]) + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. 
# PGAC_SSE42_CRC32_INTRINSICS() diff --git a/meson.build b/meson.build index 8e128f4982..d3c9a02abc 100644 --- a/meson.build +++ b/meson.build @@ -2194,6 +2194,39 @@ int main(void) endif +############################################################### +# Check for the availability of ARM SVE popcount intrinsics. +############################################################### + +if host_cpu == 'aarch64' + + prog = ''' +#include <arm_sve.h> +#if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) +#endif +int main () +{ + int popcnt = 0; + const char buf[sizeof(uint64_t)]; + svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64(); + svuint64_t accum = svdup_u64(0), vec; + if (svcntb() > 0) + popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf))); + vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0); + accum = svadd_x(pred64, accum, svcnt_x(pred64, vec)); + popcnt += svaddv(pred64, accum); + return popcnt; +} +''' + + if cc.links(prog, name: 'ARM SVE popcount', args: test_c_args) + cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Select CRC-32C implementation. # diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07b2f798ab..29c32bbbbe 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -648,6 +648,9 @@ /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE popcount instructions with a runtime check. */ +#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. 
(--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 62554ce685..7d771a45dc 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -298,6 +298,14 @@ pg_ceil_log2_64(uint64 num) #endif #endif +/* + * On AArch64, try using SVE popcount instructions, but only if + * we can verify that the CPU supports it via a runtime check. + */ +#if defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) +#define TRY_POPCNT_FAST 1 +#endif + #ifdef TRY_POPCNT_FAST /* Attempt to use the POPCNT instruction, but perform a runtime check first */ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); @@ -315,6 +323,12 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern bool pg_popcount_sve_available(void); +extern uint64 pg_popcount_sve(const char *buf, int bytes); +extern uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask); +#endif + #else /* Use a portable implementation -- no need for a function pointer. 
*/ extern int pg_popcount32(uint32 word); diff --git a/src/port/Makefile b/src/port/Makefile index 4c22431951..61a8bcec15 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -45,6 +45,7 @@ OBJS = \ path.o \ pg_bitutils.o \ pg_popcount_avx512.o \ + pg_popcount_sve.o \ pg_strong_random.o \ pgcheckdir.o \ pgmkdirp.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d4..4a3429c21a 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -8,6 +8,7 @@ pgport_sources = [ 'path.c', 'pg_bitutils.c', 'pg_popcount_avx512.c', + 'pg_popcount_sve.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 5677525693..df7cf429c5 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -135,7 +135,9 @@ pg_popcount_available(void) { unsigned int exx[4] = {0, 0, 0, 0}; -#if defined(HAVE__GET_CPUID) +#if defined(__aarch64__) + return false; /* cpuid not available in __aarch64__ */ +#elif defined(HAVE__GET_CPUID) __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUID) __cpuid(exx, 1); @@ -176,6 +178,12 @@ choose_popcount_functions(void) pg_popcount_optimized = pg_popcount_avx512; pg_popcount_masked_optimized = pg_popcount_masked_avx512; } +#elif defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) + if (pg_popcount_sve_available()) + { + pg_popcount_optimized = pg_popcount_sve; + pg_popcount_masked_optimized = pg_popcount_masked_sve; + } +#endif } diff --git a/src/port/pg_popcount_sve.c b/src/port/pg_popcount_sve.c new file mode 100644 index 0000000000..736fdfbf7f --- /dev/null +++ b/src/port/pg_popcount_sve.c @@ -0,0 +1,160 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_sve.c + * Holds the SVE pg_popcount() implementation. 
+ * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_sve.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + +#include "port/pg_bitutils.h" +#include + +#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL) +#include +#endif + +/* + * Returns true if the CPU supports the instructions required for the SVE + * pg_popcount() implementation. + */ +bool +pg_popcount_sve_available(void) +{ +#if defined(HAVE_ELF_AUX_INFO) && defined(__aarch64__) /* FreeBSD */ + unsigned long hwcap; + return elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0 && + (hwcap & HWCAP_SVE) != 0; +#elif defined(HAVE_GETAUXVAL) && defined(__aarch64__) /* Linux */ + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +#else + return false; +#endif +} + +/* + * pg_popcount_sve + * Returns the number of 1-bits in buf + */ +pg_attribute_target("arch=armv8-a+sve") +uint64 +pg_popcount_sve(const char *buf, int bytes) +{ + svbool_t pred; + svuint64_t vec64, + accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + uint32 i = 0, + vec_len = svcntb(), + pre_align, + loop_bytes; + uint64 popcnt = 0; + const char *aligned = (const char *) TYPEALIGN_DOWN(sizeof(uint64_t), buf); + + /* + * For smaller inputs, aligning the buffer degrades the performance. + * therefore, align the buffer when the input size is sufficiently large. 
+ */ + if (aligned != buf && bytes > 4 * vec_len) + { + pre_align = aligned + sizeof(uint64_t) - buf; + pred = svwhilelt_b8(0U, pre_align); + popcnt = svaddv(pred, svcnt_x(pred, svld1(pred, (const uint8 *) buf))); + buf += pre_align; + bytes -= pre_align; + } + + pred = svptrue_b64(); + loop_bytes = bytes & ~(vec_len * 2 - 1); + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec64 = svld1(pred, (const uint64 *) (buf + i)); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64)); + vec64 = svld1(pred, (const uint64 *) (buf + i + vec_len)); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64)); + } + + /* Reduce the accumulators */ + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + /* Process the last incomplete vector */ + for(; i < bytes; i += vec_len) + { + pred = svwhilelt_b8(i, (uint32) bytes); + popcnt += svaddv(pred, svcnt_x(pred, svld1(pred, (const uint8 *) (buf + i)))); + } + + return popcnt; +} + +/* + * pg_popcount_masked_sve + * Returns the number of 1-bits in buf after applying the mask + */ +pg_attribute_target("arch=armv8-a+sve") +uint64 +pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask) +{ + svbool_t pred; + svuint8_t vec8; + svuint64_t vec64, + accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + uint32 i = 0, + vec_len = svcntb(), + pre_align, + loop_bytes; + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + const char *aligned = (const char *) TYPEALIGN_DOWN(sizeof(uint64_t), buf); + + /* + * For smaller inputs, aligning the buffer degrades the performance. + * therefore, align the buffer when the input size is sufficiently large. 
+ */ + if (aligned != buf && bytes > 4 * vec_len) + { + pre_align = aligned + sizeof(uint64_t) - buf; + pred = svwhilelt_b8(0U, pre_align); + vec8 = svand_x(pred, svld1(pred, (const uint8 *) buf), mask); /* load and mask */ + popcnt = svaddv(pred, svcnt_x(pred, vec8)); + buf += pre_align; + bytes -= pre_align; + } + + pred = svptrue_b64(); + loop_bytes = bytes & ~(vec_len * 2 - 1); + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec64 = svand_x(pred, svld1(pred, (const uint64 *) (buf + i)), mask64); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64)); + vec64 = svand_x(pred, svld1(pred, (const uint64 *) (buf + i + vec_len)), mask64); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64)); + } + + /* Reduce the accumulators */ + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + /* Process the last incomplete vectors */ + for(; i < bytes; i += vec_len) + { + pred = svwhilelt_b8(i, (uint32) bytes); + vec8 = svand_x(pred, svld1(pred, (const uint8 *) (buf + i)), mask); + popcnt += svaddv(pred, svcnt_x(pred, vec8)); + } + + return popcnt; +} + +#endif /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ -- 2.34.1