From 9ce09b6abaf8e1241e536edfd863ad6dc1f85929 Mon Sep 17 00:00:00 2001 From: Chiranmoy Bhattacharya Date: Thu, 9 Jan 2025 14:13:22 +0530 Subject: [PATCH v3] SVE support for popcount and popcount masked --- config/c-compiler.m4 | 42 ++++++++++ configure | 50 +++++++++++ configure.ac | 9 ++ meson.build | 28 +++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_bitutils.h | 14 ++++ src/port/Makefile | 1 + src/port/meson.build | 1 + src/port/pg_bitutils.c | 10 ++- src/port/pg_popcount_sve.c | 149 +++++++++++++++++++++++++++++++++ 10 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 src/port/pg_popcount_sve.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c1..6c86811e8c 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -704,3 +704,45 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_AVX512_POPCNT_INTRINSICS + +# PGAC_ARM_SVE_POPCNT_INTRINSICS +# ------------------------------ +# Check if the compiler supports the ARM SVE popcount instructions using the +# svdup_u64, svptrue_b64, svcnt_z, svcnt_x, svadd_x, svaddv, and svwhilelt_b8 +# intrinsic functions. +# +# If the intrinsics are supported, sets pgac_arm_sve_popcnt_intrinsics. 
+AC_DEFUN([PGAC_ARM_SVE_POPCNT_INTRINSICS], +[ + AC_CACHE_CHECK([for svdup_u64 and other intrinsics with CFLAGS=$1], + [pgac_cv_arm_sve_popcnt_intrinsics], + [ + pgac_save_CFLAGS=$CFLAGS + CFLAGS="$pgac_save_CFLAGS $1" + + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>], + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + [svbool_t predicate = svptrue_b64(); + svuint64_t segment = svdup_u64(0), accum = svdup_u64(0); + const char *buf = NULL; + uint32_t num_vals_segment = svlen_u64(segment); + + predicate = svwhilelt_b8(0, 128); + segment = svld1(predicate, (const uint64_t *)buf); + accum = svadd_x(predicate, accum, svcnt_x(predicate, segment)); + uint64_t popcnt = svaddv(predicate, accum); + + /* Return computed value, to prevent the above being optimized away */ + return popcnt;])], + [pgac_cv_arm_sve_popcnt_intrinsics=yes], + [pgac_cv_arm_sve_popcnt_intrinsics=no]) + + CFLAGS="$pgac_save_CFLAGS" + ]) + + if test x"$pgac_cv_arm_sve_popcnt_intrinsics" = x"yes"; then + pgac_arm_sve_popcnt_intrinsics=yes + fi +]) diff --git a/configure b/configure index a0b5e10ca3..e8ac7b299f 100755 --- a/configure +++ b/configure @@ -17159,6 +17159,56 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi +# Check for ARM SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SVE intrinsic svcnt_u64" >&5 +$as_echo_n "checking for SVE intrinsic svcnt_u64... " >&6; } +if ${pgac_cv_arm_sve_popcnt_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS + CFLAGS="$pgac_save_CFLAGS " + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <arm_sve.h> + +#if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) +#endif +int +main () +{ + svbool_t predicate = svptrue_b64(); + svuint64_t segment, accum = svdup_u64(0); + uint64_t numVals = svlen_u64(segment); + + svuint64_t counts = svcnt_u64_z(predicate, segment); + accum = svadd_u64_m(predicate, accum, counts); + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_arm_sve_popcnt_intrinsics=yes +else + pgac_cv_arm_sve_popcnt_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_popcnt_intrinsics" >&5 +$as_echo "$pgac_cv_arm_sve_popcnt_intrinsics" >&6; } +if test x"$pgac_cv_arm_sve_popcnt_intrinsics" = x"yes"; then + pgac_arm_sve_popcnt_intrinsics=yes +fi + +if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then + $as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h + +fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 diff --git a/configure.ac b/configure.ac index d713360f34..ba069ebb29 100644 --- a/configure.ac +++ b/configure.ac @@ -2021,6 +2021,15 @@ if test x"$host_cpu" = x"x86_64"; then fi fi +# Check for ARM SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + PGAC_ARM_SVE_POPCNT_INTRINSICS() + if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then + AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARM popcount instructions.]) + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. 
# PGAC_SSE42_CRC32_INTRINSICS() diff --git a/meson.build b/meson.build index cfd654d291..da04f8813d 100644 --- a/meson.build +++ b/meson.build @@ -2194,6 +2194,34 @@ int main(void) endif +############################################################### +# Check for the availability of ARM SVE popcount intrinsics. +############################################################### + +if host_cpu == 'aarch64' + + prog = ''' +#include <arm_sve.h> + +#if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("arch=armv8-a+sve"))) +#endif +int main(void) +{ + const svuint64_t val = svdup_u64(0xFFFFFFFFFFFFFFFF); + svuint64_t popcnt = svcntb(val); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; +} +''' + + if cc.links(prog, name: 'ARM SVE pop count', args: test_c_args) + cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Select CRC-32C implementation. # diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07b2f798ab..29c32bbbbe 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -648,6 +648,9 @@ /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE popcount instructions with a runtime check. */ +#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. (--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index f8d6fb50b6..3a09bb5d16 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -298,6 +298,14 @@ pg_ceil_log2_64(uint64 num) #endif #endif +/* + * On AArch64, try using SVE popcount instructions, but only if + * we can verify that the CPU supports it via a runtime check. 
+ */ +#if defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) +#define TRY_POPCNT_FAST 1 +#endif + #ifdef TRY_POPCNT_FAST /* Attempt to use the POPCNT instruction, but perform a runtime check first */ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); @@ -317,6 +325,12 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern bool pg_popcount_sve_available(void); +extern uint64 pg_popcount_sve(const char *buf, int bytes); +extern uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask); +#endif + #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); diff --git a/src/port/Makefile b/src/port/Makefile index 4c22431951..61a8bcec15 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -45,6 +45,7 @@ OBJS = \ path.o \ pg_bitutils.o \ pg_popcount_avx512.o \ + pg_popcount_sve.o \ pg_strong_random.o \ pgcheckdir.o \ pgmkdirp.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d4..4a3429c21a 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -8,6 +8,7 @@ pgport_sources = [ 'path.c', 'pg_bitutils.c', 'pg_popcount_avx512.c', + 'pg_popcount_sve.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 5677525693..df7cf429c5 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -135,7 +135,9 @@ pg_popcount_available(void) { unsigned int exx[4] = {0, 0, 0, 0}; -#if defined(HAVE__GET_CPUID) +#if defined(__aarch64__) + return false; /* cpuid not available in __aarch64__ */ +#elif defined(HAVE__GET_CPUID) __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUID) __cpuid(exx, 1); @@ -176,6 +178,12 @@ choose_popcount_functions(void) pg_popcount_optimized = pg_popcount_avx512; pg_popcount_masked_optimized = pg_popcount_masked_avx512; } 
+#elif USE_SVE_POPCNT_WITH_RUNTIME_CHECK + if (pg_popcount_sve_available()) + { + pg_popcount_optimized = pg_popcount_sve; + pg_popcount_masked_optimized = pg_popcount_masked_sve; + } #endif } diff --git a/src/port/pg_popcount_sve.c b/src/port/pg_popcount_sve.c new file mode 100644 index 0000000000..eea3790c32 --- /dev/null +++ b/src/port/pg_popcount_sve.c @@ -0,0 +1,149 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_sve.c + * Holds the SVE pg_popcount() implementation. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_sve.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include "port/pg_bitutils.h" + +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + +#include <arm_sve.h> + +#include <sys/auxv.h> + +/* + * Returns true if the CPU supports the instructions required for the SVE + * pg_popcount() implementation. + */ +bool +pg_popcount_sve_available(void) +{ + return getauxval(AT_HWCAP) & HWCAP_SVE; +} + +/* + * pg_popcount_sve + * Returns the number of 1-bits in buf + */ +pg_attribute_target("arch=armv8-a+sve") +uint64 +pg_popcount_sve(const char *buf, int bytes) +{ + svbool_t pred; + svuint64_t vec64, + accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + uint32 i = 0, + vec_len = svcntb(), + pre_align, + loop_bytes; + uint64 popcnt = 0; + const char *aligned = (const char *) TYPEALIGN_DOWN(sizeof(uint64_t), buf); + + /* + * For smaller inputs, aligning the buffer degrades the performance. + * Therefore, align the buffer only when the input size is sufficiently large. 
+ */ + if (aligned != buf && bytes > 4 * vec_len) + { + pre_align = aligned + sizeof(uint64_t) - buf; + pred = svwhilelt_b8(0U, pre_align); + popcnt = svaddv(pred, svcnt_z(pred, svld1(pred, (const uint8 *) buf))); + buf += pre_align; + bytes -= pre_align; + } + + pred = svptrue_b64(); + loop_bytes = bytes & ~(vec_len * 2 - 1); + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec64 = svld1(pred, (const uint64 *) (buf + i)); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64)); + vec64 = svld1(pred, (const uint64 *) (buf + i + vec_len)); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64)); + } + + /* reduce the accumulators */ + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + /* Process the last incomplete vector */ + for(; i < bytes; i += vec_len) + { + pred = svwhilelt_b8(i, (uint32) bytes); + popcnt += svaddv(pred, svcnt_z(pred, svld1(pred, (const uint8 *) (buf + i)))); + } + + return popcnt; +} + +/* + * pg_popcount_masked_sve + * Returns the number of 1-bits in buf after applying the mask + */ +pg_attribute_target("arch=armv8-a+sve") +uint64 +pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask) +{ + svbool_t pred; + svuint8_t vec8; + svuint64_t vec64, + accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + uint32 i = 0, + vec_len = svcntb(), + pre_align, + loop_bytes; + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + const char *aligned = (const char *) TYPEALIGN_DOWN(sizeof(uint64_t), buf); + + /* + * For smaller inputs, aligning the buffer degrades the performance. + * Therefore, align the buffer only when the input size is sufficiently large. 
+ */ + if (aligned != buf && bytes > 4 * vec_len) + { + pre_align = aligned + sizeof(uint64_t) - buf; + pred = svwhilelt_b8(0U, pre_align); + vec8 = svand_n_u8_m(pred, svld1(pred, (const uint8 *) buf), mask); /* load and mask */ + popcnt = svaddv(pred, svcnt_z(pred, vec8)); + buf += pre_align; + bytes -= pre_align; + } + + pred = svptrue_b64(); + loop_bytes = bytes & ~(vec_len * 2 - 1); + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec64 = svand_n_u64_x(pred, svld1(pred, (const uint64 *) (buf + i)), mask64); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64)); + vec64 = svand_n_u64_x(pred, svld1(pred, (const uint64 *) (buf + i + vec_len)), mask64); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64)); + } + + /* reduce the accumulators */ + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + /* Process the last incomplete vectors */ + for(; i < bytes; i += vec_len) + { + pred = svwhilelt_b8(i, (uint32) bytes); + vec8 = svand_n_u8_m(pred, svld1(pred, (const uint8 *) (buf + i)), mask); + popcnt += svaddv(pred, svcnt_z(pred, vec8)); + } + + return popcnt; +} + +#endif /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ -- 2.34.1