From 45c9e42b317eb8d53d37536253c673db5362d775 Mon Sep 17 00:00:00 2001 From: Chiranmoy Bhattacharya Date: Thu, 9 Jan 2025 12:22:00 +0530 Subject: [PATCH v1] SVE support for hex encode and hex decode --- config/c-compiler.m4 | 53 ++++++++ configure | 63 ++++++++++ configure.ac | 9 ++ meson.build | 47 +++++++ src/backend/utils/adt/encode.c | 222 +++++++++++++++++++++++++++++++++ src/include/pg_config.h.in | 3 + 6 files changed, 397 insertions(+) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c1..bb22ceed17 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -704,3 +704,56 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_AVX512_POPCNT_INTRINSICS + +# PGAC_ARM_SVE_HEX_INTRINSICS +# ------------------------------ +# Check if the compiler supports the ARM SVE intrinsics required for hex coding: +# svld1, svtbl, svsel, etc. +# +# If the intrinsics are supported, sets pgac_arm_sve_hex_intrinsics. +AC_DEFUN([PGAC_ARM_SVE_HEX_INTRINSICS], +[ + AC_CACHE_CHECK([for svtbl, svlsr_z, svand_z, svcreate2, svst2, svsel and svget2 intrinsics], + [pgac_cv_arm_sve_hex_intrinsics], + [ + + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>], + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + + [ + char input[64] = {0}; + char output[64] = {0}; + svbool_t pred = svptrue_b8(), cmp1, cmp2; + svuint8_t bytes, hextbl_vec; + svuint8x2_t merged; + + /* intrinsics used in hex_encode_sve */ + hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF"); + bytes = svld1(pred, (uint8_t *) input); + bytes = svlsr_z(pred, bytes, 4); + bytes = svand_z(pred, bytes, 0xF); + merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes)); + svst2(pred, (uint8_t *) output, merged); + + /* intrinsics used in hex_decode_sve */ + bytes = svget2(svld2(pred, (uint8_t *) output), 0); + bytes = svsub_x(pred, bytes, bytes); + cmp1 = svcmplt(pred, bytes, 0); + cmp2 = 
svcmpgt(pred, bytes, 0); + bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes); + svst1(pred, output, bytes); + + /* return computed value, to prevent the above being optimized away */ + return output[0] == 0; + ])], + [pgac_cv_arm_sve_hex_intrinsics=yes], + [pgac_cv_arm_sve_hex_intrinsics=no]) + + ]) + + if test x"$pgac_cv_arm_sve_hex_intrinsics" = x"yes"; then + pgac_arm_sve_hex_intrinsics=yes + fi +]) diff --git a/configure b/configure index a0b5e10ca3..7e0c0e4c05 100755 --- a/configure +++ b/configure @@ -17159,6 +17159,69 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi +# Check SVE intrinsics for hex coding +# +if test x"$host_cpu" = x"aarch64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SVE intrinsic svtbl, svlsr_z, etc." >&5 + $as_echo_n "checking for SVE intrinsic svtbl, svlsr_z... " >&6; } +if ${pgac_cv_arm_sve_hex_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <arm_sve.h> +#if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) +#endif +int +main () +{ + char input[64] = {0}; + char output[64] = {0}; + svbool_t pred = svptrue_b8(), cmp1, cmp2; + svuint8_t bytes, hextbl_vec; + svuint8x2_t merged; + + /* intrinsics used in hex_encode_sve */ + hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF"); + bytes = svld1(pred, (uint8_t *) input); + bytes = svlsr_z(pred, bytes, 4); + bytes = svand_z(pred, bytes, 0xF); + merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes)); + svst2(pred, (uint8_t *) output, merged); + + /* intrinsics used in hex_decode_sve */ + bytes = svget2(svld2(pred, (uint8_t *) output), 0); + bytes = svsub_x(pred, bytes, bytes); + cmp1 = svcmplt(pred, bytes, 0); + cmp2 = svcmpgt(pred, bytes, 0); + bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes); + svst1(pred, output, bytes); + + /* return computed value, to prevent the above being optimized away */ + return output[0] == 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_arm_sve_hex_intrinsics=yes +else + pgac_cv_arm_sve_hex_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_hex_intrinsics" >&5 +$as_echo "$pgac_cv_arm_sve_hex_intrinsics" >&6; } + +if test x"$pgac_cv_arm_sve_hex_intrinsics" = x"yes"; then + PGAC_ARM_SVE_HEX_INTRINSICS=yes +fi + +if test x"$PGAC_ARM_SVE_HEX_INTRINSICS" = x"yes"; then + $as_echo "#define USE_SVE_WITH_RUNTIME_CHECK 1" >>confdefs.h +fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. 
# { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 diff --git a/configure.ac b/configure.ac index d713360f34..cc805667b9 100644 --- a/configure.ac +++ b/configure.ac @@ -2021,6 +2021,15 @@ if test x"$host_cpu" = x"x86_64"; then fi fi +# Check for ARM SVE intrinsics for hex coding +# +if test x"$host_cpu" = x"aarch64"; then + PGAC_ARM_SVE_HEX_INTRINSICS() + if test x"$pgac_arm_sve_hex_intrinsics" = x"yes"; then + AC_DEFINE(USE_SVE_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE instructions for hex coding with a runtime check.]) + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # PGAC_SSE42_CRC32_INTRINSICS() diff --git a/meson.build b/meson.build index cfd654d291..a0ee05bad0 100644 --- a/meson.build +++ b/meson.build @@ -2194,6 +2194,53 @@ int main(void) endif +############################################################### +# Check the availability of ARM SVE intrinsics for hex coding. +############################################################### + +if host_cpu == 'aarch64' + + prog = ''' +#include <arm_sve.h> +#if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("arch=armv8-a+sve"))) +#endif +int main(void) +{ + char input[64] = {0}; + char output[64] = {0}; + svbool_t pred = svptrue_b8(), cmp1, cmp2; + svuint8_t bytes, hextbl_vec; + svuint8x2_t merged; + + /* intrinsics used in hex_encode_sve */ + hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF"); + bytes = svld1(pred, (uint8_t *) input); + bytes = svlsr_z(pred, bytes, 4); + bytes = svand_z(pred, bytes, 0xF); + merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes)); + svst2(pred, (uint8_t *) output, merged); + + /* intrinsics used in hex_decode_sve */ + bytes = svget2(svld2(pred, (uint8_t *) output), 0); + bytes = svsub_x(pred, bytes, bytes); + cmp1 = svcmplt(pred, bytes, 0); + cmp2 = svcmpgt(pred, bytes, 0); + bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes); + svst1(pred, output, bytes); + + 
/* return computed value, to prevent the above being optimized away */ + return output[0] == 0; +} +''' + + if cc.links(prog, name: 'ARM SVE hex encoding', args: test_c_args) + cdata.set('USE_SVE_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Select CRC-32C implementation. # diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index 4a6fcb56cd..b4a78cc4e4 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -20,6 +20,10 @@ #include "utils/memutils.h" #include "varatt.h" +#ifdef USE_SVE_WITH_RUNTIME_CHECK +#include <arm_sve.h> +#include <sys/auxv.h> +#endif /* * Encoding conversion API. @@ -158,8 +162,106 @@ static const int8 hexlookup[128] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; +#ifdef USE_SVE_WITH_RUNTIME_CHECK +static uint64 hex_encode_slow(const char *src, size_t len, char *dst); +static uint64 hex_decode_slow(const char *src, size_t len, char *dst); +static uint64 hex_decode_safe_slow(const char *src, size_t len, char *dst, + Node *escontext); +static uint64 hex_encode_sve(const char *src, size_t len, char *dst); +static uint64 hex_decode_sve(const char *src, size_t len, char *dst); +static uint64 hex_decode_safe_sve(const char *src, size_t len, char *dst, + Node *escontext); +static uint64 hex_encode_choose(const char *src, size_t len, char *dst); +static uint64 hex_decode_choose(const char *src, size_t len, char *dst); +static uint64 hex_decode_safe_choose(const char *src, size_t len, char *dst, + Node *escontext); +static uint64 (*hex_encode_optimized) + (const char *src, size_t len, char *dst) = hex_encode_choose; +static uint64 (*hex_decode_optimized) + (const char *src, size_t len, char *dst) = hex_decode_choose; +static uint64 (*hex_decode_safe_optimized) + (const char *src, size_t len, char *dst, Node *escontext) = + hex_decode_safe_choose; + +/* + * Returns true if the CPU supports SVE instructions. 
+ */ +static inline bool +check_sve_support(void) +{ + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +} + +static inline void +choose_hex_functions(void) +{ + if (check_sve_support()) + { + hex_encode_optimized = hex_encode_sve; + hex_decode_optimized = hex_decode_sve; + hex_decode_safe_optimized = hex_decode_safe_sve; + } + else + { + hex_encode_optimized = hex_encode_slow; + hex_decode_optimized = hex_decode_slow; + hex_decode_safe_optimized = hex_decode_safe_slow; + } +} + +static uint64 +hex_encode_choose(const char *src, size_t len, char *dst) +{ + choose_hex_functions(); /* rebinds hex_encode_optimized */ + return hex_encode_optimized(src, len, dst); +} + +static uint64 +hex_decode_choose(const char *src, size_t len, char *dst) +{ + choose_hex_functions(); /* rebinds hex_decode_optimized */ + return hex_decode_optimized(src, len, dst); +} + +static uint64 +hex_decode_safe_choose(const char *src, size_t len, char *dst, Node *escontext) +{ + choose_hex_functions(); /* rebinds hex_decode_safe_optimized */ + return hex_decode_safe_optimized(src, len, dst, escontext); +} + uint64 hex_encode(const char *src, size_t len, char *dst) +{ + if (len < 16) + return hex_encode_slow(src, len, dst); /* inputs under 16 bytes always use the scalar code */ + return hex_encode_optimized(src, len, dst); +} + +uint64 +hex_decode(const char *src, size_t len, char *dst) +{ + if (len < 32) + return hex_decode_slow(src, len, dst); /* inputs under 32 bytes always use the scalar code */ + return hex_decode_optimized(src, len, dst); +} + +uint64 +hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) +{ + if (len < 32) + return hex_decode_safe_slow(src, len, dst, escontext); /* inputs under 32 bytes always use the scalar code */ + return hex_decode_safe_optimized(src, len, dst, escontext); +} +#endif /* USE_SVE_WITH_RUNTIME_CHECK */ + +#ifdef USE_SVE_WITH_RUNTIME_CHECK +uint64 +hex_encode_slow(const char *src, size_t len, char *dst) +#else +uint64 +hex_encode(const char *src, size_t len, char *dst) +#endif { const char *end = src + len; @@ -186,14 +288,24 @@ get_hex(const char *cp, char *out) return (res >= 0); } +#ifdef USE_SVE_WITH_RUNTIME_CHECK +uint64 +hex_decode_slow(const char *src, size_t len, char *dst) +#else uint64 
hex_decode(const char *src, size_t len, char *dst) +#endif { return hex_decode_safe(src, len, dst, NULL); } +#ifdef USE_SVE_WITH_RUNTIME_CHECK +uint64 +hex_decode_safe_slow(const char *src, size_t len, char *dst, Node *escontext) +#else uint64 hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) +#endif { const char *s, *srcend; @@ -233,6 +345,116 @@ hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext) return p - dst; } +#ifdef USE_SVE_WITH_RUNTIME_CHECK +/* + * ARM SVE implementation of hex_encode and hex_decode. + */ + +pg_attribute_target("arch=armv8-a+sve") +uint64 +hex_encode_sve(const char *src, size_t len, char *dst) +{ + const char hextbl[] = "0123456789abcdef"; + svbool_t pred; + svuint8_t bytes, + high, + low, + hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8 *) hextbl); + svuint8x2_t merged; + uint32 vec_len = svcntb(); + + for (size_t i = 0; i < len; i += vec_len) + { + pred = svwhilelt_b8(i, len); + bytes = svld1(pred, (uint8 *) src); + high = svlsr_z(pred, bytes, 4); /* high nibble of the byte */ + low = svand_z(pred, bytes, 0xF); /* low nibble of the byte */ + + /* look up the hex digit for each nibble and store the pairs interleaved */ + merged = svcreate2(svtbl(hextbl_vec, high), svtbl(hextbl_vec, low)); + svst2(pred, (uint8 *) dst, merged); + + dst += 2 * vec_len; + src += vec_len; + } + + return (uint64) len * 2; +} + +pg_attribute_target("arch=armv8-a+sve") +static inline bool +get_hex_sve(svbool_t pred, svuint8_t vec, svuint8_t *res) +{ + svuint8_t dgt_vec = svsub_x(pred, vec, 48), /* vec - '0' */ + cap_vec = svsub_x(pred, vec, 55), /* vec - ('A' - 10) */ + sml_vec = svsub_x(pred, vec, 87), /* vec - ('a' - 10) */ + alpha_vec; + svbool_t dgt_bool = svcmplt(pred, dgt_vec, 10), + cap_bool = svcmplt(pred, cap_vec, 16), + valid_alpha; + + alpha_vec = svsel(cap_bool, cap_vec, sml_vec); + valid_alpha = svand_z(pred, svcmpgt(pred, alpha_vec, 9), + svcmplt(pred, alpha_vec, 16)); + + if (svptest_any(pred, svnot_z(pred, svorr_z(pred, dgt_bool, valid_alpha)))) + return false; /* invalid 
hex digit */ + + *res = svsel(dgt_bool, dgt_vec, alpha_vec); + return true; +} + +uint64 +hex_decode_sve(const char *src, size_t len, char *dst) +{ + return hex_decode_safe_sve(src, len, dst, NULL); +} + +pg_attribute_target("arch=armv8-a+sve") +uint64 +hex_decode_safe_sve(const char *src, size_t len, char *dst, Node *escontext) +{ + svbool_t pred; + svuint8x2_t bytes; + svuint8_t high, + low; + uint32 processed; + size_t i = 0, + loop_bytes = len & ~31; + const char *p = dst; + + while (i < loop_bytes) + { + pred = svwhilelt_b8(i / 2, len / 2); + bytes = svld2(pred, (uint8 *) src); + high = svget2(bytes, 0); /* hex digit for high nibble */ + low = svget2(bytes, 1); /* hex digit for low nibble */ + + /* stop vectorizing if any byte is below '0'; the scalar path below takes over */ + if (svptest_any(pred, svorr_z(pred, svcmplt(pred, high, '0'), + svcmplt(pred, low, '0')))) + break; + + if (!get_hex_sve(pred, high, &high) || !get_hex_sve(pred, low, &low)) + ereturn(escontext, 0, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid hexadecimal digit"))); + + /* combine high and low nibble to form the byte and store in dst */ + svst1(pred, (uint8 *) dst, svorr_x(pred, svlsl_x(pred, high, 4), low)); + + processed = svcntp_b8(pred, pred) * 2; + src += processed; + i += processed; + dst += processed / 2; + } + + if (i < len) /* hand the unprocessed remainder to the scalar path */ + return dst - p + hex_decode_safe_slow(src, len - i, dst, escontext); + + return dst - p; +} +#endif /* USE_SVE_WITH_RUNTIME_CHECK */ + static uint64 hex_enc_len(const char *src, size_t srclen) { diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07b2f798ab..b5096c11f4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -648,6 +648,9 @@ /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE instructions for hex coding with a runtime check. 
*/ +#undef USE_SVE_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. (--with-bonjour) */ #undef USE_BONJOUR -- 2.34.1