From 2094bc7f60db93693f2c054e9044d8baa128bb8f Mon Sep 17 00:00:00 2001
From: Chiranmoy Bhattacharya
Date: Wed, 22 Jan 2025 15:52:40 +0530
Subject: [PATCH v2] SVE support for hex encode and hex decode

Add ARM SVE implementations of hex_encode() and hex_decode_safe() for
aarch64.

At build time, configure/meson check that the compiler accepts the SVE
intrinsics used here and, if so, define USE_SVE_WITH_RUNTIME_CHECK.

At run time, the hex_*_optimized function pointers initially point at
*_choose functions.  The first call tests getauxval(AT_HWCAP) & HWCAP_SVE
and repoints all three pointers at either the SVE or the scalar ("slow")
implementations.

Inputs shorter than 16 bytes (encode) or 32 bytes (decode) always use the
scalar code.  The SVE decoder stops at the first vector containing a byte
that is not a hex digit and hands the remaining input (including a
trailing odd byte) to the scalar decoder.
---
 config/c-compiler.m4           |  53 ++++++++
 configure                      |  63 +++++++++
 configure.ac                   |   9 ++
 meson.build                    |  47 +++++++
 src/backend/utils/adt/encode.c | 241 +++++++++++++++++++++++++++++++++
 src/include/pg_config.h.in     |   3 +
 6 files changed, 416 insertions(+)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 8534cc54c1..bb22ceed17 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -704,3 +704,56 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_ARM_SVE_HEX_INTRINSICS
+# ------------------------------
+# Check if the compiler supports the ARM SVE intrinsics required for hex coding:
+# svld1, svtbl, svsel, etc.
+#
+# If the intrinsics are supported, sets pgac_arm_sve_hex_intrinsics.
+AC_DEFUN([PGAC_ARM_SVE_HEX_INTRINSICS],
+[
+  AC_CACHE_CHECK([for svtbl, svlsr_z, svand_z, svcreate2, svst2, svsel and svget2 intrinsics],
+    [pgac_cv_arm_sve_hex_intrinsics],
+    [
+
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>],
+      #if defined(__has_attribute) && __has_attribute (target)
+        __attribute__((target("arch=armv8-a+sve")))
+      #endif
+
+      [
+        char input[64] = {0};
+        char output[64] = {0};
+        svbool_t pred = svptrue_b8(), cmp1, cmp2;
+        svuint8_t bytes, hextbl_vec;
+        svuint8x2_t merged;
+
+        /* intrinsics used in hex_encode_sve */
+        hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF");
+        bytes = svld1(pred, (uint8_t *) input);
+        bytes = svlsr_z(pred, bytes, 4);
+        bytes = svand_z(pred, bytes, 0xF);
+        merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes));
+        svst2(pred, (uint8_t *) output, merged);
+
+        /* intrinsics used in hex_decode_sve */
+        bytes = svget2(svld2(pred, (uint8_t *) output), 0);
+        bytes = svsub_x(pred, bytes, bytes);
+        cmp1 = svcmplt(pred, bytes, 0);
+        cmp2 = svcmpgt(pred, bytes, 0);
+        bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes);
+        svst1(pred, output, bytes);
+
+        /* return computed value, to prevent the above being optimized away */
+        return output[0] == 0;
+      ])],
+      [pgac_cv_arm_sve_hex_intrinsics=yes],
+      [pgac_cv_arm_sve_hex_intrinsics=no])
+
+    ])
+
+  if test x"$pgac_cv_arm_sve_hex_intrinsics" = x"yes"; then
+    pgac_arm_sve_hex_intrinsics=yes
+  fi
+])
diff --git a/configure b/configure
index ceeef9b091..e634feec02 100755
--- a/configure
+++ b/configure
@@ -17168,6 +17168,69 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
   fi
 fi
 
+# Check SVE intrinsics for hex coding
+#
+if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SVE intrinsic svtbl, svlsr_z, etc." >&5
+  $as_echo_n "checking for SVE intrinsic svtbl, svlsr_z... " >&6; }
+if ${pgac_cv_arm_sve_hex_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+#if defined(__has_attribute) && __has_attribute(target)
+  __attribute__((target("arch=armv8-a+sve")))
+#endif
+int
+main ()
+{
+  char input[64] = {0};
+  char output[64] = {0};
+  svbool_t pred = svptrue_b8(), cmp1, cmp2;
+  svuint8_t bytes, hextbl_vec;
+  svuint8x2_t merged;
+
+  /* intrinsics used in hex_encode_sve */
+  hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF");
+  bytes = svld1(pred, (uint8_t *) input);
+  bytes = svlsr_z(pred, bytes, 4);
+  bytes = svand_z(pred, bytes, 0xF);
+  merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes));
+  svst2(pred, (uint8_t *) output, merged);
+
+  /* intrinsics used in hex_decode_sve */
+  bytes = svget2(svld2(pred, (uint8_t *) output), 0);
+  bytes = svsub_x(pred, bytes, bytes);
+  cmp1 = svcmplt(pred, bytes, 0);
+  cmp2 = svcmpgt(pred, bytes, 0);
+  bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes);
+  svst1(pred, output, bytes);
+
+  /* return computed value, to prevent the above being optimized away */
+  return output[0] == 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_sve_hex_intrinsics=yes
+else
+  pgac_cv_arm_sve_hex_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_hex_intrinsics" >&5
+$as_echo "$pgac_cv_arm_sve_hex_intrinsics" >&6; }
+
+if test x"$pgac_cv_arm_sve_hex_intrinsics" = x"yes"; then
+  pgac_arm_sve_hex_intrinsics=yes
+fi
+
+if test x"$pgac_arm_sve_hex_intrinsics" = x"yes"; then
+  $as_echo "#define USE_SVE_WITH_RUNTIME_CHECK 1" >>confdefs.h
+fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
diff --git a/configure.ac b/configure.ac
index d713360f34..cc805667b9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2021,6 +2021,15 @@ if test x"$host_cpu" = x"x86_64"; then
   fi
 fi
 
+# Check for ARM SVE intrinsics for hex coding
+#
+if test x"$host_cpu" = x"aarch64"; then
+  PGAC_ARM_SVE_HEX_INTRINSICS()
+  if test x"$pgac_arm_sve_hex_intrinsics" = x"yes"; then
+    AC_DEFINE(USE_SVE_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE instructions for hex coding with a runtime check.])
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 32fc89f3a4..d9d13b3c55 100644
--- a/meson.build
+++ b/meson.build
@@ -2194,6 +2194,53 @@ int main(void)
 endif
 
 
+###############################################################
+# Check the availability of ARM SVE intrinsics for hex coding.
+###############################################################
+
+if host_cpu == 'aarch64'
+
+  prog = '''
+#include <arm_sve.h>
+#if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("arch=armv8-a+sve")))
+#endif
+int main(void)
+{
+    char input[64] = {0};
+    char output[64] = {0};
+    svbool_t pred = svptrue_b8(), cmp1, cmp2;
+    svuint8_t bytes, hextbl_vec;
+    svuint8x2_t merged;
+
+    /* intrinsics used in hex_encode_sve */
+    hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF");
+    bytes = svld1(pred, (uint8_t *) input);
+    bytes = svlsr_z(pred, bytes, 4);
+    bytes = svand_z(pred, bytes, 0xF);
+    merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes));
+    svst2(pred, (uint8_t *) output, merged);
+
+    /* intrinsics used in hex_decode_sve */
+    bytes = svget2(svld2(pred, (uint8_t *) output), 0);
+    bytes = svsub_x(pred, bytes, bytes);
+    cmp1 = svcmplt(pred, bytes, 0);
+    cmp2 = svcmpgt(pred, bytes, 0);
+    bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes);
+    svst1(pred, output, bytes);
+
+    /* return computed value, to prevent the above being optimized away */
+    return output[0] == 0;
+}
+'''
+
+  if cc.links(prog, name: 'ARM SVE hex encoding', args: test_c_args)
+    cdata.set('USE_SVE_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Select CRC-32C implementation.
 #
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 4ccaed815d..0fe41a8d00 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -20,6 +20,10 @@
 #include "utils/memutils.h"
 #include "varatt.h"
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+#include <arm_sve.h>
+#include <sys/auxv.h>
+#endif
 
 /*
  * Encoding conversion API.
@@ -177,8 +181,106 @@ static const int8 hexlookup[128] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 };
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+static uint64 hex_encode_slow(const char *src, size_t len, char *dst);
+static uint64 hex_decode_slow(const char *src, size_t len, char *dst);
+static uint64 hex_decode_safe_slow(const char *src, size_t len, char *dst,
+								   Node *escontext);
+static uint64 hex_encode_sve(const char *src, size_t len, char *dst);
+static uint64 hex_decode_sve(const char *src, size_t len, char *dst);
+static uint64 hex_decode_safe_sve(const char *src, size_t len, char *dst,
+								  Node *escontext);
+static uint64 hex_encode_choose(const char *src, size_t len, char *dst);
+static uint64 hex_decode_choose(const char *src, size_t len, char *dst);
+static uint64 hex_decode_safe_choose(const char *src, size_t len, char *dst,
+									 Node *escontext);
+static uint64 (*hex_encode_optimized)
+			(const char *src, size_t len, char *dst) = hex_encode_choose;
+static uint64 (*hex_decode_optimized)
+			(const char *src, size_t len, char *dst) = hex_decode_choose;
+static uint64 (*hex_decode_safe_optimized)
+			(const char *src, size_t len, char *dst, Node *escontext) =
+			hex_decode_safe_choose;
+
+/*
+ * Returns true if the CPU supports SVE instructions.
+ */
+static inline bool
+check_sve_support(void)
+{
+	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+}
+
+static inline void
+choose_hex_functions(void)
+{
+	if (check_sve_support())
+	{
+		hex_encode_optimized = hex_encode_sve;
+		hex_decode_optimized = hex_decode_sve;
+		hex_decode_safe_optimized = hex_decode_safe_sve;
+	}
+	else
+	{
+		hex_encode_optimized = hex_encode_slow;
+		hex_decode_optimized = hex_decode_slow;
+		hex_decode_safe_optimized = hex_decode_safe_slow;
+	}
+}
+
+static uint64
+hex_encode_choose(const char *src, size_t len, char *dst)
+{
+	choose_hex_functions();
+	return hex_encode_optimized(src, len, dst);
+}
+
+static uint64
+hex_decode_choose(const char *src, size_t len, char *dst)
+{
+	choose_hex_functions();
+	return hex_decode_optimized(src, len, dst);
+}
+
+static uint64
+hex_decode_safe_choose(const char *src, size_t len, char *dst, Node *escontext)
+{
+	choose_hex_functions();
+	return hex_decode_safe_optimized(src, len, dst, escontext);
+}
+
+uint64
+hex_encode(const char *src, size_t len, char *dst)
+{
+	if (len < 16)
+		return hex_encode_slow(src, len, dst);
+	return hex_encode_optimized(src, len, dst);
+}
+
+uint64
+hex_decode(const char *src, size_t len, char *dst)
+{
+	if (len < 32)
+		return hex_decode_slow(src, len, dst);
+	return hex_decode_optimized(src, len, dst);
+}
+
+uint64
+hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
+{
+	if (len < 32)
+		return hex_decode_safe_slow(src, len, dst, escontext);
+	return hex_decode_safe_optimized(src, len, dst, escontext);
+}
+#endif							/* USE_SVE_WITH_RUNTIME_CHECK */
+
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+uint64
+hex_encode_slow(const char *src, size_t len, char *dst)
+#else
 uint64
 hex_encode(const char *src, size_t len, char *dst)
+#endif
 {
 	const char *end = src + len;
 
@@ -207,14 +309,24 @@ get_hex(const char *cp, char *out)
 	return (res >= 0);
 }
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+uint64
+hex_decode_slow(const char *src, size_t len, char *dst)
+#else
 uint64
 hex_decode(const char *src, size_t len, char *dst)
+#endif
 {
 	return hex_decode_safe(src, len, dst, NULL);
 }
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+uint64
+hex_decode_safe_slow(const char *src, size_t len, char *dst, Node *escontext)
+#else
 uint64
 hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
+#endif
 {
 	const char *s,
 			   *srcend;
@@ -254,6 +366,135 @@ hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
 	return p - dst;
 }
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+/*
+ * SVE implementation of hex_encode and hex_decode.
+ */
+
+pg_attribute_target("arch=armv8-a+sve")
+uint64
+hex_encode_sve(const char *src, size_t len, char *dst)
+{
+	const char	hextbl[] = "0123456789abcdef";
+	svbool_t	pred;
+	svuint8_t	bytes,
+				high,
+				low,
+				hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8 *) hextbl);
+	svuint8x2_t merged;
+	uint32		vec_len = svcntb();
+
+	for (size_t i = 0; i < len; i += vec_len)
+	{
+		pred = svwhilelt_b8(i, len);
+		bytes = svld1(pred, (uint8 *) src);
+		high = svlsr_z(pred, bytes, 4); /* shift-right to get the high nibble */
+		low = svand_z(pred, bytes, 0xF);	/* mask high to get the low nibble */
+
+		/*
+		 * Convert the nibbles to hex digits by indexing into hextbl_vec,
+		 * for example, a nibble value of 10 indexed into hextbl_vec gives 'a'.
+		 * Finally, interleave the high and low nibbles
+		 */
+		merged = svcreate2(svtbl(hextbl_vec, high), svtbl(hextbl_vec, low));
+		svst2(pred, (uint8 *) dst, merged);
+
+		dst += 2 * vec_len;
+		src += vec_len;
+	}
+
+	return (uint64) len * 2;
+}
+
+pg_attribute_target("arch=armv8-a+sve")
+static inline bool
+get_hex_sve(svbool_t pred, svuint8_t vec, svuint8_t *res)
+{
+	/*
+	 * Convert ASCII values '0'-'9' to integers 0-9 by subtracting 48.
+	 * Similarly, convert letters 'A'-'F' and 'a'-'f' to integers 10-15.
+	 */
+	svuint8_t	dgt_vec = svsub_x(pred, vec, 48),
+				cap_vec = svsub_x(pred, vec, 55),
+				sml_vec = svsub_x(pred, vec, 87),
+				letter_vec;
+	/*
+	 * Identify valid integers in dgt_vec, cap_vec, and sml_vec.
+	 * Values 0-9 are valid in dgt_vec, while values 10-15 are valid
+	 * in cap_vec and sml_vec.
+	 */
+	svbool_t	dgt_bool = svcmplt(pred, dgt_vec, 10),
+				cap_bool = svcmplt(pred, cap_vec, 16),
+				letter_bool;
+	/*
+	 * Combine cap_vec and sml_vec and mark the valid range 10-15.
+	 */
+	letter_vec = svsel(cap_bool, cap_vec, sml_vec);
+	letter_bool = svand_z(pred, svcmpgt(pred, letter_vec, 9),
+						  svcmplt(pred, letter_vec, 16));
+	/*
+	 * Check for invalid hexadecimal digits. Each value must fall
+	 * within the range 0-9 (true in dgt_bool) or 10-15 (true in letter_bool).
+	 */
+	if (svptest_any(pred, svnot_z(pred, svorr_z(pred, dgt_bool, letter_bool))))
+		return false;
+
+	/* Finally, combine dgt_vec and letter_vec */
+	*res = svsel(dgt_bool, dgt_vec, letter_vec);
+	return true;
+}
+
+uint64
+hex_decode_sve(const char *src, size_t len, char *dst)
+{
+	return hex_decode_safe_sve(src, len, dst, NULL);
+}
+
+pg_attribute_target("arch=armv8-a+sve")
+uint64
+hex_decode_safe_sve(const char *src, size_t len, char *dst, Node *escontext)
+{
+	svbool_t	pred;
+	svuint8x2_t bytes;
+	svuint8_t	high,
+				low;
+	uint32		processed;
+	size_t		i = 0,
+				loop_bytes = len & ~1;	/* handles inputs of odd length */
+	const char *p = dst;
+
+	while (i < loop_bytes)
+	{
+		pred = svwhilelt_b8(i / 2, len / 2);
+		bytes = svld2(pred, (uint8 *) src);
+		high = svget2(bytes, 0);	/* hex digit for high nibble */
+		low = svget2(bytes, 1); /* hex digit for low nibble */
+
+		/* fall back if ASCII less than '0' is found */
+		if (svptest_any(pred, svorr_z(pred, svcmplt(pred, high, '0'),
+									  svcmplt(pred, low, '0'))))
+			break;
+
+		/* fall back if invalid hexadecimal digit is found */
+		if (!get_hex_sve(pred, high, &high) || !get_hex_sve(pred, low, &low))
+			break;
+
+		/* left-shift high and perform bitwise OR with low to form the byte */
+		svst1(pred, (uint8 *) dst, svorr_x(pred, svlsl_x(pred, high, 4), low));
+
+		processed = svcntp_b8(pred, pred) * 2;
+		src += processed;
+		i += processed;
+		dst += processed / 2;
+	}
+
+	if (i < len)				/* fall back */
+		return dst - p + hex_decode_safe_slow(src, len - i, dst, escontext);
+
+	return dst - p;
+}
+#endif							/* USE_SVE_WITH_RUNTIME_CHECK */
+
 static uint64
 hex_enc_len(const char *src, size_t srclen)
 {
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07b2f798ab..b5096c11f4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -648,6 +648,9 @@
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use SVE instructions for hex coding with a runtime check. */
+#undef USE_SVE_WITH_RUNTIME_CHECK
+
 /* Define to 1 to build with Bonjour support. (--with-bonjour) */
 #undef USE_BONJOUR
 
-- 
2.34.1