Use __attribute__((target(sse4.2))) for SSE42 CRC32C
Based on the discussion in PostgreSQL: Re: Proposal for Updating CRC32C with AVX-512 Algorithm.</messages/by-id/ZyziAXhsgfqakSI4@nathan>, this patch leverages pg_attribute_target to build the SSE42 CRC32C code using function attributes.
Raghuveer
Attachments:
v1-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchapplication/octet-stream; name=v1-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchDownload
From 6fce7c2c0340ab04c49e1d780b8cd37ef83914e7 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 7 Nov 2024 12:16:58 -0800
Subject: [PATCH v1] Use __attribute__((target(sse4.2))) for SSE42 CRC32C
Following commit f78667b, this commit builds the SSE4.2 CRC32C code using
function attribute target instead of extra compiler flags.
---
config/c-compiler.m4 | 28 +++++------
configure | 82 ++++++++-----------------------
configure.ac | 8 +--
meson.build | 14 ++++--
src/port/Makefile | 5 --
src/port/meson.build | 10 ++--
src/port/pg_crc32c_sse42.c | 4 ++
src/port/pg_crc32c_sse42_choose.c | 4 ++
8 files changed, 61 insertions(+), 94 deletions(-)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index c7eb896f14..a52e41e02f 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -608,27 +608,27 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
- [unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with function attribute], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+ __attribute__((target("sse4.2")))
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }],
+ [return crc32_sse42_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_CRC="$1"
pgac_sse42_crc32_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
-
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
# Check if the compiler supports the CRC32C instructions using the __crc32cb,
diff --git a/configure b/configure
index 3a7332f834..bc989d9d72 100755
--- a/configure
+++ b/configure
@@ -17368,87 +17368,47 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics_=yes
-else
- pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
- CFLAGS_CRC=""
- pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <nmmintrin.h>
+ __attribute__((target("sse4.2")))
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }
int
main ()
{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
+return crc32_sse42_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+ pgac_cv_sse42_crc32_intrinsics=yes
else
- pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+ pgac_cv_sse42_crc32_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
- CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
pgac_sse42_crc32_intrinsics=yes
fi
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/configure.ac b/configure.ac
index e7f4f0fc22..bb6dea7b1b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2066,13 +2066,9 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/meson.build b/meson.build
index 9eddd72a27..671ac7052b 100644
--- a/meson.build
+++ b/meson.build
@@ -2234,9 +2234,13 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
have_optimized_crc = true
else
- prog = '''
+ sse42_crc_prog = '''
#include <nmmintrin.h>
-
+#ifdef TEST_SSE42_WITH_ATTRIBUTE
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
+#endif
int main(void)
{
unsigned int crc = 0;
@@ -2247,13 +2251,13 @@ int main(void)
}
'''
- if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+ if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
args: test_c_args)
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
- elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
- args: test_c_args + ['-msse4.2'])
+ elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
+ args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
cflags_crc += '-msse4.2'
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
# all versions of pg_crc32c_armv8.o need CFLAGS_CRC
pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..c727fc60ae 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -23,6 +23,13 @@ pgport_sources = [
'tar.c',
]
+if host_cpu == 'x86' or host_cpu == 'x86_64'
+ pgport_sources += files(
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
+ )
+endif
+
if host_system == 'windows'
pgport_sources += files(
'dirmod.c',
@@ -81,9 +88,6 @@ endif
# is true
replace_funcs_pos = [
# x86/x64
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
# arm / aarch64
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..52993c5157 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -18,7 +18,10 @@
#include "port/pg_crc32c.h"
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
@@ -67,3 +70,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
return crc;
}
+#endif // SSE42_CRC
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 56d600f3a9..3ff715da63 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -18,6 +18,7 @@
*-------------------------------------------------------------------------
*/
+
#include "c.h"
#ifdef HAVE__GET_CPUID
@@ -30,6 +31,7 @@
#include "port/pg_crc32c.h"
+#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
static bool
pg_crc32c_sse42_available(void)
{
@@ -62,3 +64,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
}
pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif // USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
--
2.43.0
+ __attribute__((target("sse4.2")))
These need to be surrounded with
#if defined(__has_attribute) && __has_attribute (target)
so that we still attempt the check on compilers that don't support it
(e.g., MSVC).
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
IIUC this means we will never set USE_SSE42_CRC32C_WITH_RUNTIME_CHECK. To
maintain the existing behavior, I think we need to still perform two
configure tests (one with __attribute__((target(...))) and another
without), and then we can choose whether to set USE_SSE42_CRC32C or
USE_SSE42_CRC32C_WITH_RUNTIME_CHECK based on the results.
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
Can we combine these?
--
nathan
+ __attribute__((target("sse4.2")))
These need to be surrounded with
#if defined(__has_attribute) && __has_attribute (target)
so that we still attempt the check on compilers that don't support it (e.g., MSVC).
I assumed configure was only used on linux and ignored this check. Doesn't hurt to add this I suppose.
# Check for Intel SSE 4.2 intrinsics to do CRC calculations. # -# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used -# with the default compiler flags. If not, check if adding the -msse4.2 -# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required. +# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used # +with the __attribute__((target("sse4.2"))). PGAC_SSE42_CRC32_INTRINSICS([]) -if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then - PGAC_SSE42_CRC32_INTRINSICS([-msse4.2]) -fiIIUC this means we will never set USE_SSE42_CRC32C_WITH_RUNTIME_CHECK.
I don't think so. USE_SSE42_CRC32C additionally requires SSE4_2_TARGETED to be true which will only happen when explicitly built with -msse4.2. When this explicit compiler flag is missing, we set USE_SSE42_CRC32C_WITH_RUNTIME_CHECK to true. This logic is further down in the configure.ac file:
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
USE_SSE42_CRC32C=1
else
To maintain the existing behavior, I think we need to still perform two configure
tests (one with __attribute__((target(...))) and another without), and then we can
choose whether to set USE_SSE42_CRC32C or
USE_SSE42_CRC32C_WITH_RUNTIME_CHECK based on the results.+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',Can we combine these?
Knowing the AVX-512 will come next, I think it makes sense to keep the runtime choose function separate. Otherwise this gets polluted with runtime choose function, sse42 algorithm and the avx512 algorithm in the next patch. Does that make sense?
Raghuveer
On Thu, Nov 07, 2024 at 09:30:32PM +0000, Devulapalli, Raghuveer wrote:
# Check for Intel SSE 4.2 intrinsics to do CRC calculations. # -# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used -# with the default compiler flags. If not, check if adding the -msse4.2 -# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required. +# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used # +with the __attribute__((target("sse4.2"))). PGAC_SSE42_CRC32_INTRINSICS([]) -if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then - PGAC_SSE42_CRC32_INTRINSICS([-msse4.2]) -fiIIUC this means we will never set USE_SSE42_CRC32C_WITH_RUNTIME_CHECK.
I don't think so. USE_SSE42_CRC32C additionally requires SSE4_2_TARGETED
to be true which will only happen when explicitly built with -msse4.2.
When this explicit compiler flag is missing, we set
USE_SSE42_CRC32C_WITH_RUNTIME_CHECK to true. This logic is further down
in the configure.ac file:if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
USE_SSE42_CRC32C=1
else
Oh, you are right, sorry.
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',Can we combine these?
Knowing the AVX-512 will come next, I think it makes sense to keep the
runtime choose function separate. Otherwise this gets polluted with
runtime choose function, sse42 algorithm and the avx512 algorithm in the
next patch. Does that make sense??
Is the idea that we will put both "choose" functions in one file and the
actual CRC-32C code in another? I'm okay with that.
--
nathan
Here is the updated patch with the #if defined(__has_attribute) && __has_attribute (target) guard.
Oh, you are right, sorry.
No worries! 😊
Is the idea that we will put both "choose" functions in one file and the actual
CRC-32C code in another? I'm okay with that.
Yup, there can only be one choose function that has to pick between avx512, sse42 and scalar code.
Raghuveer
Attachments:
v2-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchapplication/octet-stream; name=v2-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchDownload
From 81a6715c63ef18cd879166ac840e23652ac5871e Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 7 Nov 2024 12:16:58 -0800
Subject: [PATCH v2] Use __attribute__((target(sse4.2))) for SSE42 CRC32C
Following commit f78667b, this commit builds the SSE4.2 CRC32C code using
function attribute target instead of extra compiler flags.
---
config/c-compiler.m4 | 30 +++++------
configure | 84 +++++++++----------------------
configure.ac | 8 +--
meson.build | 14 ++++--
src/port/Makefile | 5 --
src/port/meson.build | 10 ++--
src/port/pg_crc32c_sse42.c | 4 ++
src/port/pg_crc32c_sse42_choose.c | 4 ++
8 files changed, 65 insertions(+), 94 deletions(-)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index c7eb896f14..113a5b22a1 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -608,27 +608,29 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
- [unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with function attribute], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }],
+ [return crc32_sse42_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_CRC="$1"
pgac_sse42_crc32_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
-
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
# Check if the compiler supports the CRC32C instructions using the __crc32cb,
diff --git a/configure b/configure
index 3a7332f834..bfdb46c9dc 100755
--- a/configure
+++ b/configure
@@ -17368,87 +17368,49 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics_=yes
-else
- pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
- CFLAGS_CRC=""
- pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }
int
main ()
{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
+return crc32_sse42_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+ pgac_cv_sse42_crc32_intrinsics=yes
else
- pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+ pgac_cv_sse42_crc32_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
- CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
pgac_sse42_crc32_intrinsics=yes
fi
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/configure.ac b/configure.ac
index e7f4f0fc22..bb6dea7b1b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2066,13 +2066,9 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/meson.build b/meson.build
index 9eddd72a27..671ac7052b 100644
--- a/meson.build
+++ b/meson.build
@@ -2234,9 +2234,13 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
have_optimized_crc = true
else
- prog = '''
+ sse42_crc_prog = '''
#include <nmmintrin.h>
-
+#ifdef TEST_SSE42_WITH_ATTRIBUTE
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
+#endif
int main(void)
{
unsigned int crc = 0;
@@ -2247,13 +2251,13 @@ int main(void)
}
'''
- if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+ if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
args: test_c_args)
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
- elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
- args: test_c_args + ['-msse4.2'])
+ elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
+ args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
cflags_crc += '-msse4.2'
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
# all versions of pg_crc32c_armv8.o need CFLAGS_CRC
pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..c727fc60ae 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -23,6 +23,13 @@ pgport_sources = [
'tar.c',
]
+if host_cpu == 'x86' or host_cpu == 'x86_64'
+ pgport_sources += files(
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
+ )
+endif
+
if host_system == 'windows'
pgport_sources += files(
'dirmod.c',
@@ -81,9 +88,6 @@ endif
# is true
replace_funcs_pos = [
# x86/x64
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
# arm / aarch64
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..52993c5157 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -18,7 +18,10 @@
#include "port/pg_crc32c.h"
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
@@ -67,3 +70,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
return crc;
}
+#endif // SSE42_CRC
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 56d600f3a9..3ff715da63 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -18,6 +18,7 @@
*-------------------------------------------------------------------------
*/
+
#include "c.h"
#ifdef HAVE__GET_CPUID
@@ -30,6 +31,7 @@
#include "port/pg_crc32c.h"
+#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
static bool
pg_crc32c_sse42_available(void)
{
@@ -62,3 +64,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
}
pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif // USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
--
2.43.0
- prog = '''
+ sse42_crc_prog = '''
nitpick: Why does this need to be renamed?
+#ifdef TEST_SSE42_WITH_ATTRIBUTE
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
+#endif
So, for meson builds, we do test with and without
__attribute___((target(..."))), but for autoconf builds, we check for
__SSE4_2__ to determine whether we need a runtime check. This difference
isn't the fault of your patch, but it's a little odd. That being said, I'm
not sure there's a problem with either approach.
+if host_cpu == 'x86' or host_cpu == 'x86_64'
+ pgport_sources += files(
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
+ )
+endi
We could probably just build these unconditionally (i.e., put them in the
first list of pgport_sources in this file) instead of keeping this
complexity in the build scripts.
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
Can we move this to just below #include "c.h"? The reason I didn't do this
for the AVX-512 stuff is because TRY_POPCNT_FAST is defined in
pg_bitutils.h, but looking again, I may be able to at least move the check
for USE_AVX512_POPCNT_WITH_RUNTIME_CHECK up similarly.
--
nathan
- prog = ''' + sse42_crc_prog = '''nitpick: Why does this need to be renamed?
Doesn't need to be renamed, but just a better name to describe that the code is meant for. It will be followed up with avx512_crc_prog in the next patch.
+#ifdef TEST_SSE42_WITH_ATTRIBUTE +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("sse4.2"))) +#endif +#endifSo, for meson builds, we do test with and without __attribute___((target(..."))),
but for autoconf builds, we check for __SSE4_2__ to determine whether we need
a runtime check. This difference isn't the fault of your patch, but it's a little odd.
That being said, I'm not sure there's a problem with either approach.
I just realized, isn't this a problem on MSVC? When building with MSVC, USE_SSE42_CRC32C is always set to true (because MSVC doesn't require a specific SSE42 flag to build with): see https://gcc.godbolt.org/z/eoc1Ec33j. This means it is always running the SSE42 without a runtime check which can technically SEGILL on a really old CPU (SSE42 is nearly 18 years old though). This problem exists in the master branch too.
+if host_cpu == 'x86' or host_cpu == 'x86_64' + pgport_sources += files( + 'pg_crc32c_sse42_choose.c', + 'pg_crc32c_sse42.c', + ) +endiWe could probably just build these unconditionally (i.e., put them in the first list of
pgport_sources in this file) instead of keeping this complexity in the build scripts.
Sure.
+#if defined(USE_SSE42_CRC32C) || +defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)Can we move this to just below #include "c.h"?
Sounds good.
Raghuveer
On Fri, Nov 08, 2024 at 05:52:23PM +0000, Devulapalli, Raghuveer wrote:
So, for meson builds, we do test with and without __attribute___((target(..."))),
but for autoconf builds, we check for __SSE4_2__ to determine whether we need
a runtime check. This difference isn't the fault of your patch, but it's a little odd.
That being said, I'm not sure there's a problem with either approach.I just realized, isn't this a problem on MSVC? When building with MSVC,
USE_SSE42_CRC32C is always set to true (because MSVC doesn't require a
specific SSE42 flag to build with): see
https://gcc.godbolt.org/z/eoc1Ec33j. This means it is always running the
SSE42 without a runtime check which can technically SEGILL on a really
old CPU (SSE42 is nearly 18 years old though). This problem exists in the
master branch too.
I believe we expect MSVC builds to use meson at this point, which is
probably why there's this extra check:
if cc.get_id() == 'msvc'
cdata.set('USE_SSE42_CRC32C', false)
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = true
There used to be special MSVC scripts (removed in commit 1301c80), and it
looks like those just set USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
unconditionally, too.
--
nathan
I believe we expect MSVC builds to use meson at this point, which is probably
why there's this extra check:if cc.get_id() == 'msvc'
cdata.set('USE_SSE42_CRC32C', false)
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = true
Ah, I missed this. This makes sense.
V3: With the suggested changes.
Raghuveer
Show quoted text
-----Original Message-----
From: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
Sent: Friday, November 8, 2024 10:43 AM
To: Nathan Bossart <nathandbossart@gmail.com>
Cc: pgsql-hackers@lists.postgresql.org; Shankaran, Akash
<akash.shankaran@intel.com>
Subject: RE: Use __attribute__((target(sse4.2))) for SSE42 CRC32CI believe we expect MSVC builds to use meson at this point, which is
probably why there's this extra check:if cc.get_id() == 'msvc'
cdata.set('USE_SSE42_CRC32C', false)
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = trueAh, I missed this. This makes sense.
Attachments:
v3-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchapplication/octet-stream; name=v3-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchDownload
From 6d3277787a6d9d6a626d87cfa37d05412bf8ef32 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 7 Nov 2024 12:16:58 -0800
Subject: [PATCH v3] Use __attribute__((target(sse4.2))) for SSE42 CRC32C
Following commit f78667b, this commit builds the SSE4.2 CRC32C code using
function attribute target instead of extra compiler flags.
---
config/c-compiler.m4 | 30 +++++------
configure | 84 +++++++++----------------------
configure.ac | 8 +--
meson.build | 14 ++++--
src/port/Makefile | 5 --
src/port/meson.build | 5 +-
src/port/pg_crc32c_sse42.c | 5 ++
src/port/pg_crc32c_sse42_choose.c | 5 ++
8 files changed, 62 insertions(+), 94 deletions(-)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index c7eb896f14..113a5b22a1 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -608,27 +608,29 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
- [unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with function attribute], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }],
+ [return crc32_sse42_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_CRC="$1"
pgac_sse42_crc32_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
-
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
# Check if the compiler supports the CRC32C instructions using the __crc32cb,
diff --git a/configure b/configure
index 3a7332f834..bfdb46c9dc 100755
--- a/configure
+++ b/configure
@@ -17368,87 +17368,49 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics_=yes
-else
- pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
- CFLAGS_CRC=""
- pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }
int
main ()
{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
+return crc32_sse42_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+ pgac_cv_sse42_crc32_intrinsics=yes
else
- pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+ pgac_cv_sse42_crc32_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
- CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
pgac_sse42_crc32_intrinsics=yes
fi
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/configure.ac b/configure.ac
index e7f4f0fc22..bb6dea7b1b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2066,13 +2066,9 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/meson.build b/meson.build
index 9eddd72a27..671ac7052b 100644
--- a/meson.build
+++ b/meson.build
@@ -2234,9 +2234,13 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
have_optimized_crc = true
else
- prog = '''
+ sse42_crc_prog = '''
#include <nmmintrin.h>
-
+#ifdef TEST_SSE42_WITH_ATTRIBUTE
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
+#endif
int main(void)
{
unsigned int crc = 0;
@@ -2247,13 +2251,13 @@ int main(void)
}
'''
- if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+ if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
args: test_c_args)
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
- elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
- args: test_c_args + ['-msse4.2'])
+ elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
+ args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
cflags_crc += '-msse4.2'
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
# all versions of pg_crc32c_armv8.o need CFLAGS_CRC
pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..37d12cbd8f 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -8,6 +8,8 @@ pgport_sources = [
'path.c',
'pg_bitutils.c',
'pg_popcount_avx512.c',
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
'pg_strong_random.c',
'pgcheckdir.c',
'pgmkdirp.c',
@@ -81,9 +83,6 @@ endif
# is true
replace_funcs_pos = [
# x86/x64
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
# arm / aarch64
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..37693eb516 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -14,11 +14,15 @@
*/
#include "c.h"
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
#include <nmmintrin.h>
#include "port/pg_crc32c.h"
+
pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
@@ -67,3 +71,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
return crc;
}
+#endif // SSE42_CRC
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 56d600f3a9..50ae82b312 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -18,8 +18,11 @@
*-------------------------------------------------------------------------
*/
+
#include "c.h"
+#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
#ifdef HAVE__GET_CPUID
#include <cpuid.h>
#endif
@@ -62,3 +65,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
}
pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif // USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
--
2.43.0
Anymore feedback on this patch? Hoping this is a straightforward one.
Raghuveer
Show quoted text
-----Original Message-----
From: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
Sent: Friday, November 8, 2024 11:05 AM
To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>; Nathan Bossart
<nathandbossart@gmail.com>
Cc: pgsql-hackers@lists.postgresql.org; Shankaran, Akash
<akash.shankaran@intel.com>
Subject: RE: Use __attribute__((target(sse4.2))) for SSE42 CRC32CV3: With the suggested changes.
Raghuveer
-----Original Message-----
From: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
Sent: Friday, November 8, 2024 10:43 AM
To: Nathan Bossart <nathandbossart@gmail.com>
Cc: pgsql-hackers@lists.postgresql.org; Shankaran, Akash
<akash.shankaran@intel.com>
Subject: RE: Use __attribute__((target(sse4.2))) for SSE42 CRC32CI believe we expect MSVC builds to use meson at this point, which is
probably why there's this extra check:if cc.get_id() == 'msvc'
cdata.set('USE_SSE42_CRC32C', false)
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = trueAh, I missed this. This makes sense.
On Wed, Nov 20, 2024 at 05:37:33PM +0000, Devulapalli, Raghuveer wrote:
Anymore feedback on this patch? Hoping this is a straightforward one.
I think it is in pretty good shape. After a read-through, the only thing
that stands out to me is the difference in configure tests between autoconf
and meson. I think we should consider picking one approach, although I'm
not sure I have a strong preference.
--
nathan
I think it is in pretty good shape. After a read-through, the only thing that stands
out to me is the difference in configure tests between autoconf and meson. I
think we should consider picking one approach, although I'm not sure I have a
strong preference.
Thanks for the review! We can actually leverage meson's built in option to check for a macro: cc.get_define('__SSE4_2__') != ''. This should keep the logic consistent across configure and meson.
Attachments:
v4-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchapplication/octet-stream; name=v4-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchDownload
From 6d3277787a6d9d6a626d87cfa37d05412bf8ef32 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 7 Nov 2024 12:16:58 -0800
Subject: [PATCH v4 1/2] Use __attribute__((target(sse4.2))) for SSE42 CRC32C
Following commit f78667b, this commit builds the SSE4.2 CRC32C code using
function attribute target instead of extra compiler flags.
---
config/c-compiler.m4 | 30 +++++------
configure | 84 +++++++++----------------------
configure.ac | 8 +--
meson.build | 14 ++++--
src/port/Makefile | 5 --
src/port/meson.build | 5 +-
src/port/pg_crc32c_sse42.c | 5 ++
src/port/pg_crc32c_sse42_choose.c | 5 ++
8 files changed, 62 insertions(+), 94 deletions(-)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index c7eb896f14..113a5b22a1 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -608,27 +608,29 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
- [unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with function attribute], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }],
+ [return crc32_sse42_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_CRC="$1"
pgac_sse42_crc32_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
-
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
# Check if the compiler supports the CRC32C instructions using the __crc32cb,
diff --git a/configure b/configure
index 3a7332f834..bfdb46c9dc 100755
--- a/configure
+++ b/configure
@@ -17368,87 +17368,49 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics_=yes
-else
- pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
- CFLAGS_CRC=""
- pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }
int
main ()
{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
+return crc32_sse42_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+ pgac_cv_sse42_crc32_intrinsics=yes
else
- pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+ pgac_cv_sse42_crc32_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
- CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
pgac_sse42_crc32_intrinsics=yes
fi
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/configure.ac b/configure.ac
index e7f4f0fc22..bb6dea7b1b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2066,13 +2066,9 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/meson.build b/meson.build
index 9eddd72a27..671ac7052b 100644
--- a/meson.build
+++ b/meson.build
@@ -2234,9 +2234,13 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
have_optimized_crc = true
else
- prog = '''
+ sse42_crc_prog = '''
#include <nmmintrin.h>
-
+#ifdef TEST_SSE42_WITH_ATTRIBUTE
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
+#endif
int main(void)
{
unsigned int crc = 0;
@@ -2247,13 +2251,13 @@ int main(void)
}
'''
- if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+ if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
args: test_c_args)
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
- elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
- args: test_c_args + ['-msse4.2'])
+ elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
+ args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
cflags_crc += '-msse4.2'
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
# all versions of pg_crc32c_armv8.o need CFLAGS_CRC
pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..37d12cbd8f 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -8,6 +8,8 @@ pgport_sources = [
'path.c',
'pg_bitutils.c',
'pg_popcount_avx512.c',
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
'pg_strong_random.c',
'pgcheckdir.c',
'pgmkdirp.c',
@@ -81,9 +83,6 @@ endif
# is true
replace_funcs_pos = [
# x86/x64
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
# arm / aarch64
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..37693eb516 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -14,11 +14,15 @@
*/
#include "c.h"
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
#include <nmmintrin.h>
#include "port/pg_crc32c.h"
+
pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
@@ -67,3 +71,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
return crc;
}
+#endif // SSE42_CRC
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 56d600f3a9..50ae82b312 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -18,8 +18,11 @@
*-------------------------------------------------------------------------
*/
+
#include "c.h"
+#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
#ifdef HAVE__GET_CPUID
#include <cpuid.h>
#endif
@@ -62,3 +65,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
}
pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif // USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
--
2.43.0
v4-0002-Use-consistent-logic-in-configure-and-meson-to-ge.patchapplication/octet-stream; name=v4-0002-Use-consistent-logic-in-configure-and-meson-to-ge.patchDownload
From 6fc3adb6e496c3186c7bda7251060e15a8e0946a Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 21 Nov 2024 14:59:29 -0800
Subject: [PATCH v4 2/2] Use consistent logic in configure and meson to
generate USE_SSE42_CRC32C
---
meson.build | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/meson.build b/meson.build
index 671ac7052b..5f199b4b7c 100644
--- a/meson.build
+++ b/meson.build
@@ -2236,11 +2236,9 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
sse42_crc_prog = '''
#include <nmmintrin.h>
-#ifdef TEST_SSE42_WITH_ATTRIBUTE
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("sse4.2")))
#endif
-#endif
int main(void)
{
unsigned int crc = 0;
@@ -2251,13 +2249,12 @@ int main(void)
}
'''
- if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
- args: test_c_args)
+ if cc.get_define('__SSE4_2__') != ''
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
- args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
+ args: test_c_args)
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
cflags_crc += '-msse4.2'
--
2.43.0
On Thu, Nov 21, 2024 at 11:07:59PM +0000, Devulapalli, Raghuveer wrote:
Thanks for the review! We can actually leverage meson's built in option
to check for a macro: cc.get_define('__SSE4_2__') != ''. This should keep
the logic consistent across configure and meson.
I think we should still use the test program even when __SSE4_2__ is
defined, but we can use that macro to determine whether to use a runtime
check. I think that would keep autoconf and meson consistent.
--
nathan
I think we should still use the test program even when __SSE4_2__ is defined, but
we can use that macro to determine whether to use a runtime check. I think that
would keep autoconf and meson consistent.
Sure. Updated patch.
Attachments:
v5-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchapplication/octet-stream; name=v5-0001-Use-__attribute__-target-sse4.2-for-SSE42-CRC32C.patchDownload
From 6d3277787a6d9d6a626d87cfa37d05412bf8ef32 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 7 Nov 2024 12:16:58 -0800
Subject: [PATCH v5 1/2] Use __attribute__((target(sse4.2))) for SSE42 CRC32C
Following commit f78667b, this commit builds the SSE4.2 CRC32C code using
function attribute target instead of extra compiler flags.
---
config/c-compiler.m4 | 30 +++++------
configure | 84 +++++++++----------------------
configure.ac | 8 +--
meson.build | 14 ++++--
src/port/Makefile | 5 --
src/port/meson.build | 5 +-
src/port/pg_crc32c_sse42.c | 5 ++
src/port/pg_crc32c_sse42_choose.c | 5 ++
8 files changed, 62 insertions(+), 94 deletions(-)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index c7eb896f14..113a5b22a1 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -608,27 +608,29 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
- [unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with function attribute], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }],
+ [return crc32_sse42_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_CRC="$1"
pgac_sse42_crc32_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
-
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
# Check if the compiler supports the CRC32C instructions using the __crc32cb,
diff --git a/configure b/configure
index 3a7332f834..bfdb46c9dc 100755
--- a/configure
+++ b/configure
@@ -17368,87 +17368,49 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics_=yes
-else
- pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
- CFLAGS_CRC=""
- pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }
int
main ()
{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
+return crc32_sse42_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+ pgac_cv_sse42_crc32_intrinsics=yes
else
- pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+ pgac_cv_sse42_crc32_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
- CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
pgac_sse42_crc32_intrinsics=yes
fi
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/configure.ac b/configure.ac
index e7f4f0fc22..bb6dea7b1b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2066,13 +2066,9 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
+# Check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# with the __attribute__((target("sse4.2"))).
PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
diff --git a/meson.build b/meson.build
index 9eddd72a27..671ac7052b 100644
--- a/meson.build
+++ b/meson.build
@@ -2234,9 +2234,13 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
have_optimized_crc = true
else
- prog = '''
+ sse42_crc_prog = '''
#include <nmmintrin.h>
-
+#ifdef TEST_SSE42_WITH_ATTRIBUTE
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
+#endif
int main(void)
{
unsigned int crc = 0;
@@ -2247,13 +2251,13 @@ int main(void)
}
'''
- if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+ if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
args: test_c_args)
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
- elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
- args: test_c_args + ['-msse4.2'])
+ elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
+ args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
cflags_crc += '-msse4.2'
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
# all versions of pg_crc32c_armv8.o need CFLAGS_CRC
pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..37d12cbd8f 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -8,6 +8,8 @@ pgport_sources = [
'path.c',
'pg_bitutils.c',
'pg_popcount_avx512.c',
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
'pg_strong_random.c',
'pgcheckdir.c',
'pgmkdirp.c',
@@ -81,9 +83,6 @@ endif
# is true
replace_funcs_pos = [
# x86/x64
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
# arm / aarch64
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..37693eb516 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -14,11 +14,15 @@
*/
#include "c.h"
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
#include <nmmintrin.h>
#include "port/pg_crc32c.h"
+
pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
@@ -67,3 +71,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
return crc;
}
+#endif // SSE42_CRC
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 56d600f3a9..50ae82b312 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -18,8 +18,11 @@
*-------------------------------------------------------------------------
*/
+
#include "c.h"
+#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
#ifdef HAVE__GET_CPUID
#include <cpuid.h>
#endif
@@ -62,3 +65,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
}
pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif // USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
--
2.43.0
v5-0002-Use-consistent-logic-in-configure-and-meson-to-ge.patchapplication/octet-stream; name=v5-0002-Use-consistent-logic-in-configure-and-meson-to-ge.patchDownload
From f141ad84cdda8758d60dd5655024e9d2be662da7 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 21 Nov 2024 14:59:29 -0800
Subject: [PATCH v5 2/2] Use consistent logic in configure and meson to
generate USE_SSE42_CRC32C
---
meson.build | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/meson.build b/meson.build
index 671ac7052b..c04c250631 100644
--- a/meson.build
+++ b/meson.build
@@ -2236,11 +2236,9 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
sse42_crc_prog = '''
#include <nmmintrin.h>
-#ifdef TEST_SSE42_WITH_ATTRIBUTE
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("sse4.2")))
#endif
-#endif
int main(void)
{
unsigned int crc = 0;
@@ -2251,16 +2249,16 @@ int main(void)
}
'''
- if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
- args: test_c_args)
+ canbuild_sse42_crc = cc.links(sse42_crc_prog,
+ name: 'Compile CRC32 SSE4.2 intrinsics with function attribute',
+ args: test_c_args)
+ if (cc.get_define('__SSE4_2__') != '' and canbuild_sse42_crc)
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
- elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
- args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
+ elif canbuild_sse42_crc
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
- cflags_crc += '-msse4.2'
cdata.set('USE_SSE42_CRC32C', false)
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = true
--
2.43.0
On Fri, Nov 22, 2024 at 09:00:01PM +0000, Devulapalli, Raghuveer wrote:
Sure. Updated patch.
Thanks. I think my only remaining feedback is that we should probably add
a comment to explain why we aren't doing this for ARM yet [0]/messages/by-id/ZwXsE0KG_wh6_heU@nathan.
[0]: /messages/by-id/ZwXsE0KG_wh6_heU@nathan
--
nathan
Thanks. I think my only remaining feedback is that we should probably add a
comment to explain why we aren't doing this for ARM yet [0].
Sounds good. Where would you like me to add this comment? Meson.build and configure?
On Mon, Nov 25, 2024 at 11:28:52PM +0000, Devulapalli, Raghuveer wrote:
Thanks. I think my only remaining feedback is that we should probably add a
comment to explain why we aren't doing this for ARM yet [0].Sounds good. Where would you like me to add this comment? Meson.build and
configure?
I'll take care of it.
--
nathan
Here's what I have staged for commit (except for the commit message, which
needs some work).
--
nathan
Attachments:
v6-0001-Use-__attribute__-target-.-for-SSE4.2-support.patchtext/plain; charset=us-asciiDownload
From c405f5a41cca9ce5de9c0e69e00d11b16ba93986 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Tue, 26 Nov 2024 14:17:03 -0600
Subject: [PATCH v6 1/1] Use __attribute__((target(...))) for SSE4.2 support.
Author: Raghuveer Devulapalli
Discussion: https://postgr.es/m/PH8PR11MB8286BE735A463468415D46B5FB5C2%40PH8PR11MB8286.namprd11.prod.outlook.com
---
config/c-compiler.m4 | 32 +++++++------
configure | 93 ++++++++++++--------------------------
configure.ac | 19 ++++----
meson.build | 22 ++++++---
src/port/Makefile | 5 --
src/port/meson.build | 2 +-
src/port/pg_crc32c_sse42.c | 1 +
7 files changed, 72 insertions(+), 102 deletions(-)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index a129edb88e..309d5b04b4 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -605,24 +605,26 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
# the other ones are, on x86-64 platforms)
#
-# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
-# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
+# If the intrinsics are supported, sets pgac_sse42_crc32_intrinsics.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
- [unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }],
+ [return crc32_sse42_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_CRC="$1"
pgac_sse42_crc32_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
diff --git a/configure b/configure
index 199d666aa7..cb89866649 100755
--- a/configure
+++ b/configure
@@ -17375,87 +17375,47 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics_=yes
-else
- pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
- CFLAGS_CRC=""
- pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <nmmintrin.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("sse4.2")))
+ #endif
+ static int crc32_sse42_test(void)
+ {
+ unsigned int crc = 0;
+ crc = _mm_crc32_u8(crc, 0);
+ crc = _mm_crc32_u32(crc, 0);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ }
int
main ()
{
-unsigned int crc = 0;
- crc = _mm_crc32_u8(crc, 0);
- crc = _mm_crc32_u32(crc, 0);
- /* return computed value, to prevent the above being optimized away */
- return crc == 0;
+return crc32_sse42_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+ pgac_cv_sse42_crc32_intrinsics=yes
else
- pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+ pgac_cv_sse42_crc32_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
- CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
pgac_sse42_crc32_intrinsics=yes
fi
-fi
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
@@ -17658,15 +17618,20 @@ fi
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
# use the special CRC instructions for calculating CRC-32C. If we're not
# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
-# implementations and select which one to use at runtime, depending on whether
-# SSE 4.2 is supported by the processor we're running on.
+# the SSE intrinsics, compile both implementations and select which one to use
+# at runtime, depending on whether SSE 4.2 is supported by the processor we're
+# running on.
#
# Similarly, if we are targeting an ARM processor that has the CRC
# instructions that are part of the ARMv8 CRC Extension, use them. And if
# we're not targeting such a processor, but can nevertheless produce code that
# uses the CRC instructions, compile both, and select at runtime.
#
+# Note that we do not use __attribute__((target("..."))) for the ARM CRC
+# instructions because until Clang 16.0.0, using the intrinsics still requires
+# special -march flags. Perhaps we can re-evaluate this decision after some
+# time has passed.
+#
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
# in the template or configure command line.
#
diff --git a/configure.ac b/configure.ac
index 4f56bb5062..5785600faf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2069,13 +2069,7 @@ fi
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
+PGAC_SSE42_CRC32_INTRINSICS()
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
@@ -2112,15 +2106,20 @@ AC_SUBST(CFLAGS_CRC)
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
# use the special CRC instructions for calculating CRC-32C. If we're not
# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
-# implementations and select which one to use at runtime, depending on whether
-# SSE 4.2 is supported by the processor we're running on.
+# the SSE intrinsics, compile both implementations and select which one to use
+# at runtime, depending on whether SSE 4.2 is supported by the processor we're
+# running on.
#
# Similarly, if we are targeting an ARM processor that has the CRC
# instructions that are part of the ARMv8 CRC Extension, use them. And if
# we're not targeting such a processor, but can nevertheless produce code that
# uses the CRC instructions, compile both, and select at runtime.
#
+# Note that we do not use __attribute__((target("..."))) for the ARM CRC
+# instructions because until Clang 16.0.0, using the intrinsics still requires
+# special -march flags. Perhaps we can re-evaluate this decision after some
+# time has passed.
+#
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
# in the template or configure command line.
#
diff --git a/meson.build b/meson.build
index 83e61d0f4a..d6dc8c8525 100644
--- a/meson.build
+++ b/meson.build
@@ -2211,14 +2211,19 @@ endif
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
# use the special CRC instructions for calculating CRC-32C. If we're not
# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
-# implementations and select which one to use at runtime, depending on whether
-# SSE 4.2 is supported by the processor we're running on.
+# the SSE intrinsics, compile both implementations and select which one to use
+# at runtime, depending on whether SSE 4.2 is supported by the processor we're
+# running on.
#
# Similarly, if we are targeting an ARM processor that has the CRC
# instructions that are part of the ARMv8 CRC Extension, use them. And if
# we're not targeting such a processor, but can nevertheless produce code that
# uses the CRC instructions, compile both, and select at runtime.
+#
+# Note that we do not use __attribute__((target("..."))) for the ARM CRC
+# instructions because until Clang 16.0.0, using the intrinsics still requires
+# special -march flags. Perhaps we can re-evaluate this decision after some
+# time has passed.
###############################################################
have_optimized_crc = false
@@ -2234,6 +2239,9 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
prog = '''
#include <nmmintrin.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
int main(void)
{
unsigned int crc = 0;
@@ -2244,16 +2252,16 @@ int main(void)
}
'''
- if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+ if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32',
args: test_c_args)
+ # Do not use Intel SSE 4.2
+ elif (cc.get_define('__SSE4_2__') != '')
# Use Intel SSE 4.2 unconditionally.
cdata.set('USE_SSE42_CRC32C', 1)
have_optimized_crc = true
- elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
- args: test_c_args + ['-msse4.2'])
+ else
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
# the runtime check.
- cflags_crc += '-msse4.2'
cdata.set('USE_SSE42_CRC32C', false)
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = true
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
# all versions of pg_crc32c_armv8.o need CFLAGS_CRC
pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..c5bceed9cd 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -82,7 +82,7 @@ endif
replace_funcs_pos = [
# x86/x64
['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..dcc4904a82 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -19,6 +19,7 @@
#include "port/pg_crc32c.h"
pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
--
2.39.5 (Apple Git-154)
Here's what I have staged for commit (except for the commit message, which
needs some work).
LGTM.
Raghuveer