diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..354ab636da 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -87,6 +87,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_bitutils.o need CFLAGS_AVX512 (AVX-512 popcount path)
+pg_bitutils.o: CFLAGS+=$(CFLAGS_AVX512)
+pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_AVX512)
+pg_bitutils_srv.o: CFLAGS+=$(CFLAGS_AVX512)
+
 # all versions of pg_crc32c_armv8.o need CFLAGS_CRC
 pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 640a89561a..a0c91273ec 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -19,6 +19,10 @@
 #include <intrin.h>
 #endif
 
+#if defined(HAVE__HW_AVX512_POPCNT)
+#include <immintrin.h>
+#endif
+
 #include "port/pg_bitutils.h"
 
 
@@ -298,6 +302,30 @@ pg_popcount(const char *buf, int bytes)
 	uint64		popcnt = 0;
 
 #if SIZEOF_VOID_P >= 8
+#if defined(HAVE__HW_AVX512_POPCNT)
+	/*
+	 * Count 64 bytes per iteration with the AVX-512 popcount instruction.
+	 * The load is the unaligned variant, so buf needs no alignment check.
+	 *
+	 * NOTE(review): this path is selected at compile time only; confirm that
+	 * HAVE__HW_AVX512_POPCNT also guarantees support on the run-time CPU.
+	 */
+	__m512i		accumulator = _mm512_setzero_si512();
+
+	while (bytes >= 64)
+	{
+		const __m512i v = _mm512_loadu_si512((const __m512i *) buf);
+		const __m512i p = _mm512_popcnt_epi64(v);
+
+		accumulator = _mm512_add_epi64(accumulator, p);
+		bytes -= 64;
+		buf += 64;
+	}
+
+	/* Sum the eight 64-bit lane counts; a tail of < 64 bytes stays in buf. */
+	popcnt = _mm512_reduce_add_epi64(accumulator);
+
+#else							/* !HAVE__HW_AVX512_POPCNT */
 	/* Process in 64-bit chunks if the buffer is aligned. */
 	if (buf == (const char *) TYPEALIGN(8, buf))
 	{
@@ -311,6 +339,7 @@ pg_popcount(const char *buf, int bytes)
 		buf = (const char *) words;
 	}
 
+#endif							/* HAVE__HW_AVX512_POPCNT */
 #else
 	/* Process in 32-bit chunks if the buffer is aligned. */
 	if (buf == (const char *) TYPEALIGN(4, buf))