From 543e9f9b8b83de2429f7019bbdbc18b2509882c9 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Sun, 31 Mar 2024 22:22:15 -0500
Subject: [PATCH v22 3/3] optimize visibilitymap_count() with AVX512

---
 src/backend/access/heap/visibilitymap.c |  25 +----
 src/include/port/pg_bitutils.h          |  27 ++++-
 src/port/pg_bitutils.c                  | 143 ++++++++++++++++++++++++
 src/port/pg_popcount_avx512.c           |  25 +++++
 4 files changed, 199 insertions(+), 21 deletions(-)

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 1ab6c865e3..8b24e7bc33 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -119,10 +119,8 @@
 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
 /* Masks for counting subsets of bits in the visibility map. */
-#define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
-														 * bit pair */
-#define FROZEN_MASK64	UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
-														 * bit pair */
+#define VISIBLE_MASK8	(0x55)	/* The lower bit of each bit pair */
+#define FROZEN_MASK8	(0xaa)	/* The upper bit of each bit pair */
 
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 	{
 		Buffer		mapBuffer;
 		uint64	   *map;
-		int			i;
 
 		/*
 		 * Read till we fall off the end of the map.  We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 		 */
 		map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 
-		StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
-						 "unsupported MAPSIZE");
-		if (all_frozen == NULL)
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-		}
-		else
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-			{
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-				nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
-			}
-		}
+		nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
+		if (all_frozen)
+			nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
 
 		ReleaseBuffer(mapBuffer);
 	}
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index e4e96952b7..5d79e629c5 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -303,9 +303,11 @@ pg_ceil_log2_64(uint64 num)
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
 
-/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+/* Exported for use in the AVX512 implementation. */
 extern uint64 pg_popcount_fast(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
 
 /*
  * We can also try to use the AVX512 popcount instruction on some systems.
@@ -317,6 +319,7 @@ extern uint64 pg_popcount_fast(const char *buf, int bytes);
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 extern bool pg_popcount_avx512_available(void);
 extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
 #else
@@ -324,6 +327,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern int	pg_popcount32(uint32 word);
 extern int	pg_popcount64(uint64 word);
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
 
 #endif							/* TRY_POPCNT_FAST */
 
@@ -357,6 +361,27 @@ pg_popcount(const char *buf, int bytes)
 	return pg_popcount_optimized(buf, bytes);
 }
 
+/*
+ * Returns the number of 1-bits in buf after applying the mask to each byte.
+ *
+ * Similar to pg_popcount(), we only take on the function pointer overhead when
+ * it's likely to be faster.
+ */
+static inline uint64
+pg_popcount_masked(const char *buf, int bytes, bits8 mask)
+{
+	if (bytes < 8)
+	{
+		uint64		popcnt = 0;
+
+		while (bytes--)
+			popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+		return popcnt;
+	}
+
+	return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
 /*
  * Rotate the bits of "word" to the right/left by n bits.
  */
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 2fa16b54b8..8beb70f62b 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -106,22 +106,26 @@ const uint8 pg_number_of_ones[256] = {
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
+static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
 
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 static uint64 pg_popcount_fast_or_avx512(const char *buf, int bytes);
+static uint64 pg_popcount_masked_fast_or_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 #endif							/* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -161,7 +165,10 @@ choose_popcount_functions(void)
 		pg_popcount_optimized = pg_popcount_fast;
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 		if (pg_popcount_avx512_available())
+		{
 			pg_popcount_optimized = pg_popcount_fast_or_avx512;
+			pg_popcount_masked_optimized = pg_popcount_masked_fast_or_avx512;
+		}
 #endif
 	}
 	else
@@ -169,6 +176,7 @@ choose_popcount_functions(void)
 		pg_popcount32 = pg_popcount32_slow;
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount_optimized = pg_popcount_slow;
+		pg_popcount_masked_optimized = pg_popcount_masked_slow;
 	}
 }
 
@@ -193,6 +201,13 @@ pg_popcount_choose(const char *buf, int bytes)
 	return pg_popcount_optimized(buf, bytes);
 }
 
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+	choose_popcount_functions();
+	return pg_popcount_masked(buf, bytes, mask);
+}
+
 /*
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
@@ -291,6 +306,74 @@ pg_popcount_fast_or_avx512(const char *buf, int bytes)
 }
 #endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
 
+/*
+ * pg_popcount_masked_fast
+ *		Returns the number of 1-bits in buf after apply the mask to each byte
+ */
+uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_fast(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~0 / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_fast(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv];
+
+	return popcnt;
+}
+
+/*
+ * This is a wrapper function for pg_popcount_masked_avx512() that uses
+ * pg_popcount_masked_fast() when there aren't enough bytes to fit in an
+ * AVX-512 register.  The compiler should be able to inline
+ * pg_popcount_masked_fast() so that we only take on additional function call
+ * overhead when it's likely to be a better option.
+ */
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+static uint64
+pg_popcount_masked_fast_or_avx512(const char *buf, int bytes, bits8 mask)
+{
+	if (bytes < 64)
+		return pg_popcount_masked_fast(buf, bytes, mask);
+	else
+		return pg_popcount_masked_avx512(buf, bytes, mask);
+}
+#endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
+
 #endif							/* TRY_POPCNT_FAST */
 
 
@@ -390,6 +473,56 @@ pg_popcount_slow(const char *buf, int bytes)
 	return popcnt;
 }
 
+/*
+ * pg_popcount_masked_slow
+ *		Returns the number of 1-bits in buf after apply the mask to each byte
+ */
+static uint64
+pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_slow(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~0 / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_slow(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv];
+
+	return popcnt;
+}
+
 #ifndef TRY_POPCNT_FAST
 
 /*
@@ -421,4 +554,14 @@ pg_popcount_optimized(const char *buf, int bytes)
 	return pg_popcount_slow(buf, bytes);
 }
 
+/*
+ * pg_popcount_masked_optimized
+ *		Returns the number of 1-bits in buf after apply the mask to each byte
+ */
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+	return pg_popcount_masked_slow(buf, bytes, mask);
+}
+
 #endif							/* !TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index f86558d1ee..fb9ab3313b 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -46,4 +46,29 @@ pg_popcount_avx512(const char *buf, int bytes)
 	return popcnt + pg_popcount_fast(buf, bytes);
 }
 
+/*
+ * pg_popcount_masked_avx512
+ *		Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt;
+	__m512i		accum = _mm512_setzero_si512();
+	const		__m512i maskv = _mm512_set1_epi8(mask);
+
+	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	{
+		const		__m512i val = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i vmasked = _mm512_and_si512(val, maskv);
+		const		__m512i cnt = _mm512_popcnt_epi64(vmasked);
+
+		accum = _mm512_add_epi64(accum, cnt);
+		buf += sizeof(__m512i);
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accum);
+	return popcnt + pg_popcount_masked_fast(buf, bytes, mask);
+}
+
 #endif							/* TRY_POPCNT_FAST */
-- 
2.25.1

