From 8ea529dd315723ca3e8ad4243853148da23f1202 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Sun, 31 Mar 2024 22:22:15 -0500
Subject: [PATCH v19 4/4] optimize visibilitymap_count() with AVX512

---
 src/backend/access/heap/visibilitymap.c |  25 ++----
 src/include/port/pg_bitutils.h          |   6 +-
 src/port/pg_bitutils.c                  | 113 ++++++++++++++++++++++++
 src/port/pg_popcount_avx512.c           |  21 +++++
 4 files changed, 144 insertions(+), 21 deletions(-)

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 1ab6c865e3..8b24e7bc33 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -119,10 +119,8 @@
 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
 /* Masks for counting subsets of bits in the visibility map. */
-#define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
-														 * bit pair */
-#define FROZEN_MASK64	UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
-														 * bit pair */
+#define VISIBLE_MASK8	(0x55)	/* The lower bit of each bit pair */
+#define FROZEN_MASK8	(0xaa)	/* The upper bit of each bit pair */
 
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 	{
 		Buffer		mapBuffer;
 		uint64	   *map;
-		int			i;
 
 		/*
 		 * Read till we fall off the end of the map.  We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 		 */
 		map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 
-		StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
-						 "unsupported MAPSIZE");
-		if (all_frozen == NULL)
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-		}
-		else
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-			{
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-				nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
-			}
-		}
+		nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
+		if (all_frozen)
+			nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
 
 		ReleaseBuffer(mapBuffer);
 	}
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 1a92c56bcd..16145c746e 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -303,9 +303,11 @@ pg_ceil_log2_64(uint64 num)
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked) (const char *buf, int bytes, bits8 mask);
 
-/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+/* Exported for use in the AVX512 implementation. */
 extern uint64 pg_popcount_fast(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
 
 /*
  * We can also try to use the AVX512 popcount instruction on some systems.
@@ -317,6 +319,7 @@ extern uint64 pg_popcount_fast(const char *buf, int bytes);
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 extern bool pg_popcount_avx512_available(void);
 extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
 #else
@@ -324,6 +327,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern int	pg_popcount32(uint32 word);
 extern int	pg_popcount64(uint64 word);
 extern uint64 pg_popcount(const char *buf, int bytes);
+extern uint64 pg_popcount_masked(const char *buf, int bytes);
 
 #endif							/* TRY_POPCNT_FAST */
 
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 177509518f..902ecdebbf 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -106,18 +106,21 @@ const uint8 pg_number_of_ones[256] = {
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
+static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64		(*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
+uint64		(*pg_popcount_masked) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 #endif							/* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -155,9 +158,13 @@ choose_popcount_functions(void)
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
 		pg_popcount = pg_popcount_fast;
+		pg_popcount_masked = pg_popcount_masked_fast;
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 		if (pg_popcount_avx512_available())
+		{
 			pg_popcount = pg_popcount_avx512;
+			pg_popcount_masked = pg_popcount_masked_avx512;
+		}
 #endif
 	}
 	else
@@ -165,6 +172,7 @@ choose_popcount_functions(void)
 		pg_popcount32 = pg_popcount32_slow;
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount = pg_popcount_slow;
+		pg_popcount_masked = pg_popcount_masked_slow;
 	}
 }
 
@@ -189,6 +197,13 @@ pg_popcount_choose(const char *buf, int bytes)
 	return pg_popcount(buf, bytes);
 }
 
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+	choose_popcount_functions();
+	return pg_popcount_masked(buf, bytes, mask);
+}
+
 /*
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
@@ -269,6 +284,52 @@ pg_popcount_fast(const char *buf, int bytes)
 	return popcnt;
 }
 
+uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_fast(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~0 / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_fast(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv];
+
+	return popcnt;
+}
+
 #endif							/* TRY_POPCNT_FAST */
 
 
@@ -368,6 +429,52 @@ pg_popcount_slow(const char *buf, int bytes)
 	return popcnt;
 }
 
+static uint64
+pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_slow(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~0 / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_slow(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv];
+
+	return popcnt;
+}
+
 #ifndef TRY_POPCNT_FAST
 
 /*
@@ -399,4 +506,10 @@ pg_popcount(const char *buf, int bytes)
 	return pg_popcount_slow(buf, bytes);
 }
 
+uint64
+pg_popcount_masked(const char *buf, int bytes, bits8 mask)
+{
+	return pg_popcount_masked_slow(buf, bytes, mask);
+}
+
 #endif							/* !TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index f86558d1ee..8965a8d530 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -46,4 +46,25 @@ pg_popcount_avx512(const char *buf, int bytes)
 	return popcnt + pg_popcount_fast(buf, bytes);
 }
 
+uint64
+pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt;
+	__m512i		accum = _mm512_setzero_si512();
+	const		__m512i maskv = _mm512_set1_epi8(mask);
+
+	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	{
+		const		__m512i val = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i vmasked = _mm512_and_si512(val, maskv);
+		const		__m512i cnt = _mm512_popcnt_epi64(vmasked);
+
+		accum = _mm512_add_epi64(accum, cnt);
+		buf += sizeof(__m512i);
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accum);
+	return popcnt + pg_popcount_masked_fast(buf, bytes, mask);
+}
+
 #endif							/* TRY_POPCNT_FAST */
-- 
2.25.1

