call popcount32/64 directly on non-x86 platforms

Started by John Naylorover 4 years ago11 messages
#1John Naylor
john.naylor@enterprisedb.com
3 attachment(s)

Currently, all platforms must indirect through a function pointer to call
popcount on a word-sized input, even though we don't arrange for a fast
implementation on non-x86 to make it worthwhile.

0001 moves some declarations around so that "slow" popcount functions are
called directly on non-x86 platforms.

0002 was an idea to simplify and unify the coding for the slow functions.

Also attached is a test module for building microbenchmarks.

On a Power8 machine using gcc 4.8, and running
time ./inst/bin/psql -c 'select drive_popcount(100000, 1024)'

I get

master: 647ms
0001: 183ms
0002: 228ms

So 0001 is a clear winner on that platform. 0002 is still good, but slower
than 0001 for some reason, and it turns out that on master, gcc does emit a
popcnt instruction from the intrinsic:

0000000000000000 <pg_popcount32_slow>:
0: f4 02 63 7c popcntw r3,r3
4: b4 07 63 7c extsw r3,r3
8: 20 00 80 4e blr
...

The gcc docs mention a flag for this, but I'm not sure why it seems not to
need it:

https://gcc.gnu.org/onlinedocs/gcc/RS_002f6000-and-PowerPC-Options.html#RS_002f6000-and-PowerPC-Options

Maybe that's because the machine I used was ppc64le, but I'm not sure a ppc
binary built like this is portable to other hardware. For that reason,
maybe 0002 is a good idea.

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

popcount-test-module.patchapplication/x-patch; name=popcount-test-module.patchDownload
diff --git a/src/test/modules/test_popcount/Makefile b/src/test/modules/test_popcount/Makefile
new file mode 100644
index 0000000000..8a49d1ccfc
--- /dev/null
+++ b/src/test/modules/test_popcount/Makefile
@@ -0,0 +1,21 @@
+MODULE_big = test_popcount
+OBJS = test_popcount.o
+PGFILEDESC = "test"
+EXTENSION = test_popcount
+DATA = test_popcount--1.0.sql
+
+first: all
+
+# needed?
+test_popcount.o: test_popcount.c
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_popcount
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_popcount/test_popcount--1.0.sql b/src/test/modules/test_popcount/test_popcount--1.0.sql
new file mode 100644
index 0000000000..a5975e54cd
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount--1.0.sql
@@ -0,0 +1,2 @@
+CREATE FUNCTION drive_popcount  (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
+CREATE FUNCTION drive_popcount64(count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_popcount/test_popcount.c b/src/test/modules/test_popcount/test_popcount.c
new file mode 100644
index 0000000000..0030ed142f
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount.c
@@ -0,0 +1,71 @@
+/* select drive_popcount(1000000, 1024); */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+
+#include "port/pg_bitutils.h"
+
+PG_MODULE_MAGIC;
+
+#ifndef PG_POPCOUNT64
+#define PG_POPCOUNT32(word) pg_popcount32(word)
+#define PG_POPCOUNT64(word) pg_popcount64(word)
+#endif
+
+
+/*
+ * drive_popcount64(count int, num int) returns bigint
+ *
+ * num is the number of 64-bit words to use
+ */
+PG_FUNCTION_INFO_V1(drive_popcount64);
+Datum
+drive_popcount64(PG_FUNCTION_ARGS)
+{
+
+	int			count = PG_GETARG_INT32(0);
+	int			num   = PG_GETARG_INT32(1);
+
+	int			len = num * sizeof(uint64);
+	uint64 	   *words	= palloc(len);
+	int64		popcount;
+
+	for (int i = 0; i < num; i++)
+		words[i] = i;
+
+	while (count--)
+	{
+		popcount = 0;
+		for (int i = 0; i < num; i++)
+			popcount += PG_POPCOUNT64(words[i]);
+	}
+
+	PG_RETURN_INT64(popcount);
+}
+
+/*
+ * drive_popcount(count int, len int) returns bigint
+ *
+ * num is the number of 64-bit words to use
+ */
+PG_FUNCTION_INFO_V1(drive_popcount);
+Datum
+drive_popcount(PG_FUNCTION_ARGS)
+{
+
+	int			count = PG_GETARG_INT32(0);
+	int			num   = PG_GETARG_INT32(1);
+
+	int			len = num * sizeof(uint64);
+	uint64	   *words	= palloc(len);
+	int64		popcount;
+
+	for (int i = 0; i < num; i++)
+		words[i] = i;
+
+	while (count--)
+		popcount = pg_popcount((const char*) words, len);
+
+	PG_RETURN_INT64(popcount);
+}
diff --git a/src/test/modules/test_popcount/test_popcount.control b/src/test/modules/test_popcount/test_popcount.control
new file mode 100644
index 0000000000..6837d724b4
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount.control
@@ -0,0 +1,4 @@
+comment = 'test'
+default_version = '1.0'
+module_pathname = '$libdir/test_popcount'
+relocatable = true
v1-0001-Use-direct-function-calls-for-pg_popcount-32-64-o.patchapplication/x-patch; name=v1-0001-Use-direct-function-calls-for-pg_popcount-32-64-o.patchDownload
From f65964c997cf8958dd9e53a04fb7e92098c784f1 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Wed, 11 Aug 2021 12:17:51 -0400
Subject: [PATCH v1 1/2] Use direct function calls for pg_popcount{32,64} on
 non-x86 platforms

Previously, all pg_popcount{32,64} calls were indirected through
a function pointer, even though we had no fast implementation for
non-x86 platforms. To fix, use macros to cover both the direct and
indirect cases, and define them appropriately similar to the CRC code.
---
 src/backend/access/heap/visibilitymap.c |  6 ++--
 src/backend/nodes/bitmapset.c           |  4 +--
 src/include/port/pg_bitutils.h          | 42 +++++++++++++++++++++-
 src/port/pg_bitutils.c                  | 46 ++++---------------------
 4 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 114fbbdd30..75b0377807 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -411,14 +411,14 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 		if (all_frozen == NULL)
 		{
 			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
+				nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64);
 		}
 		else
 		{
 			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
 			{
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-				nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
+				nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64);
+				nfrozen += PG_POPCOUNT64(map[i] & FROZEN_MASK64);
 			}
 		}
 
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 649478b0d4..a6c4c62914 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -57,11 +57,11 @@
 #if BITS_PER_BITMAPWORD == 32
 #define bmw_leftmost_one_pos(w)		pg_leftmost_one_pos32(w)
 #define bmw_rightmost_one_pos(w)	pg_rightmost_one_pos32(w)
-#define bmw_popcount(w)				pg_popcount32(w)
+#define bmw_popcount(w)				PG_POPCOUNT32(w)
 #elif BITS_PER_BITMAPWORD == 64
 #define bmw_leftmost_one_pos(w)		pg_leftmost_one_pos64(w)
 #define bmw_rightmost_one_pos(w)	pg_rightmost_one_pos64(w)
-#define bmw_popcount(w)				pg_popcount64(w)
+#define bmw_popcount(w)				PG_POPCOUNT64(w)
 #else
 #error "invalid BITS_PER_BITMAPWORD"
 #endif
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 086bd08132..ea97192b3a 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -253,9 +253,49 @@ pg_ceil_log2_64(uint64 num)
 		return pg_leftmost_one_pos64(num - 1) + 1;
 }
 
-/* Count the number of one-bits in a uint32 or uint64 */
+/*
+ * With MSVC on x86_64 builds, try using native popcnt instructions via the
+ * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
+ * __builtin_popcount* intrinsic functions as they always emit popcnt
+ * instructions.
+ */
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#define HAVE_X86_64_POPCNTQ
+#endif
+
+/*
+ * On x86_64, we can use the hardware popcount instruction, but only if
+ * we can verify that the CPU supports it via the cpuid instruction.
+ *
+ * Otherwise, we fall back to a hand-rolled implementation.
+ */
+#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+#define TRY_POPCNT_FAST 1
+#endif
+#endif
+
+#ifdef TRY_POPCNT_FAST
+/* Use the POPCNT instruction, but perform a runtime check first. */
+#define PG_POPCOUNT32(word) pg_popcount32(word)
+#define PG_POPCOUNT64(word) pg_popcount64(word)
+
+extern int	pg_popcount32_slow(uint32 word);
+extern int	pg_popcount64_slow(uint64 word);
 extern int	(*pg_popcount32) (uint32 word);
 extern int	(*pg_popcount64) (uint64 word);
+extern int	pg_popcount32_fast(uint32 word);
+extern int	pg_popcount64_fast(uint64 word);
+
+#else
+/* Use a portable implementation */
+#define PG_POPCOUNT32(word) pg_popcount32_slow(word)
+#define PG_POPCOUNT64(word) pg_popcount64_slow(word)
+
+extern int	pg_popcount32_slow(uint32 word);
+extern int	pg_popcount64_slow(uint64 word);
+
+#endif							/* TRY_POPCNT_FAST */
 
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 10676b285c..3e90de5249 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,47 +103,13 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * With MSVC on x86_64 builds, try using native popcnt instructions via the
- * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
- * __builtin_popcount* intrinsic functions as they always emit popcnt
- * instructions.
- */
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#define HAVE_X86_64_POPCNTQ
-#endif
-
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to __builtin_popcount if the compiler has that,
- * or a hand-rolled implementation if not.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_FAST 1
-#endif
-#endif
-
-static int	pg_popcount32_slow(uint32 word);
-static int	pg_popcount64_slow(uint64 word);
-
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
-static int	pg_popcount32_fast(uint32 word);
-static int	pg_popcount64_fast(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#else
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_slow;
-#endif							/* TRY_POPCNT_FAST */
-
-#ifdef TRY_POPCNT_FAST
 
 /*
  * Return true if CPUID indicates that the POPCNT instruction is available.
@@ -208,7 +174,7 @@ pg_popcount64_choose(uint64 word)
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_fast(uint32 word)
 {
 #ifdef _MSC_VER
@@ -225,7 +191,7 @@ __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount64_fast
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_fast(uint64 word)
 {
 #ifdef _MSC_VER
@@ -245,7 +211,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_slow(uint32 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -267,7 +233,7 @@ pg_popcount32_slow(uint32 word)
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_slow(uint64 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -309,7 +275,7 @@ pg_popcount(const char *buf, int bytes)
 
 		while (bytes >= 8)
 		{
-			popcnt += pg_popcount64(*words++);
+			popcnt += PG_POPCOUNT64(*words++);
 			bytes -= 8;
 		}
 
@@ -323,7 +289,7 @@ pg_popcount(const char *buf, int bytes)
 
 		while (bytes >= 4)
 		{
-			popcnt += pg_popcount32(*words++);
+			popcnt += PG_POPCOUNT32(*words++);
 			bytes -= 4;
 		}
 
-- 
2.31.1

v1-0002-Replace-intrinsics-in-pg_popcount-_slow-with-pure.patchapplication/x-patch; name=v1-0002-Replace-intrinsics-in-pg_popcount-_slow-with-pure.patchDownload
From 225073b1741fd89ec7728e28978e057e9dc11116 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Wed, 11 Aug 2021 12:22:24 -0400
Subject: [PATCH v1 2/2] Replace intrinsics in pg_popcount*_slow with pure C
 code

Intrinsics are used in the hope that the compiler will access some fast
hardware implementation where available. However, on x86 at least,
__builtin_popcount() didn't emit a POPCNT instruction since -mpopcnt
wasn't passed to the compiler. Instead, the compiler emitted bitwise
operations where the intrinsic was supported. Where not supported,
we used a byte-at-a-time loop using a lookup table.

Since the *slow functions are fallback implementations, replace the
intrinsics and the associated #ifdef maze with the bitwise operations
written in C so all platforms can benefit from them.

If we ever get configure support for x86-64-v2, we could use these
intrinsics to emit the POPCNT instruction without a runtime check. To
allow for that possibility, let's keep the configure checks around.
---
 src/port/pg_bitutils.c | 47 ++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 3e90de5249..500372db10 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -207,6 +207,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 #endif							/* TRY_POPCNT_FAST */
 
 
+/*
+ * The *_slow implementations are based on
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ */
+
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
@@ -214,19 +219,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 int
 pg_popcount32_slow(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-	return __builtin_popcount(word);
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & 0x55555555);
+	word = (word & 0x33333333) +
+		((word >> 2) & 0x33333333);
+	word = (word + (word >> 4)) & 0xF0F0F0F;
+	return (int) ((word * 0x1010101) >> 24);
 }
 
 /*
@@ -236,25 +233,11 @@ pg_popcount32_slow(uint32 word)
 int
 pg_popcount64_slow(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if defined(HAVE_LONG_INT_64)
-	return __builtin_popcountl(word);
-#elif defined(HAVE_LONG_LONG_INT_64)
-	return __builtin_popcountll(word);
-#else
-#error must have a working 64-bit integer datatype
-#endif
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & UINT64CONST(0x5555555555555555));
+	word = (word & UINT64CONST(0x3333333333333333)) +
+		((word >> 2) & UINT64CONST(0x3333333333333333));
+	word = (word + (word >> 4)) & UINT64CONST(0xF0F0F0F0F0F0F0F);
+	return (int) ((word * UINT64CONST(0x101010101010101)) >> 56);
 }
 
 
-- 
2.31.1

#2David Rowley
dgrowleyml@gmail.com
In reply to: John Naylor (#1)
Re: call popcount32/64 directly on non-x86 platforms

On Thu, 12 Aug 2021 at 05:11, John Naylor <john.naylor@enterprisedb.com> wrote:

0001 moves some declarations around so that "slow" popcount functions are called directly on non-x86 platforms.

I was wondering if there was a reason that you didn't implement this
by just changing pg_popcount32 and pg_popcount64 to be actual
functions rather than function pointers when TRY_POPCNT_FAST is not
defined? These functions would then just return
pg_popcountNN_slow(word);

This would save from having to change all the current callers of the
functions to use the macro instead. That might be nice for any
extensions which are using these functions.

David

#3John Naylor
john.naylor@enterprisedb.com
In reply to: David Rowley (#2)
2 attachment(s)
Re: call popcount32/64 directly on non-x86 platforms

On Wed, Aug 11, 2021 at 8:13 PM David Rowley <dgrowleyml@gmail.com> wrote:

On Thu, 12 Aug 2021 at 05:11, John Naylor <john.naylor@enterprisedb.com>

wrote:

0001 moves some declarations around so that "slow" popcount functions

are called directly on non-x86 platforms.

I was wondering if there was a reason that you didn't implement this
by just changing pg_popcount32 and pg_popcount64 to be actual
functions rather than function pointers when TRY_POPCNT_FAST is not
defined? These functions would then just return
pg_popcountNN_slow(word);

This would save from having to change all the current callers of the
functions to use the macro instead. That might be nice for any
extensions which are using these functions.

Hmm, it wasn't obvious to me that would work, but I tried it and came up
with v2. Is this what you had in mind?

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v2-0002-Replace-intrinsics-in-pg_popcount-32-64-_slow-wit.patchapplication/octet-stream; name=v2-0002-Replace-intrinsics-in-pg_popcount-32-64-_slow-wit.patchDownload
From 8d54057e0a0901c99b4d06eeb4c3bc2e8dad8f19 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 11 Aug 2021 21:37:00 -0400
Subject: [PATCH v2 2/2] Replace intrinsics in pg_popcount{32,64}_slow with
 pure C code

Intrinsics are used in the hope that the compiler will access some fast
hardware implementation where available. However, on x86 at least,
__builtin_popcount() didn't emit a POPCNT instruction since -mpopcnt
wasn't passed to the compiler. Instead, the compiler emitted bitwise
operations where the intrinsic was supported. Where not supported,
we used a byte-at-a-time loop using a lookup table.

Since the *_slow functions are fallback implementations, replace the
intrinsics and the associated #ifdef maze with the bitwise operations
written in C so all platforms can benefit from them.

If we ever get configure support for x86-64-v2, we could use these
intrinsics to emit the POPCNT instruction without a runtime check. To
allow for that possibility, let's keep the configure checks around.
---
 src/port/pg_bitutils.c | 47 ++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index b5a62def6f..0d6a0cd5d8 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -207,6 +207,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 #endif							/* TRY_POPCNT_FAST */
 
 
+/*
+ * The *_slow implementations are based on
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ */
+
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
@@ -214,19 +219,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 int
 pg_popcount32_slow(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-	return __builtin_popcount(word);
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & 0x55555555);
+	word = (word & 0x33333333) +
+		((word >> 2) & 0x33333333);
+	word = (word + (word >> 4)) & 0xF0F0F0F;
+	return (int) ((word * 0x1010101) >> 24);
 }
 
 /*
@@ -236,25 +233,11 @@ pg_popcount32_slow(uint32 word)
 int
 pg_popcount64_slow(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if defined(HAVE_LONG_INT_64)
-	return __builtin_popcountl(word);
-#elif defined(HAVE_LONG_LONG_INT_64)
-	return __builtin_popcountll(word);
-#else
-#error must have a working 64-bit integer datatype
-#endif
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & UINT64CONST(0x5555555555555555));
+	word = (word & UINT64CONST(0x3333333333333333)) +
+		((word >> 2) & UINT64CONST(0x3333333333333333));
+	word = (word + (word >> 4)) & UINT64CONST(0xF0F0F0F0F0F0F0F);
+	return (int) ((word * UINT64CONST(0x101010101010101)) >> 56);
 }
 
 #ifndef TRY_POPCNT_FAST
-- 
2.31.1

v2-0001-Use-direct-function-calls-for-pg_popcount-32-64-o.patchapplication/octet-stream; name=v2-0001-Use-direct-function-calls-for-pg_popcount-32-64-o.patchDownload
From 9d25c79719859b87d2137b0cd888d1f1a7c4d3cc Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 11 Aug 2021 21:26:59 -0400
Subject: [PATCH v2 1/2] Use direct function calls for pg_popcount{32,64} on
 non-x86 platforms

Previously, all pg_popcount{32,64} calls were indirected through
a function pointer, even though we had no fast implementation for
non-x86 platforms. Instead, use wrappers around the
pg_popcount{32,64}_slow functions.
---
 src/include/port/pg_bitutils.h | 38 +++++++++++++++++++++-
 src/port/pg_bitutils.c         | 58 ++++++++++++----------------------
 2 files changed, 57 insertions(+), 39 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 086bd08132..21db364984 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -253,9 +253,45 @@ pg_ceil_log2_64(uint64 num)
 		return pg_leftmost_one_pos64(num - 1) + 1;
 }
 
-/* Count the number of one-bits in a uint32 or uint64 */
+/*
+ * With MSVC on x86_64 builds, try using native popcnt instructions via the
+ * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
+ * __builtin_popcount* intrinsic functions as they always emit popcnt
+ * instructions.
+ */
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#define HAVE_X86_64_POPCNTQ
+#endif
+
+/*
+ * On x86_64, we can use the hardware popcount instruction, but only if
+ * we can verify that the CPU supports it via the cpuid instruction.
+ *
+ * Otherwise, we fall back to a hand-rolled implementation.
+ */
+#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+#define TRY_POPCNT_FAST 1
+#endif
+#endif
+
+#ifdef TRY_POPCNT_FAST
+/* Use the POPCNT instruction, but perform a runtime check first. */
+extern int	pg_popcount32_slow(uint32 word);
+extern int	pg_popcount64_slow(uint64 word);
 extern int	(*pg_popcount32) (uint32 word);
 extern int	(*pg_popcount64) (uint64 word);
+extern int	pg_popcount32_fast(uint32 word);
+extern int	pg_popcount64_fast(uint64 word);
+
+#else
+/* Use a portable implementation. */
+extern int	pg_popcount32_slow(uint32 word);
+extern int	pg_popcount64_slow(uint64 word);
+extern int	pg_popcount32(uint32 word);
+extern int	pg_popcount64(uint64 word);
+
+#endif							/* TRY_POPCNT_FAST */
 
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 10676b285c..b5a62def6f 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,47 +103,13 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * With MSVC on x86_64 builds, try using native popcnt instructions via the
- * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
- * __builtin_popcount* intrinsic functions as they always emit popcnt
- * instructions.
- */
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#define HAVE_X86_64_POPCNTQ
-#endif
-
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to __builtin_popcount if the compiler has that,
- * or a hand-rolled implementation if not.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_FAST 1
-#endif
-#endif
-
-static int	pg_popcount32_slow(uint32 word);
-static int	pg_popcount64_slow(uint64 word);
-
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
-static int	pg_popcount32_fast(uint32 word);
-static int	pg_popcount64_fast(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#else
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_slow;
-#endif							/* TRY_POPCNT_FAST */
-
-#ifdef TRY_POPCNT_FAST
 
 /*
  * Return true if CPUID indicates that the POPCNT instruction is available.
@@ -208,7 +174,7 @@ pg_popcount64_choose(uint64 word)
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_fast(uint32 word)
 {
 #ifdef _MSC_VER
@@ -225,7 +191,7 @@ __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount64_fast
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_fast(uint64 word)
 {
 #ifdef _MSC_VER
@@ -245,7 +211,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_slow(uint32 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -267,7 +233,7 @@ pg_popcount32_slow(uint32 word)
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_slow(uint64 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -291,6 +257,22 @@ pg_popcount64_slow(uint64 word)
 #endif							/* HAVE__BUILTIN_POPCOUNT */
 }
 
+#ifndef TRY_POPCNT_FAST
+
+/* call these directly rather than through a function pointer */
+int
+pg_popcount32(uint32 word)
+{
+	return pg_popcount32_slow(word);
+}
+
+int
+pg_popcount64(uint64 word)
+{
+	return pg_popcount64_slow(word);
+}
+
+#endif							/* !TRY_POPCNT_FAST */
 
 /*
  * pg_popcount
-- 
2.31.1

#4David Rowley
dgrowleyml@gmail.com
In reply to: John Naylor (#3)
1 attachment(s)
Re: call popcount32/64 directly on non-x86 platforms

On Thu, 12 Aug 2021 at 14:02, John Naylor <john.naylor@enterprisedb.com> wrote:

On Wed, Aug 11, 2021 at 8:13 PM David Rowley <dgrowleyml@gmail.com> wrote:

On Thu, 12 Aug 2021 at 05:11, John Naylor <john.naylor@enterprisedb.com> wrote:

0001 moves some declarations around so that "slow" popcount functions are called directly on non-x86 platforms.

I was wondering if there was a reason that you didn't implement this
by just changing pg_popcount32 and pg_popcount64 to be actual
functions rather than function pointers when TRY_POPCNT_FAST is not
defined? These functions would then just return
pg_popcountNN_slow(word);

This would save from having to change all the current callers of the
functions to use the macro instead. That might be nice for any
extensions which are using these functions.

Hmm, it wasn't obvious to me that would work, but I tried it and came up with v2. Is this what you had in mind?

Closer, but I don't see why there's any need to make the fast and slow
functions external. It should be perfectly fine to keep them static.

I didn't test the performance, but the attached works for me.

Going by https://godbolt.org/z/ocv1Kj5K4 f2 seems to inline f1

David

Attachments:

v3-0001-Use-direct-function-calls-for-pg_popcount-32-64-o.patchapplication/octet-stream; name=v3-0001-Use-direct-function-calls-for-pg_popcount-32-64-o.patchDownload
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 086bd08132..8218554ef3 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -253,10 +253,40 @@ pg_ceil_log2_64(uint64 num)
 		return pg_leftmost_one_pos64(num - 1) + 1;
 }
 
-/* Count the number of one-bits in a uint32 or uint64 */
+/*
+ * With MSVC on x86_64 builds, try using native popcnt instructions via the
+ * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
+ * __builtin_popcount* intrinsic functions as they always emit popcnt
+ * instructions.
+ */
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#define HAVE_X86_64_POPCNTQ
+#endif
+
+/*
+ * On x86_64, we can use the hardware popcount instruction, but only if
+ * we can verify that the CPU supports it via the cpuid instruction.
+ *
+ * Otherwise, we fall back to a hand-rolled implementation.
+ */
+#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+#define TRY_POPCNT_FAST 1
+#endif
+#endif
+
+#ifdef TRY_POPCNT_FAST
+/* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern int	(*pg_popcount32) (uint32 word);
 extern int	(*pg_popcount64) (uint64 word);
 
+#else
+/* Use a portable implementation. No need for the function pointer */
+extern int	pg_popcount32(uint32 word);
+extern int	pg_popcount64(uint64 word);
+
+#endif							/* TRY_POPCNT_FAST */
+
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
 
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 10676b285c..40c9c907a3 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,29 +103,6 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * With MSVC on x86_64 builds, try using native popcnt instructions via the
- * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
- * __builtin_popcount* intrinsic functions as they always emit popcnt
- * instructions.
- */
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#define HAVE_X86_64_POPCNTQ
-#endif
-
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to __builtin_popcount if the compiler has that,
- * or a hand-rolled implementation if not.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_FAST 1
-#endif
-#endif
-
 static int	pg_popcount32_slow(uint32 word);
 static int	pg_popcount64_slow(uint64 word);
 
@@ -138,9 +115,6 @@ static int	pg_popcount64_fast(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#else
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_slow;
 #endif							/* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -291,6 +265,28 @@ pg_popcount64_slow(uint64 word)
 #endif							/* HAVE__BUILTIN_POPCOUNT */
 }
 
+#ifndef TRY_POPCNT_FAST
+
+/*
+ * When the POPCNT instruction is not available, there's no point in using
+ * function pointers to vary the implementation between the fast and slow
+ * method.  We instead just make these actual external functions when
+ * TRY_POPCNT_FAST is not defined.  The compiler should be able to inline
+ * the slow versions here.
+ */
+int
+pg_popcount32(uint32 word)
+{
+	return pg_popcount32_slow(word);
+}
+
+int
+pg_popcount64(uint64 word)
+{
+	return pg_popcount64_slow(word);
+}
+
+#endif							/* TRY_POPCNT_FAST */
 
 /*
  * pg_popcount
#5Alvaro Herrera
alvherre@alvh.no-ip.org
In reply to: David Rowley (#4)
Re: call popcount32/64 directly on non-x86 platforms

So when on MSVC, you don't have to check CPUID for support?

--
Álvaro Herrera PostgreSQL Developer — https://www.EnterpriseDB.com/
"La primera ley de las demostraciones en vivo es: no trate de usar el sistema.
Escriba un guión que no toque nada para no causar daños." (Jakob Nielsen)

#6David Rowley
dgrowleyml@gmail.com
In reply to: Alvaro Herrera (#5)
Re: call popcount32/64 directly on non-x86 platforms

On Fri, 13 Aug 2021 at 01:11, Alvaro Herrera <alvherre@alvh.no-ip.org> wrote:

So when on MSVC, you don't have to check CPUID for support?

That still needs to be checked in MSVC and as far as I can see it is
being properly checked.

David

#7David Rowley
dgrowleyml@gmail.com
In reply to: David Rowley (#6)
Re: call popcount32/64 directly on non-x86 platforms

On Fri, 13 Aug 2021 at 01:28, David Rowley <dgrowleyml@gmail.com> wrote:

On Fri, 13 Aug 2021 at 01:11, Alvaro Herrera <alvherre@alvh.no-ip.org> wrote:

So when on MSVC, you don't have to check CPUID for support?

That still needs to be checked in MSVC and as far as I can see it is
being properly checked.

Something there that might cause confusion is we do a configure check
to see if popcntq works and define HAVE_X86_64_POPCNTQ if it does.
I'm still a bit confused at why we bother doing that. Surely it just
means that if the build machine does not have popcntq that we'll
always use pg_popcount64_slow, regardless if the machine that's
actually running the code has popcntq.

Maybe you saw that there's no such equivalent test when we set
HAVE_X86_64_POPCNTQ for MSVC on x86_64. The reason for that is that
we do the run-time test using cpuid.

David

#8John Naylor
john.naylor@enterprisedb.com
In reply to: David Rowley (#7)
Re: call popcount32/64 directly on non-x86 platforms

On Thu, Aug 12, 2021 at 9:33 AM David Rowley <dgrowleyml@gmail.com> wrote:

Something there that might cause confusion is we do a configure check
to see if popcntq works and define HAVE_X86_64_POPCNTQ if it does.
I'm still a bit confused at why we bother doing that. Surely it just
means that if the build machine does not have popcntq that we'll
always use pg_popcount64_slow, regardless if the machine that's
actually running the code has popcntq.

Yeah, it's a bit strange, a configure check makes more sense if we have a
way to specify we can build with a direct call (like x86-64-v2), but we
don't right now. Maybe short-term we could always do the runtime check on
x86-64.

--
John Naylor
EDB: http://www.enterprisedb.com

#9Alvaro Herrera
alvherre@alvh.no-ip.org
In reply to: David Rowley (#7)
Re: call popcount32/64 directly on non-x86 platforms

On 2021-Aug-13, David Rowley wrote:

Maybe you saw that there's no such equivalent test when we set
HAVE_X86_64_POPCNTQ for MSVC on x86_64. The reason for that is that
we do the run-time test using cpuid.

Yeah, that and also I mistook the two independent "ifdef" blocks for one
block with an "#else". Re-reading the ifdef maze, it looks reasonable.

--
Álvaro Herrera 39°49'30"S 73°17'W — https://www.EnterpriseDB.com/

#10John Naylor
john.naylor@enterprisedb.com
In reply to: David Rowley (#4)
Re: call popcount32/64 directly on non-x86 platforms

On Thu, Aug 12, 2021 at 1:26 AM David Rowley <dgrowleyml@gmail.com> wrote:

Closer, but I don't see why there's any need to make the fast and slow
functions external. It should be perfectly fine to keep them static.

I didn't test the performance, but the attached works for me.

Thanks for that! I still get a big improvement to on Power8 / gcc 4.8, but
it's not quite as fast as earlier versions, which were around 200ms:

master: 646ms
v3: 312ms

This machine does seem to be pickier about code layout than any other I've
tried running benchmarks on, but that's still a bit much. In any case, your
version is clearer and has the intended effect, so I plan to commit that,
barring other comments.

I think I'll leave my v2-0002 aside for now, since it has wider
implications, and I have bigger things to work on.

--
John Naylor
EDB: http://www.enterprisedb.com

#11John Naylor
john.naylor@enterprisedb.com
In reply to: John Naylor (#10)
Re: call popcount32/64 directly on non-x86 platforms

I wrote:

On Thu, Aug 12, 2021 at 1:26 AM David Rowley <dgrowleyml@gmail.com> wrote:

Closer, but I don't see why there's any need to make the fast and slow
functions external. It should be perfectly fine to keep them static.

I didn't test the performance, but the attached works for me.

Thanks for that! I still get a big improvement to on Power8 / gcc 4.8,

but it's not quite as fast as earlier versions, which were around 200ms:

master: 646ms
v3: 312ms

This machine does seem to be pickier about code layout than any other

I've tried running benchmarks on, but that's still a bit much. In any case,
your version is clearer and has the intended effect, so I plan to commit
that, barring other comments.

Pushed, thanks for looking1

--
John Naylor
EDB: http://www.enterprisedb.com