From b8905b47f7e0af8d4558fdac73cc2283c263cbcc Mon Sep 17 00:00:00 2001
From: Amit Khandekar <amitdkhan.pg@gmail.com>
Date: Thu, 10 Dec 2020 21:45:49 +0800
Subject: [PATCH 2/2] Avoid function pointer dereferencing for
 pg_popcount32/64() call.

With USE_POPCNT_ASM set (which is true only on x86), we decide with
the help of __get_cpuid() at runtime whether the CPU supports popcnt
instruction, and hence we dynamically choose the corresponding
function; thus pg_popcount32/64 is a function pointer. But for
platforms with USE_POPCNT_ASM not set, we don't have to use dynamic
assignment of functions, but we were still declaring pg_popcount64 as a
function pointer. So arrange for direct function call so as to get rid
of function pointer dereferencing each time pg_popcount32/64 is
called.

To do this, define pg_popcount64 to another function name
(pg_popcount64_nonasm) rather than a function pointer, whenever
USE_POPCNT_ASM is not defined.  And let pg_popcount64_nonasm() be a
static inline function so that whenever pg_popcount64() is called,
the __builtin_popcount() gets called directly.  For platforms not
supporting __builtin_popcount(), continue using the slow version as is
the current behaviour. pg_popcount64_slow() was actually a misnomer,
because it was actually calling __builtin_popcount() which is not a slow
function. Now with this commit, pg_popcount64_slow() indeed has only
slow code.

Tested this on ARM64, and the gist index creation for tsvectors speeds
up by ~ 6% for default siglen (=124), and by 12% with siglen=700
---
 src/include/port/pg_bitutils.h | 59 ++++++++++++++++++++++++++++++++++
 src/port/pg_bitutils.c         | 47 +++++----------------------
 2 files changed, 67 insertions(+), 39 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 174df28e66..b039f94ee5 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -23,6 +23,19 @@ extern const uint8 pg_rightmost_one_pos[256];
 extern const uint8 pg_number_of_ones[256];
 #endif
 
+/*
+ * On x86_64, we can use the hardware popcount instruction, but only if
+ * we can verify that the CPU supports it via the cpuid instruction.
+ *
+ * Otherwise, we fall back to __builtin_popcount if the compiler has that,
+ * or a hand-rolled implementation if not.
+ */
+#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+#define USE_POPCNT_ASM 1
+#endif
+#endif
+
 /*
  * pg_leftmost_one_pos32
  *		Returns the position of the most significant set bit in "word",
@@ -208,8 +221,54 @@ pg_ceil_log2_64(uint64 num)
 }
 
 /* Count the number of one-bits in a uint32 or uint64 */
+
+/*
+ * With USE_POPCNT_ASM set (which is true only on x86), we decide at runtime
+ * whether the CPU supports popcnt instruction, and hence we dynamically choose
+ * the corresponding function; thus pg_popcount32/64 is a function pointer. But
+ * for platforms with USE_POPCNT_ASM not set, we don't have to use dynamic
+ * assignment of functions, so we arrange for direct function call so as to get
+ * rid of function pointer dereferencing each time pg_popcount32/64 is called.
+ */
+#ifdef USE_POPCNT_ASM
 extern int	(*pg_popcount32) (uint32 word);
 extern int	(*pg_popcount64) (uint64 word);
+#else
+#define pg_popcount32 pg_popcount32_nonasm
+#define pg_popcount64 pg_popcount64_nonasm
+#endif
+
+/* Slow versions of pg_popcount */
+#ifndef HAVE__BUILTIN_POPCOUNT
+extern int pg_popcount32_slow(uint32 word);
+extern int pg_popcount64_slow(uint64 word);
+#endif
+
+static inline int
+pg_popcount64_nonasm(uint64 word)
+{
+#ifdef HAVE__BUILTIN_POPCOUNT
+#if defined(HAVE_LONG_INT_64)
+	return __builtin_popcountl(word);
+#elif defined(HAVE_LONG_LONG_INT_64)
+	return __builtin_popcountll(word);
+#else
+#error must have a working 64-bit integer datatype
+#endif
+#else							/* !HAVE__BUILTIN_POPCOUNT */
+	return pg_popcount64_slow(word);
+#endif							/* HAVE__BUILTIN_POPCOUNT */
+}
+
+static inline int
+pg_popcount32_nonasm(uint32 word)
+{
+#ifdef HAVE__BUILTIN_POPCOUNT
+	return __builtin_popcount(word);
+#else
+	return pg_popcount32_slow(word);
+#endif							/* HAVE__BUILTIN_POPCOUNT */
+}
 
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index cb2f5f5f0b..4a220590bf 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,22 +103,6 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to __builtin_popcount if the compiler has that,
- * or a hand-rolled implementation if not.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define USE_POPCNT_ASM 1
-#endif
-#endif
-
-static int	pg_popcount32_slow(uint32 word);
-static int	pg_popcount64_slow(uint64 word);
-
 #ifdef USE_POPCNT_ASM
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
@@ -128,9 +112,6 @@ static int	pg_popcount64_asm(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#else
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_slow;
 #endif							/* USE_POPCNT_ASM */
 
 #ifdef USE_POPCNT_ASM
@@ -170,8 +151,8 @@ pg_popcount32_choose(uint32 word)
 	}
 	else
 	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
+		pg_popcount32 = pg_popcount32_nonasm;
+		pg_popcount64 = pg_popcount64_nonasm;
 	}
 
 	return pg_popcount32(word);
@@ -187,8 +168,8 @@ pg_popcount64_choose(uint64 word)
 	}
 	else
 	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
+		pg_popcount32 = pg_popcount32_nonasm;
+		pg_popcount64 = pg_popcount64_nonasm;
 	}
 
 	return pg_popcount64(word);
@@ -223,16 +204,14 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 #endif							/* USE_POPCNT_ASM */
 
 
+#ifndef HAVE__BUILTIN_POPCOUNT
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_slow(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-	return __builtin_popcount(word);
-#else							/* !HAVE__BUILTIN_POPCOUNT */
 	int			result = 0;
 
 	while (word != 0)
@@ -242,25 +221,15 @@ pg_popcount32_slow(uint32 word)
 	}
 
 	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
 }
 
 /*
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_slow(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if defined(HAVE_LONG_INT_64)
-	return __builtin_popcountl(word);
-#elif defined(HAVE_LONG_LONG_INT_64)
-	return __builtin_popcountll(word);
-#else
-#error must have a working 64-bit integer datatype
-#endif
-#else							/* !HAVE__BUILTIN_POPCOUNT */
 	int			result = 0;
 
 	while (word != 0)
@@ -270,8 +239,8 @@ pg_popcount64_slow(uint64 word)
 	}
 
 	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
 }
+#endif							/* HAVE__BUILTIN_POPCOUNT */
 
 
 /*
-- 
2.17.1

